Diffstat (limited to 'lib')
-rw-r--r-- lib/Analysis/BasicAliasAnalysis.cpp | 9
-rw-r--r-- lib/Analysis/BranchProbabilityInfo.cpp | 11
-rw-r--r-- lib/Analysis/CallGraph.cpp | 34
-rw-r--r-- lib/Analysis/ConstantFolding.cpp | 80
-rw-r--r-- lib/Analysis/DemandedBits.cpp | 10
-rw-r--r-- lib/Analysis/InlineCost.cpp | 2
-rw-r--r-- lib/Analysis/InstructionSimplify.cpp | 167
-rw-r--r-- lib/Analysis/ModuleSummaryAnalysis.cpp | 3
-rw-r--r-- lib/Analysis/OptimizationDiagnosticInfo.cpp | 2
-rw-r--r-- lib/Analysis/ProfileSummaryInfo.cpp | 13
-rw-r--r-- lib/Analysis/ScalarEvolution.cpp | 159
-rw-r--r-- lib/Analysis/TargetLibraryInfo.cpp | 112
-rw-r--r-- lib/Analysis/TargetTransformInfo.cpp | 13
-rw-r--r-- lib/Analysis/ValueTracking.cpp | 276
-rw-r--r-- lib/Analysis/VectorUtils.cpp | 1
-rw-r--r-- lib/AsmParser/LLParser.cpp | 18
-rw-r--r-- lib/Bitcode/Reader/BitcodeReader.cpp | 17
-rw-r--r-- lib/Bitcode/Reader/MetadataLoader.cpp | 2
-rw-r--r-- lib/Bitcode/Writer/BitcodeWriter.cpp | 8
-rw-r--r-- lib/Bitcode/Writer/ValueEnumerator.cpp | 7
-rw-r--r-- lib/CMakeLists.txt | 2
-rw-r--r-- lib/CodeGen/AsmPrinter/CodeViewDebug.cpp | 14
-rw-r--r-- lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp | 3
-rw-r--r-- lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp | 42
-rw-r--r-- lib/CodeGen/AsmPrinter/DwarfCompileUnit.h | 22
-rw-r--r-- lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 89
-rw-r--r-- lib/CodeGen/AsmPrinter/DwarfDebug.h | 20
-rw-r--r-- lib/CodeGen/AsmPrinter/DwarfFile.h | 4
-rw-r--r-- lib/CodeGen/AsmPrinter/DwarfUnit.cpp | 6
-rw-r--r-- lib/CodeGen/AsmPrinter/DwarfUnit.h | 5
-rw-r--r-- lib/CodeGen/AsmPrinter/WinException.cpp | 12
-rw-r--r-- lib/CodeGen/AtomicExpandPass.cpp | 31
-rw-r--r-- lib/CodeGen/CMakeLists.txt | 3
-rw-r--r-- lib/CodeGen/CodeGen.cpp | 4
-rw-r--r-- lib/CodeGen/CodeGenPrepare.cpp | 548
-rw-r--r-- lib/CodeGen/ExpandPostRAPseudos.cpp | 5
-rw-r--r-- lib/CodeGen/ExpandReductions.cpp | 167
-rw-r--r-- lib/CodeGen/GlobalISel/LegalizerInfo.cpp | 10
-rw-r--r-- lib/CodeGen/GlobalISel/RegBankSelect.cpp | 9
-rw-r--r-- lib/CodeGen/GlobalISel/Utils.cpp | 8
-rw-r--r-- lib/CodeGen/IfConversion.cpp | 30
-rw-r--r-- lib/CodeGen/LiveRangeShrink.cpp | 211
-rw-r--r-- lib/CodeGen/LiveVariables.cpp | 2
-rw-r--r-- lib/CodeGen/MachineBlockPlacement.cpp | 30
-rw-r--r-- lib/CodeGen/MachineVerifier.cpp | 4
-rw-r--r-- lib/CodeGen/PHIElimination.cpp | 2
-rw-r--r-- lib/CodeGen/RegisterCoalescer.cpp | 2
-rw-r--r-- lib/CodeGen/RegisterScavenging.cpp | 7
-rw-r--r-- lib/CodeGen/SafeStack.cpp | 172
-rw-r--r-- lib/CodeGen/ScalarizeMaskedMemIntrin.cpp | 660
-rw-r--r-- lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 176
-rw-r--r-- lib/CodeGen/SelectionDAG/FastISel.cpp | 20
-rw-r--r-- lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp | 33
-rw-r--r-- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 3
-rw-r--r-- lib/CodeGen/SelectionDAG/LegalizeTypes.h | 1
-rw-r--r-- lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 58
-rw-r--r-- lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 162
-rw-r--r-- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 138
-rw-r--r-- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h | 6
-rw-r--r-- lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp | 13
-rw-r--r-- lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp | 74
-rw-r--r-- lib/CodeGen/SelectionDAG/TargetLowering.cpp | 11
-rw-r--r-- lib/CodeGen/ShrinkWrap.cpp | 12
-rw-r--r-- lib/CodeGen/SjLjEHPrepare.cpp | 4
-rw-r--r-- lib/CodeGen/TargetLoweringObjectFileImpl.cpp | 6
-rw-r--r-- lib/CodeGen/TargetPassConfig.cpp | 11
-rw-r--r-- lib/CodeGen/TwoAddressInstructionPass.cpp | 7
-rw-r--r-- lib/CodeGen/UnreachableBlockElim.cpp | 7
-rw-r--r-- lib/DebugInfo/CodeView/CMakeLists.txt | 2
-rw-r--r-- lib/DebugInfo/CodeView/CVTypeVisitor.cpp | 41
-rw-r--r-- lib/DebugInfo/CodeView/ModuleDebugUnknownFragment.cpp | 10
-rw-r--r-- lib/DebugInfo/CodeView/RandomAccessTypeVisitor.cpp | 91
-rw-r--r-- lib/DebugInfo/CodeView/TypeDatabase.cpp | 73
-rw-r--r-- lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp | 65
-rw-r--r-- lib/DebugInfo/CodeView/TypeDumpVisitor.cpp | 9
-rw-r--r-- lib/DebugInfo/DWARF/DWARFContext.cpp | 25
-rw-r--r-- lib/DebugInfo/DWARF/DWARFDebugAranges.cpp | 5
-rw-r--r-- lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp | 4
-rw-r--r-- lib/DebugInfo/DWARF/DWARFDie.cpp | 12
-rw-r--r-- lib/DebugInfo/DWARF/DWARFTypeUnit.cpp | 6
-rw-r--r-- lib/DebugInfo/DWARF/DWARFUnit.cpp | 16
-rw-r--r-- lib/DebugInfo/DWARF/DWARFVerifier.cpp | 8
-rw-r--r-- lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp | 2
-rw-r--r-- lib/ExecutionEngine/Orc/OrcMCJITReplacement.h | 5
-rw-r--r-- lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp | 12
-rw-r--r-- lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp | 4
-rw-r--r-- lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp | 33
-rw-r--r-- lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h | 2
-rw-r--r-- lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h | 2
-rw-r--r-- lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h | 1
-rw-r--r-- lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h | 1
-rw-r--r-- lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h | 3
-rw-r--r-- lib/Fuzzer/FuzzerDriver.cpp | 3
-rw-r--r-- lib/Fuzzer/FuzzerFlags.def | 8
-rw-r--r-- lib/Fuzzer/FuzzerInternal.h | 1
-rw-r--r-- lib/Fuzzer/FuzzerLoop.cpp | 20
-rw-r--r-- lib/Fuzzer/FuzzerMutate.cpp | 5
-rw-r--r-- lib/Fuzzer/afl/afl_driver.cpp | 57
-rw-r--r-- lib/Fuzzer/test/AFLDriverTest.cpp | 8
-rw-r--r-- lib/Fuzzer/test/CMakeLists.txt | 1
-rw-r--r-- lib/Fuzzer/test/OverwriteInputTest.cpp | 13
-rw-r--r-- lib/Fuzzer/test/afl-driver.test | 26
-rw-r--r-- lib/Fuzzer/test/overwrite-input.test | 2
-rw-r--r-- lib/IR/AsmWriter.cpp | 7
-rw-r--r-- lib/IR/AttributeImpl.h | 18
-rw-r--r-- lib/IR/Attributes.cpp | 72
-rw-r--r-- lib/IR/ConstantFold.cpp | 10
-rw-r--r-- lib/IR/ConstantRange.cpp | 37
-rw-r--r-- lib/IR/Constants.cpp | 15
-rw-r--r-- lib/IR/ConstantsContext.h | 49
-rw-r--r-- lib/IR/DebugInfoMetadata.cpp | 18
-rw-r--r-- lib/IR/DebugLoc.cpp | 114
-rw-r--r-- lib/IR/DiagnosticInfo.cpp | 25
-rw-r--r-- lib/IR/Function.cpp | 79
-rw-r--r-- lib/IR/Globals.cpp | 43
-rw-r--r-- lib/IR/IRBuilder.cpp | 88
-rw-r--r-- lib/IR/Instruction.cpp | 24
-rw-r--r-- lib/IR/Instructions.cpp | 119
-rw-r--r-- lib/IR/LegacyPassManager.cpp | 13
-rw-r--r-- lib/IR/Module.cpp | 35
-rw-r--r-- lib/IR/Type.cpp | 71
-rw-r--r-- lib/IR/Verifier.cpp | 13
-rw-r--r-- lib/LLVMBuild.txt | 2
-rw-r--r-- lib/LTO/LTO.cpp | 4
-rw-r--r-- lib/LTO/LTOCodeGenerator.cpp | 18
-rw-r--r-- lib/LTO/ThinLTOCodeGenerator.cpp | 3
-rw-r--r-- lib/Linker/IRMover.cpp | 18
-rw-r--r-- lib/MC/MCObjectStreamer.cpp | 5
-rw-r--r-- lib/MC/MCParser/AsmParser.cpp | 21
-rw-r--r-- lib/Object/COFFObjectFile.cpp | 4
-rw-r--r-- lib/Object/WasmObjectFile.cpp | 41
-rw-r--r-- lib/ObjectYAML/WasmYAML.cpp | 8
-rw-r--r-- lib/ProfileData/SampleProfWriter.cpp | 42
-rw-r--r-- lib/Support/APInt.cpp | 312
-rw-r--r-- lib/Support/CMakeLists.txt | 1
-rw-r--r-- lib/Support/Parallel.cpp | 138
-rw-r--r-- lib/Support/Unix/Path.inc | 30
-rw-r--r-- lib/Support/Unix/Process.inc | 4
-rw-r--r-- lib/Target/AArch64/AArch64.td | 1
-rw-r--r-- lib/Target/AArch64/AArch64CallLowering.cpp | 2
-rw-r--r-- lib/Target/AArch64/AArch64FastISel.cpp | 2
-rw-r--r-- lib/Target/AArch64/AArch64ISelLowering.cpp | 6
-rw-r--r-- lib/Target/AArch64/AArch64InstrInfo.cpp | 30
-rw-r--r-- lib/Target/AArch64/AArch64InstrInfo.td | 8
-rw-r--r-- lib/Target/AArch64/AArch64RegisterBankInfo.cpp | 27
-rw-r--r-- lib/Target/AArch64/AArch64SchedFalkorDetails.td | 78
-rw-r--r-- lib/Target/AArch64/AArch64SchedFalkorWriteRes.td | 37
-rw-r--r-- lib/Target/AArch64/AArch64Subtarget.cpp | 8
-rw-r--r-- lib/Target/AArch64/AArch64Subtarget.h | 10
-rw-r--r-- lib/Target/AArch64/AArch64TargetObjectFile.cpp | 8
-rw-r--r-- lib/Target/AArch64/AArch64TargetObjectFile.h | 3
-rw-r--r-- lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 106
-rw-r--r-- lib/Target/AArch64/AArch64TargetTransformInfo.h | 11
-rw-r--r-- lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp | 19
-rw-r--r-- lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp | 10
-rw-r--r-- lib/Target/AMDGPU/AMDGPU.h | 4
-rw-r--r-- lib/Target/AMDGPU/AMDGPU.td | 21
-rw-r--r-- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 8
-rw-r--r-- lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 10
-rw-r--r-- lib/Target/AMDGPU/AMDGPUISelLowering.h | 2
-rw-r--r-- lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 3
-rw-r--r-- lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 1
-rw-r--r-- lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp | 2881
-rw-r--r-- lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 7
-rw-r--r-- lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 3
-rw-r--r-- lib/Target/AMDGPU/AMDGPUSubtarget.h | 15
-rw-r--r-- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 18
-rw-r--r-- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 34
-rw-r--r-- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h | 3
-rw-r--r-- lib/Target/AMDGPU/CMakeLists.txt | 1
-rw-r--r-- lib/Target/AMDGPU/FLATInstructions.td | 34
-rw-r--r-- lib/Target/AMDGPU/GCNRegPressure.cpp | 153
-rw-r--r-- lib/Target/AMDGPU/GCNRegPressure.h | 61
-rw-r--r-- lib/Target/AMDGPU/GCNSchedStrategy.cpp | 282
-rw-r--r-- lib/Target/AMDGPU/GCNSchedStrategy.h | 24
-rw-r--r-- lib/Target/AMDGPU/SIAnnotateControlFlow.cpp | 2
-rw-r--r-- lib/Target/AMDGPU/SIISelLowering.cpp | 22
-rw-r--r-- lib/Target/AMDGPU/SIISelLowering.h | 1
-rw-r--r-- lib/Target/AMDGPU/SIInstrInfo.cpp | 310
-rw-r--r-- lib/Target/AMDGPU/SIInstrInfo.h | 33
-rw-r--r-- lib/Target/AMDGPU/SIInstructions.td | 7
-rw-r--r-- lib/Target/AMDGPU/VOP2Instructions.td | 51
-rw-r--r-- lib/Target/AMDGPU/VOP3Instructions.td | 3
-rw-r--r-- lib/Target/ARM/ARMBaseInstrInfo.h | 18
-rw-r--r-- lib/Target/ARM/ARMCallLowering.cpp | 2
-rw-r--r-- lib/Target/ARM/ARMFastISel.cpp | 2
-rw-r--r-- lib/Target/ARM/ARMISelLowering.cpp | 27
-rw-r--r-- lib/Target/ARM/ARMISelLowering.h | 8
-rw-r--r-- lib/Target/ARM/ARMInstrInfo.td | 7
-rw-r--r-- lib/Target/ARM/ARMInstrThumb.td | 4
-rw-r--r-- lib/Target/ARM/ARMInstructionSelector.cpp | 39
-rw-r--r-- lib/Target/ARM/ARMLegalizerInfo.cpp | 8
-rw-r--r-- lib/Target/ARM/ARMOptimizeBarriersPass.cpp | 4
-rw-r--r-- lib/Target/ARM/ARMRegisterBankInfo.cpp | 1
-rw-r--r-- lib/Target/ARM/ARMTargetMachine.cpp | 2
-rw-r--r-- lib/Target/AVR/AVRFrameLowering.cpp | 2
-rw-r--r-- lib/Target/AVR/AVRISelLowering.cpp | 10
-rw-r--r-- lib/Target/AVR/AVRInstrInfo.td | 6
-rw-r--r-- lib/Target/AVR/AVRRegisterInfo.cpp | 1
-rw-r--r-- lib/Target/BPF/BPFISelLowering.cpp | 3
-rw-r--r-- lib/Target/BPF/BPFInstrInfo.td | 9
-rw-r--r-- lib/Target/Hexagon/HexagonISelLowering.cpp | 3
-rw-r--r-- lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp | 2
-rw-r--r-- lib/Target/Hexagon/HexagonPatterns.td | 7
-rw-r--r-- lib/Target/Hexagon/HexagonPseudo.td | 2
-rw-r--r-- lib/Target/Lanai/LanaiISelLowering.cpp | 31
-rw-r--r-- lib/Target/Lanai/LanaiISelLowering.h | 5
-rw-r--r-- lib/Target/Lanai/LanaiInstrInfo.td | 12
-rw-r--r-- lib/Target/MSP430/MSP430FrameLowering.cpp | 7
-rw-r--r-- lib/Target/MSP430/MSP430ISelLowering.cpp | 271
-rw-r--r-- lib/Target/MSP430/MSP430InstrInfo.h | 6
-rw-r--r-- lib/Target/MSP430/MSP430InstrInfo.td | 9
-rw-r--r-- lib/Target/MSP430/MSP430RegisterInfo.cpp | 4
-rw-r--r-- lib/Target/Mips/MipsFastISel.cpp | 2
-rw-r--r-- lib/Target/Mips/MipsISelLowering.cpp | 2
-rw-r--r-- lib/Target/Mips/MipsInstrInfo.td | 6
-rw-r--r-- lib/Target/Mips/MipsOptimizePICCall.cpp | 2
-rw-r--r-- lib/Target/NVPTX/NVPTXISelLowering.cpp | 18
-rw-r--r-- lib/Target/NVPTX/NVPTXInstrInfo.td | 9
-rw-r--r-- lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp | 17
-rw-r--r-- lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp | 3
-rw-r--r-- lib/Target/PowerPC/PPCFastISel.cpp | 3
-rw-r--r-- lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 255
-rw-r--r-- lib/Target/PowerPC/PPCISelLowering.cpp | 98
-rw-r--r-- lib/Target/PowerPC/PPCISelLowering.h | 29
-rw-r--r-- lib/Target/PowerPC/PPCInstr64Bit.td | 28
-rw-r--r-- lib/Target/PowerPC/PPCInstrAltivec.td | 40
-rw-r--r-- lib/Target/PowerPC/PPCInstrInfo.td | 13
-rw-r--r-- lib/Target/PowerPC/PPCInstrVSX.td | 2
-rw-r--r-- lib/Target/PowerPC/PPCTLSDynamicCall.cpp | 3
-rw-r--r-- lib/Target/Sparc/SparcISelLowering.cpp | 31
-rw-r--r-- lib/Target/Sparc/SparcInstrInfo.td | 9
-rw-r--r-- lib/Target/Sparc/SparcRegisterInfo.td | 6
-rw-r--r-- lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp | 4
-rw-r--r-- lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp | 19
-rw-r--r-- lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp | 14
-rw-r--r-- lib/Target/SystemZ/README.txt | 2
-rw-r--r-- lib/Target/SystemZ/SystemZFeatures.td | 14
-rw-r--r-- lib/Target/SystemZ/SystemZISelLowering.cpp | 13
-rw-r--r-- lib/Target/SystemZ/SystemZISelLowering.h | 2
-rw-r--r-- lib/Target/SystemZ/SystemZInstrFP.td | 13
-rw-r--r-- lib/Target/SystemZ/SystemZInstrFormats.td | 301
-rw-r--r-- lib/Target/SystemZ/SystemZInstrInfo.td | 201
-rw-r--r-- lib/Target/SystemZ/SystemZOperands.td | 2
-rw-r--r-- lib/Target/SystemZ/SystemZOperators.td | 3
-rw-r--r-- lib/Target/SystemZ/SystemZSchedule.td | 4
-rw-r--r-- lib/Target/SystemZ/SystemZScheduleZ13.td | 84
-rw-r--r-- lib/Target/SystemZ/SystemZScheduleZ196.td | 92
-rw-r--r-- lib/Target/SystemZ/SystemZScheduleZEC12.td | 92
-rw-r--r-- lib/Target/SystemZ/SystemZSubtarget.cpp | 7
-rw-r--r-- lib/Target/SystemZ/SystemZSubtarget.h | 10
-rw-r--r-- lib/Target/WebAssembly/WebAssemblyInstrCall.td | 4
-rw-r--r-- lib/Target/WebAssembly/WebAssemblyInstrInfo.td | 3
-rw-r--r-- lib/Target/X86/X86.td | 3
-rw-r--r-- lib/Target/X86/X86FastISel.cpp | 48
-rw-r--r-- lib/Target/X86/X86FixupLEAs.cpp | 269
-rw-r--r-- lib/Target/X86/X86ISelDAGToDAG.cpp | 41
-rw-r--r-- lib/Target/X86/X86ISelLowering.cpp | 214
-rw-r--r-- lib/Target/X86/X86InstrCompiler.td | 14
-rw-r--r-- lib/Target/X86/X86InstrInfo.cpp | 52
-rw-r--r-- lib/Target/X86/X86InstrInfo.h | 11
-rw-r--r-- lib/Target/X86/X86InstrInfo.td | 35
-rw-r--r-- lib/Target/X86/X86InstrSSE.td | 18
-rw-r--r-- lib/Target/X86/X86InstructionSelector.cpp | 214
-rw-r--r-- lib/Target/X86/X86IntrinsicsInfo.h | 2
-rw-r--r-- lib/Target/X86/X86LegalizerInfo.cpp | 16
-rw-r--r-- lib/Target/X86/X86RegisterInfo.cpp | 14
-rw-r--r-- lib/Target/X86/X86Subtarget.h | 6
-rw-r--r-- lib/Target/X86/X86TargetMachine.cpp | 4
-rw-r--r-- lib/Target/X86/X86TargetTransformInfo.cpp | 194
-rw-r--r-- lib/Target/X86/X86WinEHState.cpp | 2
-rw-r--r-- lib/Target/XCore/XCoreISelLowering.cpp | 5
-rw-r--r-- lib/Target/XCore/XCoreInstrInfo.td | 11
-rw-r--r-- lib/ToolDrivers/CMakeLists.txt | 1
-rw-r--r-- lib/ToolDrivers/LLVMBuild.txt | 24
-rw-r--r-- lib/ToolDrivers/llvm-lib/CMakeLists.txt (renamed from lib/LibDriver/CMakeLists.txt) | 0
-rw-r--r-- lib/ToolDrivers/llvm-lib/LLVMBuild.txt (renamed from lib/LibDriver/LLVMBuild.txt) | 0
-rw-r--r-- lib/ToolDrivers/llvm-lib/LibDriver.cpp (renamed from lib/LibDriver/LibDriver.cpp) | 2
-rw-r--r-- lib/ToolDrivers/llvm-lib/Options.td (renamed from lib/LibDriver/Options.td) | 0
-rw-r--r-- lib/Transforms/Coroutines/CoroFrame.cpp | 100
-rw-r--r-- lib/Transforms/IPO/FunctionImport.cpp | 15
-rw-r--r-- lib/Transforms/IPO/Inliner.cpp | 4
-rw-r--r-- lib/Transforms/IPO/PartialInlining.cpp | 426
-rw-r--r-- lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp | 6
-rw-r--r-- lib/Transforms/InstCombine/InstCombineAddSub.cpp | 199
-rw-r--r-- lib/Transforms/InstCombine/InstCombineAndOrXor.cpp | 109
-rw-r--r-- lib/Transforms/InstCombine/InstCombineCalls.cpp | 8
-rw-r--r-- lib/Transforms/InstCombine/InstCombineCasts.cpp | 25
-rw-r--r-- lib/Transforms/InstCombine/InstCombineCompares.cpp | 39
-rw-r--r-- lib/Transforms/InstCombine/InstCombineInternal.h | 30
-rw-r--r-- lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp | 6
-rw-r--r-- lib/Transforms/InstCombine/InstCombineMulDivRem.cpp | 20
-rw-r--r-- lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp | 2
-rw-r--r-- lib/Transforms/InstCombine/InstructionCombining.cpp | 26
-rw-r--r-- lib/Transforms/Instrumentation/AddressSanitizer.cpp | 34
-rw-r--r-- lib/Transforms/Instrumentation/DataFlowSanitizer.cpp | 8
-rw-r--r-- lib/Transforms/Instrumentation/EfficiencySanitizer.cpp | 49
-rw-r--r-- lib/Transforms/Instrumentation/MemorySanitizer.cpp | 7
-rw-r--r-- lib/Transforms/Scalar/CorrelatedValuePropagation.cpp | 10
-rw-r--r-- lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 295
-rw-r--r-- lib/Transforms/Scalar/NewGVN.cpp | 210
-rw-r--r-- lib/Transforms/Scalar/SimpleLoopUnswitch.cpp | 161
-rw-r--r-- lib/Transforms/Scalar/SpeculativeExecution.cpp | 43
-rw-r--r-- lib/Transforms/Utils/BypassSlowDivision.cpp | 4
-rw-r--r-- lib/Transforms/Utils/CloneFunction.cpp | 32
-rw-r--r-- lib/Transforms/Utils/CloneModule.cpp | 2
-rw-r--r-- lib/Transforms/Utils/EscapeEnumerator.cpp | 3
-rw-r--r-- lib/Transforms/Utils/InlineFunction.cpp | 61
-rw-r--r-- lib/Transforms/Utils/InstructionNamer.cpp | 13
-rw-r--r-- lib/Transforms/Utils/Local.cpp | 106
-rw-r--r-- lib/Transforms/Utils/LoopUtils.cpp | 201
-rw-r--r-- lib/Transforms/Utils/ModuleUtils.cpp | 12
-rw-r--r-- lib/Transforms/Utils/SimplifyLibCalls.cpp | 6
-rw-r--r-- lib/Transforms/Utils/VNCoercion.cpp | 9
-rw-r--r-- lib/Transforms/Utils/ValueMapper.cpp | 9
-rw-r--r-- lib/Transforms/Vectorize/LoadStoreVectorizer.cpp | 2
-rw-r--r-- lib/Transforms/Vectorize/LoopVectorize.cpp | 241
-rw-r--r-- lib/Transforms/Vectorize/SLPVectorizer.cpp | 112
-rw-r--r-- lib/XRay/Trace.cpp | 34
319 files changed, 12342 insertions(+), 3756 deletions(-)
diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp
index 537823020301..a33c01a0e461 100644
--- a/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/lib/Analysis/BasicAliasAnalysis.cpp
@@ -17,13 +17,13 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
@@ -36,6 +36,7 @@
#include "llvm/IR/Operator.h"
#include "llvm/Pass.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
#include <algorithm>
#define DEBUG_TYPE "basicaa"
@@ -1283,9 +1284,9 @@ AliasResult BasicAAResult::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
// give up if we can't determine conditions that hold for every cycle:
const Value *V = DecompGEP1.VarIndices[i].V;
- bool SignKnownZero, SignKnownOne;
- ComputeSignBit(const_cast<Value *>(V), SignKnownZero, SignKnownOne, DL,
- 0, &AC, nullptr, DT);
+ KnownBits Known = computeKnownBits(V, DL, 0, &AC, nullptr, DT);
+ bool SignKnownZero = Known.isNonNegative();
+ bool SignKnownOne = Known.isNegative();
// Zero-extension widens the variable, and so forces the sign
// bit to zero.
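
The ComputeSignBit-to-KnownBits rewrite above is the pattern this commit repeats across most of the analysis files. A minimal sketch of the new idiom (the wrapper function and its name are illustrative; the KnownBits API calls are the ones used in the hunk):

#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Support/KnownBits.h"
using namespace llvm;

// One KnownBits query replaces the old two-output ComputeSignBit call.
static void signBits(const Value *V, const DataLayout &DL, AssumptionCache &AC,
                     const Instruction *CxtI, const DominatorTree *DT,
                     bool &SignKnownZero, bool &SignKnownOne) {
  KnownBits Known = computeKnownBits(V, DL, /*Depth=*/0, &AC, CxtI, DT);
  SignKnownZero = Known.isNonNegative(); // sign bit proven zero
  SignKnownOne = Known.isNegative();     // sign bit proven one
}
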
diff --git a/lib/Analysis/BranchProbabilityInfo.cpp b/lib/Analysis/BranchProbabilityInfo.cpp
index 0dc4475ca0e2..db87b17c1567 100644
--- a/lib/Analysis/BranchProbabilityInfo.cpp
+++ b/lib/Analysis/BranchProbabilityInfo.cpp
@@ -301,6 +301,8 @@ bool BranchProbabilityInfo::calcMetadataWeights(const BasicBlock *BB) {
WeightSum += Weights[i];
}
}
+ assert(WeightSum <= UINT32_MAX &&
+ "Expected weights to scale down to 32 bits");
if (WeightSum == 0 || ReachableIdxs.size() == 0) {
for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
@@ -328,21 +330,14 @@ bool BranchProbabilityInfo::calcMetadataWeights(const BasicBlock *BB) {
// the difference between reachable blocks.
if (ToDistribute > BranchProbability::getZero()) {
BranchProbability PerEdge = ToDistribute / ReachableIdxs.size();
- for (auto i : ReachableIdxs) {
+ for (auto i : ReachableIdxs)
BP[i] += PerEdge;
- ToDistribute -= PerEdge;
- }
- // Tail goes to the first reachable edge.
- BP[ReachableIdxs[0]] += ToDistribute;
}
}
for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
setEdgeProbability(BB, i, BP[i]);
- assert(WeightSum <= UINT32_MAX &&
- "Expected weights to scale down to 32 bits");
-
return true;
}
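
The deleted loop subtracted PerEdge from ToDistribute on each iteration and then dumped the division remainder onto the first reachable edge; the new code distributes evenly and deliberately drops the sub-ulp remainder rather than biasing one edge. A hedged standalone sketch of the surviving logic (helper name and parameters are illustrative, not LLVM API):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/BranchProbability.h"
using namespace llvm;

// Spread the probability mass of unreachable successors evenly over the
// reachable ones. Integer division may leave a tiny remainder, which is
// intentionally left undistributed.
static void spreadEvenly(BranchProbability ToDistribute,
                         ArrayRef<unsigned> ReachableIdxs,
                         MutableArrayRef<BranchProbability> BP) {
  if (ToDistribute > BranchProbability::getZero()) {
    BranchProbability PerEdge = ToDistribute / ReachableIdxs.size();
    for (unsigned i : ReachableIdxs)
      BP[i] += PerEdge;
  }
}
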
diff --git a/lib/Analysis/CallGraph.cpp b/lib/Analysis/CallGraph.cpp
index 6942176ae6ae..ff5242f69a1b 100644
--- a/lib/Analysis/CallGraph.cpp
+++ b/lib/Analysis/CallGraph.cpp
@@ -21,23 +21,18 @@ using namespace llvm;
//
CallGraph::CallGraph(Module &M)
- : M(M), Root(nullptr), ExternalCallingNode(getOrInsertFunction(nullptr)),
+ : M(M), ExternalCallingNode(getOrInsertFunction(nullptr)),
CallsExternalNode(llvm::make_unique<CallGraphNode>(nullptr)) {
// Add every function to the call graph.
for (Function &F : M)
addToCallGraph(&F);
-
- // If we didn't find a main function, use the external call graph node
- if (!Root)
- Root = ExternalCallingNode;
}
CallGraph::CallGraph(CallGraph &&Arg)
- : M(Arg.M), FunctionMap(std::move(Arg.FunctionMap)), Root(Arg.Root),
+ : M(Arg.M), FunctionMap(std::move(Arg.FunctionMap)),
ExternalCallingNode(Arg.ExternalCallingNode),
CallsExternalNode(std::move(Arg.CallsExternalNode)) {
Arg.FunctionMap.clear();
- Arg.Root = nullptr;
Arg.ExternalCallingNode = nullptr;
}
@@ -57,21 +52,9 @@ CallGraph::~CallGraph() {
void CallGraph::addToCallGraph(Function *F) {
CallGraphNode *Node = getOrInsertFunction(F);
- // If this function has external linkage, anything could call it.
- if (!F->hasLocalLinkage()) {
- ExternalCallingNode->addCalledFunction(CallSite(), Node);
-
- // Found the entry point?
- if (F->getName() == "main") {
- if (Root) // Found multiple external mains? Don't pick one.
- Root = ExternalCallingNode;
- else
- Root = Node; // Found a main, keep track of it!
- }
- }
-
- // If this function has its address taken, anything could call it.
- if (F->hasAddressTaken())
+ // If this function has external linkage or has its address taken, anything
+ // could call it.
+ if (!F->hasLocalLinkage() || F->hasAddressTaken())
ExternalCallingNode->addCalledFunction(CallSite(), Node);
// If this function is not defined in this translation unit, it could call
@@ -96,13 +79,6 @@ void CallGraph::addToCallGraph(Function *F) {
}
void CallGraph::print(raw_ostream &OS) const {
- OS << "CallGraph Root is: ";
- if (Function *F = Root->getFunction())
- OS << F->getName() << "\n";
- else {
- OS << "<<null function: 0x" << Root << ">>\n";
- }
-
// Print in a deterministic order by sorting CallGraphNodes by name. We do
// this here to avoid slowing down the non-printing fast path.
diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp
index 130e917e49d7..0ca712bbfe70 100644
--- a/lib/Analysis/ConstantFolding.cpp
+++ b/lib/Analysis/ConstantFolding.cpp
@@ -1438,6 +1438,36 @@ bool llvm::canConstantFoldCallTo(const Function *F) {
Name == "sinf" || Name == "sinhf" || Name == "sqrtf";
case 't':
return Name == "tan" || Name == "tanh" || Name == "tanf" || Name == "tanhf";
+ case '_':
+
+ // Check for various function names that get used for the math functions
+ // when the header files are preprocessed with the macro
+ // __FINITE_MATH_ONLY__ enabled.
+ // The '12' here is the length of the shortest name that can match.
+ // We need to check the size before looking at Name[1] and Name[2]
+ // so we may as well check a limit that will eliminate mismatches.
+ if (Name.size() < 12 || Name[1] != '_')
+ return false;
+ switch (Name[2]) {
+ default:
+ return false;
+ case 'a':
+ return Name == "__acos_finite" || Name == "__acosf_finite" ||
+ Name == "__asin_finite" || Name == "__asinf_finite" ||
+ Name == "__atan2_finite" || Name == "__atan2f_finite";
+ case 'c':
+ return Name == "__cosh_finite" || Name == "__coshf_finite";
+ case 'e':
+ return Name == "__exp_finite" || Name == "__expf_finite" ||
+ Name == "__exp2_finite" || Name == "__exp2f_finite";
+ case 'l':
+ return Name == "__log_finite" || Name == "__logf_finite" ||
+ Name == "__log10_finite" || Name == "__log10f_finite";
+ case 'p':
+ return Name == "__pow_finite" || Name == "__powf_finite";
+ case 's':
+ return Name == "__sinh_finite" || Name == "__sinhf_finite";
+ }
}
}
@@ -1637,13 +1667,21 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty,
if (!TLI)
return nullptr;
- switch (Name[0]) {
+ char NameKeyChar = Name[0];
+ if (Name[0] == '_' && Name.size() > 2 && Name[1] == '_')
+ NameKeyChar = Name[2];
+
+ switch (NameKeyChar) {
case 'a':
if ((Name == "acos" && TLI->has(LibFunc_acos)) ||
- (Name == "acosf" && TLI->has(LibFunc_acosf)))
+ (Name == "acosf" && TLI->has(LibFunc_acosf)) ||
+ (Name == "__acos_finite" && TLI->has(LibFunc_acos_finite)) ||
+ (Name == "__acosf_finite" && TLI->has(LibFunc_acosf_finite)))
return ConstantFoldFP(acos, V, Ty);
else if ((Name == "asin" && TLI->has(LibFunc_asin)) ||
- (Name == "asinf" && TLI->has(LibFunc_asinf)))
+ (Name == "asinf" && TLI->has(LibFunc_asinf)) ||
+ (Name == "__asin_finite" && TLI->has(LibFunc_asin_finite)) ||
+ (Name == "__asinf_finite" && TLI->has(LibFunc_asinf_finite)))
return ConstantFoldFP(asin, V, Ty);
else if ((Name == "atan" && TLI->has(LibFunc_atan)) ||
(Name == "atanf" && TLI->has(LibFunc_atanf)))
@@ -1657,15 +1695,21 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty,
(Name == "cosf" && TLI->has(LibFunc_cosf)))
return ConstantFoldFP(cos, V, Ty);
else if ((Name == "cosh" && TLI->has(LibFunc_cosh)) ||
- (Name == "coshf" && TLI->has(LibFunc_coshf)))
+ (Name == "coshf" && TLI->has(LibFunc_coshf)) ||
+ (Name == "__cosh_finite" && TLI->has(LibFunc_cosh_finite)) ||
+ (Name == "__coshf_finite" && TLI->has(LibFunc_coshf_finite)))
return ConstantFoldFP(cosh, V, Ty);
break;
case 'e':
if ((Name == "exp" && TLI->has(LibFunc_exp)) ||
- (Name == "expf" && TLI->has(LibFunc_expf)))
+ (Name == "expf" && TLI->has(LibFunc_expf)) ||
+ (Name == "__exp_finite" && TLI->has(LibFunc_exp_finite)) ||
+ (Name == "__expf_finite" && TLI->has(LibFunc_expf_finite)))
return ConstantFoldFP(exp, V, Ty);
if ((Name == "exp2" && TLI->has(LibFunc_exp2)) ||
- (Name == "exp2f" && TLI->has(LibFunc_exp2f)))
+ (Name == "exp2f" && TLI->has(LibFunc_exp2f)) ||
+ (Name == "__exp2_finite" && TLI->has(LibFunc_exp2_finite)) ||
+ (Name == "__exp2f_finite" && TLI->has(LibFunc_exp2f_finite)))
// Constant fold exp2(x) as pow(2,x) in case the host doesn't have a
// C99 library.
return ConstantFoldBinaryFP(pow, 2.0, V, Ty);
@@ -1680,10 +1724,18 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty,
break;
case 'l':
if ((Name == "log" && V > 0 && TLI->has(LibFunc_log)) ||
- (Name == "logf" && V > 0 && TLI->has(LibFunc_logf)))
+ (Name == "logf" && V > 0 && TLI->has(LibFunc_logf)) ||
+ (Name == "__log_finite" && V > 0 &&
+ TLI->has(LibFunc_log_finite)) ||
+ (Name == "__logf_finite" && V > 0 &&
+ TLI->has(LibFunc_logf_finite)))
return ConstantFoldFP(log, V, Ty);
else if ((Name == "log10" && V > 0 && TLI->has(LibFunc_log10)) ||
- (Name == "log10f" && V > 0 && TLI->has(LibFunc_log10f)))
+ (Name == "log10f" && V > 0 && TLI->has(LibFunc_log10f)) ||
+ (Name == "__log10_finite" && V > 0 &&
+ TLI->has(LibFunc_log10_finite)) ||
+ (Name == "__log10f_finite" && V > 0 &&
+ TLI->has(LibFunc_log10f_finite)))
return ConstantFoldFP(log10, V, Ty);
break;
case 'r':
@@ -1695,7 +1747,9 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty,
(Name == "sinf" && TLI->has(LibFunc_sinf)))
return ConstantFoldFP(sin, V, Ty);
else if ((Name == "sinh" && TLI->has(LibFunc_sinh)) ||
- (Name == "sinhf" && TLI->has(LibFunc_sinhf)))
+ (Name == "sinhf" && TLI->has(LibFunc_sinhf)) ||
+ (Name == "__sinh_finite" && TLI->has(LibFunc_sinh_finite)) ||
+ (Name == "__sinhf_finite" && TLI->has(LibFunc_sinhf_finite)))
return ConstantFoldFP(sinh, V, Ty);
else if ((Name == "sqrt" && V >= 0 && TLI->has(LibFunc_sqrt)) ||
(Name == "sqrtf" && V >= 0 && TLI->has(LibFunc_sqrtf)))
@@ -1813,13 +1867,17 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty,
if (!TLI)
return nullptr;
if ((Name == "pow" && TLI->has(LibFunc_pow)) ||
- (Name == "powf" && TLI->has(LibFunc_powf)))
+ (Name == "powf" && TLI->has(LibFunc_powf)) ||
+ (Name == "__pow_finite" && TLI->has(LibFunc_pow_finite)) ||
+ (Name == "__powf_finite" && TLI->has(LibFunc_powf_finite)))
return ConstantFoldBinaryFP(pow, Op1V, Op2V, Ty);
if ((Name == "fmod" && TLI->has(LibFunc_fmod)) ||
(Name == "fmodf" && TLI->has(LibFunc_fmodf)))
return ConstantFoldBinaryFP(fmod, Op1V, Op2V, Ty);
if ((Name == "atan2" && TLI->has(LibFunc_atan2)) ||
- (Name == "atan2f" && TLI->has(LibFunc_atan2f)))
+ (Name == "atan2f" && TLI->has(LibFunc_atan2f)) ||
+ (Name == "__atan2_finite" && TLI->has(LibFunc_atan2_finite)) ||
+ (Name == "__atan2f_finite" && TLI->has(LibFunc_atan2f_finite)))
return ConstantFoldBinaryFP(atan2, Op1V, Op2V, Ty);
} else if (auto *Op2C = dyn_cast<ConstantInt>(Operands[1])) {
if (IntrinsicID == Intrinsic::powi && Ty->isHalfTy())
diff --git a/lib/Analysis/DemandedBits.cpp b/lib/Analysis/DemandedBits.cpp
index 9f5dc5318239..8f808f3e7871 100644
--- a/lib/Analysis/DemandedBits.cpp
+++ b/lib/Analysis/DemandedBits.cpp
@@ -86,13 +86,11 @@ void DemandedBits::determineLiveOperandBits(
[&](unsigned BitWidth, const Value *V1, const Value *V2) {
const DataLayout &DL = I->getModule()->getDataLayout();
Known = KnownBits(BitWidth);
- computeKnownBits(const_cast<Value *>(V1), Known, DL, 0,
- &AC, UserI, &DT);
+ computeKnownBits(V1, Known, DL, 0, &AC, UserI, &DT);
if (V2) {
Known2 = KnownBits(BitWidth);
- computeKnownBits(const_cast<Value *>(V2), Known2, DL,
- 0, &AC, UserI, &DT);
+ computeKnownBits(V2, Known2, DL, 0, &AC, UserI, &DT);
}
};
@@ -118,7 +116,7 @@ void DemandedBits::determineLiveOperandBits(
// known to be one.
ComputeKnownBits(BitWidth, I, nullptr);
AB = APInt::getHighBitsSet(BitWidth,
- std::min(BitWidth, Known.One.countLeadingZeros()+1));
+ std::min(BitWidth, Known.countMaxLeadingZeros()+1));
}
break;
case Intrinsic::cttz:
@@ -128,7 +126,7 @@ void DemandedBits::determineLiveOperandBits(
// known to be one.
ComputeKnownBits(BitWidth, I, nullptr);
AB = APInt::getLowBitsSet(BitWidth,
- std::min(BitWidth, Known.One.countTrailingZeros()+1));
+ std::min(BitWidth, Known.countMaxTrailingZeros()+1));
}
break;
}
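
countMaxLeadingZeros() and countMaxTrailingZeros() are thin wrappers over the Known.One mask: the value cannot have more leading (or trailing) zeros than the position of its first bit known to be one. A small sketch with an assumed 8-bit value:

#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"

static void demo() {
  llvm::KnownBits K(8);
  K.One = llvm::APInt(8, 0x10);               // bit 4 is known to be one
  unsigned MaxLZ = K.countMaxLeadingZeros();  // 3: only bits 7..5 can be zero
  unsigned MaxTZ = K.countMaxTrailingZeros(); // 4: only bits 3..0 can be zero
  (void)MaxLZ; (void)MaxTZ;
}
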
diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp
index 100a591e452c..44c14cb17c22 100644
--- a/lib/Analysis/InlineCost.cpp
+++ b/lib/Analysis/InlineCost.cpp
@@ -63,7 +63,7 @@ static cl::opt<bool>
// PGO before we actually hook up inliner with analysis passes such as BPI and
// BFI.
static cl::opt<int> ColdThreshold(
- "inlinecold-threshold", cl::Hidden, cl::init(225),
+ "inlinecold-threshold", cl::Hidden, cl::init(45),
cl::desc("Threshold for inlining functions with cold attribute"));
static cl::opt<int>
diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp
index 4a713f441ce8..5728887cc1e9 100644
--- a/lib/Analysis/InstructionSimplify.cpp
+++ b/lib/Analysis/InstructionSimplify.cpp
@@ -1317,7 +1317,7 @@ static Value *SimplifyShift(Instruction::BinaryOps Opcode, Value *Op0,
// If all valid bits in the shift amount are known zero, the first operand is
// unchanged.
unsigned NumValidShiftBits = Log2_32_Ceil(BitWidth);
- if (Known.Zero.countTrailingOnes() >= NumValidShiftBits)
+ if (Known.countMinTrailingZeros() >= NumValidShiftBits)
return Op0;
return nullptr;
@@ -1536,7 +1536,7 @@ static Value *simplifyAndOrOfICmpsWithConstants(ICmpInst *Cmp0, ICmpInst *Cmp1,
auto Range0 = ConstantRange::makeExactICmpRegion(Cmp0->getPredicate(), *C0);
auto Range1 = ConstantRange::makeExactICmpRegion(Cmp1->getPredicate(), *C1);
- // For and-of-comapares, check if the intersection is empty:
+ // For and-of-compares, check if the intersection is empty:
// (icmp X, C0) && (icmp X, C1) --> empty set --> false
if (IsAnd && Range0.intersectWith(Range1).isEmptySet())
return getFalse(Cmp0->getType());
@@ -1870,6 +1870,24 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q,
match(Op1, m_c_And(m_Not(m_Specific(A)), m_Specific(B)))))
return Op0;
+ // (A & B) | (~A ^ B) -> (~A ^ B)
+ // (B & A) | (~A ^ B) -> (~A ^ B)
+ // (A & B) | (B ^ ~A) -> (B ^ ~A)
+ // (B & A) | (B ^ ~A) -> (B ^ ~A)
+ if (match(Op0, m_And(m_Value(A), m_Value(B))) &&
+ (match(Op1, m_c_Xor(m_Specific(A), m_Not(m_Specific(B)))) ||
+ match(Op1, m_c_Xor(m_Not(m_Specific(A)), m_Specific(B)))))
+ return Op1;
+
+ // (~A ^ B) | (A & B) -> (~A ^ B)
+ // (~A ^ B) | (B & A) -> (~A ^ B)
+ // (B ^ ~A) | (A & B) -> (B ^ ~A)
+ // (B ^ ~A) | (B & A) -> (B ^ ~A)
+ if (match(Op1, m_And(m_Value(A), m_Value(B))) &&
+ (match(Op0, m_c_Xor(m_Specific(A), m_Not(m_Specific(B)))) ||
+ match(Op0, m_c_Xor(m_Not(m_Specific(A)), m_Specific(B)))))
+ return Op0;
+
if (Value *V = simplifyAndOrOfICmps(Op0, Op1, false))
return V;
@@ -2286,7 +2304,6 @@ static Value *simplifyICmpWithZero(CmpInst::Predicate Pred, Value *LHS,
return nullptr;
Type *ITy = GetCompareTy(LHS); // The return type.
- bool LHSKnownNonNegative, LHSKnownNegative;
switch (Pred) {
default:
llvm_unreachable("Unknown ICmp predicate!");
@@ -2304,39 +2321,41 @@ static Value *simplifyICmpWithZero(CmpInst::Predicate Pred, Value *LHS,
if (isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT))
return getTrue(ITy);
break;
- case ICmpInst::ICMP_SLT:
- ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL, 0, Q.AC,
- Q.CxtI, Q.DT);
- if (LHSKnownNegative)
+ case ICmpInst::ICMP_SLT: {
+ KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (LHSKnown.isNegative())
return getTrue(ITy);
- if (LHSKnownNonNegative)
+ if (LHSKnown.isNonNegative())
return getFalse(ITy);
break;
- case ICmpInst::ICMP_SLE:
- ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL, 0, Q.AC,
- Q.CxtI, Q.DT);
- if (LHSKnownNegative)
+ }
+ case ICmpInst::ICMP_SLE: {
+ KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (LHSKnown.isNegative())
return getTrue(ITy);
- if (LHSKnownNonNegative && isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT))
+ if (LHSKnown.isNonNegative() &&
+ isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT))
return getFalse(ITy);
break;
- case ICmpInst::ICMP_SGE:
- ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL, 0, Q.AC,
- Q.CxtI, Q.DT);
- if (LHSKnownNegative)
+ }
+ case ICmpInst::ICMP_SGE: {
+ KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (LHSKnown.isNegative())
return getFalse(ITy);
- if (LHSKnownNonNegative)
+ if (LHSKnown.isNonNegative())
return getTrue(ITy);
break;
- case ICmpInst::ICMP_SGT:
- ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL, 0, Q.AC,
- Q.CxtI, Q.DT);
- if (LHSKnownNegative)
+ }
+ case ICmpInst::ICMP_SGT: {
+ KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (LHSKnown.isNegative())
return getFalse(ITy);
- if (LHSKnownNonNegative && isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT))
+ if (LHSKnown.isNonNegative() &&
+ isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT))
return getTrue(ITy);
break;
}
+ }
return nullptr;
}
@@ -2535,6 +2554,9 @@ static Value *simplifyICmpWithConstant(CmpInst::Predicate Pred, Value *LHS,
return nullptr;
}
+/// TODO: A large part of this logic is duplicated in InstCombine's
+/// foldICmpBinOp(). We should be able to share that and avoid the code
+/// duplication.
static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS,
Value *RHS, const SimplifyQuery &Q,
unsigned MaxRecurse) {
@@ -2616,15 +2638,11 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS,
return getTrue(ITy);
if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SGE) {
- bool RHSKnownNonNegative, RHSKnownNegative;
- bool YKnownNonNegative, YKnownNegative;
- ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, Q.DL, 0,
- Q.AC, Q.CxtI, Q.DT);
- ComputeSignBit(Y, YKnownNonNegative, YKnownNegative, Q.DL, 0, Q.AC,
- Q.CxtI, Q.DT);
- if (RHSKnownNonNegative && YKnownNegative)
+ KnownBits RHSKnown = computeKnownBits(RHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ KnownBits YKnown = computeKnownBits(Y, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (RHSKnown.isNonNegative() && YKnown.isNegative())
return Pred == ICmpInst::ICMP_SLT ? getTrue(ITy) : getFalse(ITy);
- if (RHSKnownNegative || YKnownNonNegative)
+ if (RHSKnown.isNegative() || YKnown.isNonNegative())
return Pred == ICmpInst::ICMP_SLT ? getFalse(ITy) : getTrue(ITy);
}
}
@@ -2636,15 +2654,11 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS,
return getFalse(ITy);
if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE) {
- bool LHSKnownNonNegative, LHSKnownNegative;
- bool YKnownNonNegative, YKnownNegative;
- ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL, 0,
- Q.AC, Q.CxtI, Q.DT);
- ComputeSignBit(Y, YKnownNonNegative, YKnownNegative, Q.DL, 0, Q.AC,
- Q.CxtI, Q.DT);
- if (LHSKnownNonNegative && YKnownNegative)
+ KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ KnownBits YKnown = computeKnownBits(Y, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (LHSKnown.isNonNegative() && YKnown.isNegative())
return Pred == ICmpInst::ICMP_SGT ? getTrue(ITy) : getFalse(ITy);
- if (LHSKnownNegative || YKnownNonNegative)
+ if (LHSKnown.isNegative() || YKnown.isNonNegative())
return Pred == ICmpInst::ICMP_SGT ? getFalse(ITy) : getTrue(ITy);
}
}
@@ -2691,28 +2705,27 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS,
// icmp pred (urem X, Y), Y
if (LBO && match(LBO, m_URem(m_Value(), m_Specific(RHS)))) {
- bool KnownNonNegative, KnownNegative;
switch (Pred) {
default:
break;
case ICmpInst::ICMP_SGT:
- case ICmpInst::ICMP_SGE:
- ComputeSignBit(RHS, KnownNonNegative, KnownNegative, Q.DL, 0, Q.AC,
- Q.CxtI, Q.DT);
- if (!KnownNonNegative)
+ case ICmpInst::ICMP_SGE: {
+ KnownBits Known = computeKnownBits(RHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (!Known.isNonNegative())
break;
LLVM_FALLTHROUGH;
+ }
case ICmpInst::ICMP_EQ:
case ICmpInst::ICMP_UGT:
case ICmpInst::ICMP_UGE:
return getFalse(ITy);
case ICmpInst::ICMP_SLT:
- case ICmpInst::ICMP_SLE:
- ComputeSignBit(RHS, KnownNonNegative, KnownNegative, Q.DL, 0, Q.AC,
- Q.CxtI, Q.DT);
- if (!KnownNonNegative)
+ case ICmpInst::ICMP_SLE: {
+ KnownBits Known = computeKnownBits(RHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (!Known.isNonNegative())
break;
LLVM_FALLTHROUGH;
+ }
case ICmpInst::ICMP_NE:
case ICmpInst::ICMP_ULT:
case ICmpInst::ICMP_ULE:
@@ -2722,28 +2735,27 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS,
// icmp pred X, (urem Y, X)
if (RBO && match(RBO, m_URem(m_Value(), m_Specific(LHS)))) {
- bool KnownNonNegative, KnownNegative;
switch (Pred) {
default:
break;
case ICmpInst::ICMP_SGT:
- case ICmpInst::ICMP_SGE:
- ComputeSignBit(LHS, KnownNonNegative, KnownNegative, Q.DL, 0, Q.AC,
- Q.CxtI, Q.DT);
- if (!KnownNonNegative)
+ case ICmpInst::ICMP_SGE: {
+ KnownBits Known = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (!Known.isNonNegative())
break;
LLVM_FALLTHROUGH;
+ }
case ICmpInst::ICMP_NE:
case ICmpInst::ICMP_UGT:
case ICmpInst::ICMP_UGE:
return getTrue(ITy);
case ICmpInst::ICMP_SLT:
- case ICmpInst::ICMP_SLE:
- ComputeSignBit(LHS, KnownNonNegative, KnownNegative, Q.DL, 0, Q.AC,
- Q.CxtI, Q.DT);
- if (!KnownNonNegative)
+ case ICmpInst::ICMP_SLE: {
+ KnownBits Known = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (!Known.isNonNegative())
break;
LLVM_FALLTHROUGH;
+ }
case ICmpInst::ICMP_EQ:
case ICmpInst::ICMP_ULT:
case ICmpInst::ICMP_ULE:
@@ -2815,10 +2827,19 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS,
break;
case Instruction::UDiv:
case Instruction::LShr:
- if (ICmpInst::isSigned(Pred))
+ if (ICmpInst::isSigned(Pred) || !LBO->isExact() || !RBO->isExact())
break;
- LLVM_FALLTHROUGH;
+ if (Value *V = SimplifyICmpInst(Pred, LBO->getOperand(0),
+ RBO->getOperand(0), Q, MaxRecurse - 1))
+ return V;
+ break;
case Instruction::SDiv:
+ if (!ICmpInst::isEquality(Pred) || !LBO->isExact() || !RBO->isExact())
+ break;
+ if (Value *V = SimplifyICmpInst(Pred, LBO->getOperand(0),
+ RBO->getOperand(0), Q, MaxRecurse - 1))
+ return V;
+ break;
case Instruction::AShr:
if (!LBO->isExact() || !RBO->isExact())
break;
@@ -4034,24 +4055,21 @@ Value *llvm::SimplifyCastInst(unsigned CastOpc, Value *Op, Type *Ty,
/// match a root vector source operand that contains that element in the same
/// vector lane (ie, the same mask index), so we can eliminate the shuffle(s).
static Value *foldIdentityShuffles(int DestElt, Value *Op0, Value *Op1,
- Constant *Mask, Value *RootVec, int RootElt,
+ int MaskVal, Value *RootVec,
unsigned MaxRecurse) {
if (!MaxRecurse--)
return nullptr;
// Bail out if any mask value is undefined. That kind of shuffle may be
// simplified further based on demanded bits or other folds.
- int MaskVal = ShuffleVectorInst::getMaskValue(Mask, RootElt);
if (MaskVal == -1)
return nullptr;
// The mask value chooses which source operand we need to look at next.
- Value *SourceOp;
int InVecNumElts = Op0->getType()->getVectorNumElements();
- if (MaskVal < InVecNumElts) {
- RootElt = MaskVal;
- SourceOp = Op0;
- } else {
+ int RootElt = MaskVal;
+ Value *SourceOp = Op0;
+ if (MaskVal >= InVecNumElts) {
RootElt = MaskVal - InVecNumElts;
SourceOp = Op1;
}
@@ -4061,7 +4079,7 @@ static Value *foldIdentityShuffles(int DestElt, Value *Op0, Value *Op1,
if (auto *SourceShuf = dyn_cast<ShuffleVectorInst>(SourceOp)) {
return foldIdentityShuffles(
DestElt, SourceShuf->getOperand(0), SourceShuf->getOperand(1),
- SourceShuf->getMask(), RootVec, RootElt, MaxRecurse);
+ SourceShuf->getMaskValue(RootElt), RootVec, MaxRecurse);
}
// TODO: Look through bitcasts? What if the bitcast changes the vector element
@@ -4126,17 +4144,7 @@ static Value *SimplifyShuffleVectorInst(Value *Op0, Value *Op1, Constant *Mask,
// second one.
if (Op0Const && !Op1Const) {
std::swap(Op0, Op1);
- for (int &Idx : Indices) {
- if (Idx == -1)
- continue;
- Idx = Idx < (int)InVecNumElts ? Idx + InVecNumElts : Idx - InVecNumElts;
- assert(Idx >= 0 && Idx < (int)InVecNumElts * 2 &&
- "shufflevector mask index out of range");
- }
- Mask = ConstantDataVector::get(
- Mask->getContext(),
- makeArrayRef(reinterpret_cast<uint32_t *>(Indices.data()),
- MaskNumElts));
+ ShuffleVectorInst::commuteShuffleMask(Indices, InVecNumElts);
}
// A shuffle of a splat is always the splat itself. Legal if the shuffle's
@@ -4160,7 +4168,8 @@ static Value *SimplifyShuffleVectorInst(Value *Op0, Value *Op1, Constant *Mask,
for (unsigned i = 0; i != MaskNumElts; ++i) {
// Note that recursion is limited for each vector element, so if any element
// exceeds the limit, this will fail to simplify.
- RootVec = foldIdentityShuffles(i, Op0, Op1, Mask, RootVec, i, MaxRecurse);
+ RootVec =
+ foldIdentityShuffles(i, Op0, Op1, Indices[i], RootVec, MaxRecurse);
// We can't replace a widening/narrowing shuffle with one of its operands.
if (!RootVec || RootVec->getType() != RetTy)
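
The new or-of-and/xor folds can be checked exhaustively over booleans: ~A ^ B is the XNOR of A and B, and A & B is only 1 when A == B == 1, where the XNOR is already 1, so the OR is absorbed. A standalone verification sketch:

#include <cassert>

int main() {
  // 1-bit exhaustive check of (A & B) | (~A ^ B) == (~A ^ B); since the
  // identity holds per bit, it holds bitwise for any width.
  for (unsigned A = 0; A <= 1; ++A)
    for (unsigned B = 0; B <= 1; ++B) {
      unsigned NotA = A ^ 1; // 1-bit complement of A
      assert(((A & B) | (NotA ^ B)) == (NotA ^ B));
    }
  return 0;
}
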
diff --git a/lib/Analysis/ModuleSummaryAnalysis.cpp b/lib/Analysis/ModuleSummaryAnalysis.cpp
index 99f900ae3932..26706f5509ba 100644
--- a/lib/Analysis/ModuleSummaryAnalysis.cpp
+++ b/lib/Analysis/ModuleSummaryAnalysis.cpp
@@ -232,7 +232,7 @@ computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M,
}
// We should have named any anonymous globals
assert(CalledFunction->hasName());
- auto ScaledCount = ProfileSummaryInfo::getProfileCount(&I, BFI);
+ auto ScaledCount = PSI->getProfileCount(&I, BFI);
auto Hotness = ScaledCount ? getHotness(ScaledCount.getValue(), PSI)
: CalleeInfo::HotnessType::Unknown;
@@ -330,6 +330,7 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex(
const Module &M,
std::function<BlockFrequencyInfo *(const Function &F)> GetBFICallback,
ProfileSummaryInfo *PSI) {
+ assert(PSI);
ModuleSummaryIndex Index;
// Identify the local values in the llvm.used and llvm.compiler.used sets,
diff --git a/lib/Analysis/OptimizationDiagnosticInfo.cpp b/lib/Analysis/OptimizationDiagnosticInfo.cpp
index 73245981b022..e38e530c052d 100644
--- a/lib/Analysis/OptimizationDiagnosticInfo.cpp
+++ b/lib/Analysis/OptimizationDiagnosticInfo.cpp
@@ -101,7 +101,7 @@ void MappingTraits<DiagnosticInfoOptimizationBase *>::mapping(
// These are read-only for now.
DiagnosticLocation DL = OptDiag->getLocation();
StringRef FN =
- GlobalValue::getRealLinkageName(OptDiag->getFunction().getName());
+ GlobalValue::dropLLVMManglingEscape(OptDiag->getFunction().getName());
StringRef PassName(OptDiag->PassName);
io.mapRequired("Pass", PassName);
diff --git a/lib/Analysis/ProfileSummaryInfo.cpp b/lib/Analysis/ProfileSummaryInfo.cpp
index 1a53a8ed4283..502f4205b689 100644
--- a/lib/Analysis/ProfileSummaryInfo.cpp
+++ b/lib/Analysis/ProfileSummaryInfo.cpp
@@ -75,11 +75,14 @@ ProfileSummaryInfo::getProfileCount(const Instruction *Inst,
return None;
assert((isa<CallInst>(Inst) || isa<InvokeInst>(Inst)) &&
"We can only get profile count for call/invoke instruction.");
- // Check if there is a profile metadata on the instruction. If it is present,
- // determine hotness solely based on that.
- uint64_t TotalCount;
- if (Inst->extractProfTotalWeight(TotalCount))
- return TotalCount;
+ if (computeSummary() && Summary->getKind() == ProfileSummary::PSK_Sample) {
+ // In sample PGO mode, check if there is a profile metadata on the
+ // instruction. If it is present, determine hotness solely based on that,
+ // since the sampled entry count may not be accurate.
+ uint64_t TotalCount;
+ if (Inst->extractProfTotalWeight(TotalCount))
+ return TotalCount;
+ }
if (BFI)
return BFI->getBlockProfileCount(Inst->getParent());
return None;
diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index 01dca0793145..800354d2f5b4 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -584,7 +584,7 @@ CompareValueComplexity(SmallSet<std::pair<Value *, Value *>, 8> &EqCache,
static int CompareSCEVComplexity(
SmallSet<std::pair<const SCEV *, const SCEV *>, 8> &EqCacheSCEV,
const LoopInfo *const LI, const SCEV *LHS, const SCEV *RHS,
- unsigned Depth = 0) {
+ DominatorTree &DT, unsigned Depth = 0) {
// Fast-path: SCEVs are uniqued so we can do a quick equality check.
if (LHS == RHS)
return 0;
@@ -629,9 +629,16 @@ static int CompareSCEVComplexity(
const SCEVAddRecExpr *LA = cast<SCEVAddRecExpr>(LHS);
const SCEVAddRecExpr *RA = cast<SCEVAddRecExpr>(RHS);
- // Compare addrec loop depths.
+ // If there is a dominance relationship between the loops, sort by the
+ // dominance. Otherwise, sort by depth. We require such order in getAddExpr.
const Loop *LLoop = LA->getLoop(), *RLoop = RA->getLoop();
if (LLoop != RLoop) {
+ const BasicBlock *LHead = LLoop->getHeader(), *RHead = RLoop->getHeader();
+ assert(LHead != RHead && "Two loops share the same header?");
+ if (DT.dominates(LHead, RHead))
+ return 1;
+ else if (DT.dominates(RHead, LHead))
+ return -1;
unsigned LDepth = LLoop->getLoopDepth(), RDepth = RLoop->getLoopDepth();
if (LDepth != RDepth)
return (int)LDepth - (int)RDepth;
@@ -645,7 +652,7 @@ static int CompareSCEVComplexity(
// Lexicographically compare.
for (unsigned i = 0; i != LNumOps; ++i) {
int X = CompareSCEVComplexity(EqCacheSCEV, LI, LA->getOperand(i),
- RA->getOperand(i), Depth + 1);
+ RA->getOperand(i), DT, Depth + 1);
if (X != 0)
return X;
}
@@ -669,7 +676,7 @@ static int CompareSCEVComplexity(
if (i >= RNumOps)
return 1;
int X = CompareSCEVComplexity(EqCacheSCEV, LI, LC->getOperand(i),
- RC->getOperand(i), Depth + 1);
+ RC->getOperand(i), DT, Depth + 1);
if (X != 0)
return X;
}
@@ -683,10 +690,10 @@ static int CompareSCEVComplexity(
// Lexicographically compare udiv expressions.
int X = CompareSCEVComplexity(EqCacheSCEV, LI, LC->getLHS(), RC->getLHS(),
- Depth + 1);
+ DT, Depth + 1);
if (X != 0)
return X;
- X = CompareSCEVComplexity(EqCacheSCEV, LI, LC->getRHS(), RC->getRHS(),
+ X = CompareSCEVComplexity(EqCacheSCEV, LI, LC->getRHS(), RC->getRHS(), DT,
Depth + 1);
if (X == 0)
EqCacheSCEV.insert({LHS, RHS});
@@ -701,7 +708,7 @@ static int CompareSCEVComplexity(
// Compare cast expressions by operand.
int X = CompareSCEVComplexity(EqCacheSCEV, LI, LC->getOperand(),
- RC->getOperand(), Depth + 1);
+ RC->getOperand(), DT, Depth + 1);
if (X == 0)
EqCacheSCEV.insert({LHS, RHS});
return X;
@@ -724,7 +731,7 @@ static int CompareSCEVComplexity(
/// land in memory.
///
static void GroupByComplexity(SmallVectorImpl<const SCEV *> &Ops,
- LoopInfo *LI) {
+ LoopInfo *LI, DominatorTree &DT) {
if (Ops.size() < 2) return; // Noop
SmallSet<std::pair<const SCEV *, const SCEV *>, 8> EqCache;
@@ -732,15 +739,16 @@ static void GroupByComplexity(SmallVectorImpl<const SCEV *> &Ops,
// This is the common case, which also happens to be trivially simple.
// Special case it.
const SCEV *&LHS = Ops[0], *&RHS = Ops[1];
- if (CompareSCEVComplexity(EqCache, LI, RHS, LHS) < 0)
+ if (CompareSCEVComplexity(EqCache, LI, RHS, LHS, DT) < 0)
std::swap(LHS, RHS);
return;
}
// Do the rough sort by complexity.
std::stable_sort(Ops.begin(), Ops.end(),
- [&EqCache, LI](const SCEV *LHS, const SCEV *RHS) {
- return CompareSCEVComplexity(EqCache, LI, LHS, RHS) < 0;
+ [&EqCache, LI, &DT](const SCEV *LHS, const SCEV *RHS) {
+ return
+ CompareSCEVComplexity(EqCache, LI, LHS, RHS, DT) < 0;
});
// Now that we are sorted by complexity, group elements of the same
@@ -2186,7 +2194,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
#endif
// Sort by complexity, this groups all similar expression types together.
- GroupByComplexity(Ops, &LI);
+ GroupByComplexity(Ops, &LI, DT);
Flags = StrengthenNoWrapFlags(this, scAddExpr, Ops, Flags);
@@ -2492,7 +2500,13 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
// added together. If so, we can fold them.
for (unsigned OtherIdx = Idx+1;
OtherIdx < Ops.size() && isa<SCEVAddRecExpr>(Ops[OtherIdx]);
- ++OtherIdx)
+ ++OtherIdx) {
+ // We expect the AddRecExprs to be sorted in reverse dominance order,
+ // so that the 1st found AddRecExpr is dominated by all others.
+ assert(DT.dominates(
+ cast<SCEVAddRecExpr>(Ops[OtherIdx])->getLoop()->getHeader(),
+ AddRec->getLoop()->getHeader()) &&
+ "AddRecExprs are not sorted in reverse dominance order?");
if (AddRecLoop == cast<SCEVAddRecExpr>(Ops[OtherIdx])->getLoop()) {
// Other + {A,+,B}<L> + {C,+,D}<L> --> Other + {A+C,+,B+D}<L>
SmallVector<const SCEV *, 4> AddRecOps(AddRec->op_begin(),
@@ -2518,6 +2532,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
Ops[Idx] = getAddRecExpr(AddRecOps, AddRecLoop, SCEV::FlagAnyWrap);
return getAddExpr(Ops, SCEV::FlagAnyWrap, Depth + 1);
}
+ }
// Otherwise couldn't fold anything into this recurrence. Move onto the
// next one.
@@ -2614,7 +2629,7 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
#endif
// Sort by complexity, this groups all similar expression types together.
- GroupByComplexity(Ops, &LI);
+ GroupByComplexity(Ops, &LI, DT);
Flags = StrengthenNoWrapFlags(this, scMulExpr, Ops, Flags);
@@ -3211,7 +3226,7 @@ ScalarEvolution::getSMaxExpr(SmallVectorImpl<const SCEV *> &Ops) {
#endif
// Sort by complexity, this groups all similar expression types together.
- GroupByComplexity(Ops, &LI);
+ GroupByComplexity(Ops, &LI, DT);
// If there are any constants, fold them together.
unsigned Idx = 0;
@@ -3312,7 +3327,7 @@ ScalarEvolution::getUMaxExpr(SmallVectorImpl<const SCEV *> &Ops) {
#endif
// Sort by complexity, this groups all similar expression types together.
- GroupByComplexity(Ops, &LI);
+ GroupByComplexity(Ops, &LI, DT);
// If there are any constants, fold them together.
unsigned Idx = 0;
@@ -4636,7 +4651,7 @@ uint32_t ScalarEvolution::GetMinTrailingZerosImpl(const SCEV *S) {
KnownBits Known(BitWidth);
computeKnownBits(U->getValue(), Known, getDataLayout(), 0, &AC,
nullptr, &DT);
- return Known.Zero.countTrailingOnes();
+ return Known.countMinTrailingZeros();
}
// SCEVUDivExpr
@@ -5955,6 +5970,30 @@ bool ScalarEvolution::BackedgeTakenInfo::hasOperand(const SCEV *S,
return false;
}
+ScalarEvolution::ExitLimit::ExitLimit(const SCEV *E)
+ : ExactNotTaken(E), MaxNotTaken(E), MaxOrZero(false) {}
+
+ScalarEvolution::ExitLimit::ExitLimit(
+ const SCEV *E, const SCEV *M, bool MaxOrZero,
+ ArrayRef<const SmallPtrSetImpl<const SCEVPredicate *> *> PredSetList)
+ : ExactNotTaken(E), MaxNotTaken(M), MaxOrZero(MaxOrZero) {
+ assert((isa<SCEVCouldNotCompute>(ExactNotTaken) ||
+ !isa<SCEVCouldNotCompute>(MaxNotTaken)) &&
+ "Exact is not allowed to be less precise than Max");
+ for (auto *PredSet : PredSetList)
+ for (auto *P : *PredSet)
+ addPredicate(P);
+}
+
+ScalarEvolution::ExitLimit::ExitLimit(
+ const SCEV *E, const SCEV *M, bool MaxOrZero,
+ const SmallPtrSetImpl<const SCEVPredicate *> &PredSet)
+ : ExitLimit(E, M, MaxOrZero, {&PredSet}) {}
+
+ScalarEvolution::ExitLimit::ExitLimit(const SCEV *E, const SCEV *M,
+ bool MaxOrZero)
+ : ExitLimit(E, M, MaxOrZero, None) {}
+
/// Allocate memory for BackedgeTakenInfo and copy the not-taken count of each
/// computable exit into a persistent ExitNotTakenInfo array.
ScalarEvolution::BackedgeTakenInfo::BackedgeTakenInfo(
@@ -6637,13 +6676,12 @@ ScalarEvolution::ExitLimit ScalarEvolution::computeShiftCompareExitLimit(
// {K,ashr,<positive-constant>} stabilizes to signum(K) in at most
// bitwidth(K) iterations.
Value *FirstValue = PN->getIncomingValueForBlock(Predecessor);
- bool KnownZero, KnownOne;
- ComputeSignBit(FirstValue, KnownZero, KnownOne, DL, 0, nullptr,
- Predecessor->getTerminator(), &DT);
+ KnownBits Known = computeKnownBits(FirstValue, DL, 0, nullptr,
+ Predecessor->getTerminator(), &DT);
auto *Ty = cast<IntegerType>(RHS->getType());
- if (KnownZero)
+ if (Known.isNonNegative())
StableValue = ConstantInt::get(Ty, 0);
- else if (KnownOne)
+ else if (Known.isNegative())
StableValue = ConstantInt::get(Ty, -1, true);
else
return getCouldNotCompute();
@@ -7377,48 +7415,49 @@ SolveQuadraticEquation(const SCEVAddRecExpr *AddRec, ScalarEvolution &SE) {
const APInt &N = NC->getAPInt();
APInt Two(BitWidth, 2);
- {
- using namespace APIntOps;
- const APInt& C = L;
- // Convert from chrec coefficients to polynomial coefficients AX^2+BX+C
- // The B coefficient is M-N/2
- APInt B(M);
- B -= N.sdiv(Two);
+ // Convert from chrec coefficients to polynomial coefficients AX^2+BX+C
- // The A coefficient is N/2
- APInt A(N.sdiv(Two));
+ // The A coefficient is N/2
+ APInt A = N.sdiv(Two);
- // Compute the B^2-4ac term.
- APInt SqrtTerm(B);
- SqrtTerm *= B;
- SqrtTerm -= 4 * (A * C);
+ // The B coefficient is M-N/2
+ APInt B = M;
+ B -= A; // A is the same as N/2.
- if (SqrtTerm.isNegative()) {
- // The loop is provably infinite.
- return None;
- }
+ // The C coefficient is L.
+ const APInt& C = L;
- // Compute sqrt(B^2-4ac). This is guaranteed to be the nearest
- // integer value or else APInt::sqrt() will assert.
- APInt SqrtVal(SqrtTerm.sqrt());
+ // Compute the B^2-4ac term.
+ APInt SqrtTerm = B;
+ SqrtTerm *= B;
+ SqrtTerm -= 4 * (A * C);
- // Compute the two solutions for the quadratic formula.
- // The divisions must be performed as signed divisions.
- APInt NegB(-B);
- APInt TwoA(A << 1);
- if (TwoA.isMinValue())
- return None;
+ if (SqrtTerm.isNegative()) {
+ // The loop is provably infinite.
+ return None;
+ }
+
+ // Compute sqrt(B^2-4ac). This is guaranteed to be the nearest
+ // integer value or else APInt::sqrt() will assert.
+ APInt SqrtVal = SqrtTerm.sqrt();
+
+ // Compute the two solutions for the quadratic formula.
+ // The divisions must be performed as signed divisions.
+ APInt NegB = -std::move(B);
+ APInt TwoA = std::move(A);
+ TwoA <<= 1;
+ if (TwoA.isNullValue())
+ return None;
- LLVMContext &Context = SE.getContext();
+ LLVMContext &Context = SE.getContext();
- ConstantInt *Solution1 =
- ConstantInt::get(Context, (NegB + SqrtVal).sdiv(TwoA));
- ConstantInt *Solution2 =
- ConstantInt::get(Context, (NegB - SqrtVal).sdiv(TwoA));
+ ConstantInt *Solution1 =
+ ConstantInt::get(Context, (NegB + SqrtVal).sdiv(TwoA));
+ ConstantInt *Solution2 =
+ ConstantInt::get(Context, (NegB - SqrtVal).sdiv(TwoA));
- return std::make_pair(cast<SCEVConstant>(SE.getConstant(Solution1)),
- cast<SCEVConstant>(SE.getConstant(Solution2)));
- } // end APIntOps namespace
+ return std::make_pair(cast<SCEVConstant>(SE.getConstant(Solution1)),
+ cast<SCEVConstant>(SE.getConstant(Solution2)));
}
ScalarEvolution::ExitLimit
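
For reference, the polynomial this block solves: a chrec {L,+,M,+,N} evaluates at iteration x to L + Mx + Nx(x-1)/2, which rearranges to the quadratic whose coefficients appear in the comments above:

\[
\frac{N}{2}x^2 + \Bigl(M - \frac{N}{2}\Bigr)x + L \;=\; Ax^2 + Bx + C \;=\; 0,
\qquad
x \;=\; \frac{-B \pm \sqrt{B^2 - 4AC}}{2A}.
\]

The code bails out when the discriminant \(B^2 - 4AC\) is negative (the loop is provably infinite) and when \(2A = 0\) (the equation degenerates to linear).
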
@@ -8976,7 +9015,7 @@ bool ScalarEvolution::doesIVOverflowOnLT(const SCEV *RHS, const SCEV *Stride,
.getSignedMax();
// SMaxRHS + SMaxStrideMinusOne > SMaxValue => overflow!
- return (std::move(MaxValue) - std::move(MaxStrideMinusOne)).slt(MaxRHS);
+ return (std::move(MaxValue) - MaxStrideMinusOne).slt(MaxRHS);
}
APInt MaxRHS = getUnsignedRange(RHS).getUnsignedMax();
@@ -8985,7 +9024,7 @@ bool ScalarEvolution::doesIVOverflowOnLT(const SCEV *RHS, const SCEV *Stride,
.getUnsignedMax();
// UMaxRHS + UMaxStrideMinusOne > UMaxValue => overflow!
- return (std::move(MaxValue) - std::move(MaxStrideMinusOne)).ult(MaxRHS);
+ return (std::move(MaxValue) - MaxStrideMinusOne).ult(MaxRHS);
}
bool ScalarEvolution::doesIVOverflowOnGT(const SCEV *RHS, const SCEV *Stride,
@@ -9002,7 +9041,7 @@ bool ScalarEvolution::doesIVOverflowOnGT(const SCEV *RHS, const SCEV *Stride,
.getSignedMax();
// SMinRHS - SMaxStrideMinusOne < SMinValue => overflow!
- return (std::move(MinValue) + std::move(MaxStrideMinusOne)).sgt(MinRHS);
+ return (std::move(MinValue) + MaxStrideMinusOne).sgt(MinRHS);
}
APInt MinRHS = getUnsignedRange(RHS).getUnsignedMin();
@@ -9011,7 +9050,7 @@ bool ScalarEvolution::doesIVOverflowOnGT(const SCEV *RHS, const SCEV *Stride,
.getUnsignedMax();
// UMinRHS - UMaxStrideMinusOne < UMinValue => overflow!
- return (std::move(MinValue) + std::move(MaxStrideMinusOne)).ugt(MinRHS);
+ return (std::move(MinValue) + MaxStrideMinusOne).ugt(MinRHS);
}
const SCEV *ScalarEvolution::computeBECount(const SCEV *Delta, const SCEV *Step,
diff --git a/lib/Analysis/TargetLibraryInfo.cpp b/lib/Analysis/TargetLibraryInfo.cpp
index 848e1b4717b5..3cf1bbc5daa5 100644
--- a/lib/Analysis/TargetLibraryInfo.cpp
+++ b/lib/Analysis/TargetLibraryInfo.cpp
@@ -241,6 +241,50 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
TLI.setUnavailable(LibFunc_tanhf);
}
+ // These definitions are due to the math-finite.h header on Linux.
+ TLI.setUnavailable(LibFunc_acos_finite);
+ TLI.setUnavailable(LibFunc_acosf_finite);
+ TLI.setUnavailable(LibFunc_acosl_finite);
+ TLI.setUnavailable(LibFunc_acosh_finite);
+ TLI.setUnavailable(LibFunc_acoshf_finite);
+ TLI.setUnavailable(LibFunc_acoshl_finite);
+ TLI.setUnavailable(LibFunc_asin_finite);
+ TLI.setUnavailable(LibFunc_asinf_finite);
+ TLI.setUnavailable(LibFunc_asinl_finite);
+ TLI.setUnavailable(LibFunc_atan2_finite);
+ TLI.setUnavailable(LibFunc_atan2f_finite);
+ TLI.setUnavailable(LibFunc_atan2l_finite);
+ TLI.setUnavailable(LibFunc_atanh_finite);
+ TLI.setUnavailable(LibFunc_atanhf_finite);
+ TLI.setUnavailable(LibFunc_atanhl_finite);
+ TLI.setUnavailable(LibFunc_cosh_finite);
+ TLI.setUnavailable(LibFunc_coshf_finite);
+ TLI.setUnavailable(LibFunc_coshl_finite);
+ TLI.setUnavailable(LibFunc_exp10_finite);
+ TLI.setUnavailable(LibFunc_exp10f_finite);
+ TLI.setUnavailable(LibFunc_exp10l_finite);
+ TLI.setUnavailable(LibFunc_exp2_finite);
+ TLI.setUnavailable(LibFunc_exp2f_finite);
+ TLI.setUnavailable(LibFunc_exp2l_finite);
+ TLI.setUnavailable(LibFunc_exp_finite);
+ TLI.setUnavailable(LibFunc_expf_finite);
+ TLI.setUnavailable(LibFunc_expl_finite);
+ TLI.setUnavailable(LibFunc_log10_finite);
+ TLI.setUnavailable(LibFunc_log10f_finite);
+ TLI.setUnavailable(LibFunc_log10l_finite);
+ TLI.setUnavailable(LibFunc_log2_finite);
+ TLI.setUnavailable(LibFunc_log2f_finite);
+ TLI.setUnavailable(LibFunc_log2l_finite);
+ TLI.setUnavailable(LibFunc_log_finite);
+ TLI.setUnavailable(LibFunc_logf_finite);
+ TLI.setUnavailable(LibFunc_logl_finite);
+ TLI.setUnavailable(LibFunc_pow_finite);
+ TLI.setUnavailable(LibFunc_powf_finite);
+ TLI.setUnavailable(LibFunc_powl_finite);
+ TLI.setUnavailable(LibFunc_sinh_finite);
+ TLI.setUnavailable(LibFunc_sinhf_finite);
+ TLI.setUnavailable(LibFunc_sinhl_finite);
+
// Win32 does *not* provide these functions, but they are
// generally available on POSIX-compliant systems:
TLI.setUnavailable(LibFunc_access);
@@ -496,7 +540,7 @@ static StringRef sanitizeFunctionName(StringRef funcName) {
// Check for \01 prefix that is used to mangle __asm declarations and
// strip it if present.
- return GlobalValue::getRealLinkageName(funcName);
+ return GlobalValue::dropLLVMManglingEscape(funcName);
}
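
The renamed helper makes the intent explicit: a leading \01 byte tells LLVM not to mangle the symbol, and library-function lookup wants the raw name. A minimal sketch of its contract (the asserted names are illustrative):

    #include "llvm/ADT/StringRef.h"
    #include "llvm/IR/GlobalValue.h"
    #include <cassert>
    using namespace llvm;

    int main() {
      // The \01 escape suppresses LLVM's mangling; strip it before lookup.
      assert(GlobalValue::dropLLVMManglingEscape("\01_fwrite") == "_fwrite");
      // Names without the escape pass through unchanged.
      assert(GlobalValue::dropLLVMManglingEscape("fwrite") == "fwrite");
    }
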
bool TargetLibraryInfoImpl::getLibFunc(StringRef funcName,
@@ -1004,22 +1048,34 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
return (NumParams == 1 && FTy.getParamType(0)->isFloatingPointTy());
case LibFunc_acos:
+ case LibFunc_acos_finite:
case LibFunc_acosf:
+ case LibFunc_acosf_finite:
case LibFunc_acosh:
+ case LibFunc_acosh_finite:
case LibFunc_acoshf:
+ case LibFunc_acoshf_finite:
case LibFunc_acoshl:
+ case LibFunc_acoshl_finite:
case LibFunc_acosl:
+ case LibFunc_acosl_finite:
case LibFunc_asin:
+ case LibFunc_asin_finite:
case LibFunc_asinf:
+ case LibFunc_asinf_finite:
case LibFunc_asinh:
case LibFunc_asinhf:
case LibFunc_asinhl:
case LibFunc_asinl:
+ case LibFunc_asinl_finite:
case LibFunc_atan:
case LibFunc_atanf:
case LibFunc_atanh:
+ case LibFunc_atanh_finite:
case LibFunc_atanhf:
+ case LibFunc_atanhf_finite:
case LibFunc_atanhl:
+ case LibFunc_atanhl_finite:
case LibFunc_atanl:
case LibFunc_cbrt:
case LibFunc_cbrtf:
@@ -1030,18 +1086,30 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
case LibFunc_cos:
case LibFunc_cosf:
case LibFunc_cosh:
+ case LibFunc_cosh_finite:
case LibFunc_coshf:
+ case LibFunc_coshf_finite:
case LibFunc_coshl:
+ case LibFunc_coshl_finite:
case LibFunc_cosl:
case LibFunc_exp10:
+ case LibFunc_exp10_finite:
case LibFunc_exp10f:
+ case LibFunc_exp10f_finite:
case LibFunc_exp10l:
+ case LibFunc_exp10l_finite:
case LibFunc_exp2:
+ case LibFunc_exp2_finite:
case LibFunc_exp2f:
+ case LibFunc_exp2f_finite:
case LibFunc_exp2l:
+ case LibFunc_exp2l_finite:
case LibFunc_exp:
+ case LibFunc_exp_finite:
case LibFunc_expf:
+ case LibFunc_expf_finite:
case LibFunc_expl:
+ case LibFunc_expl_finite:
case LibFunc_expm1:
case LibFunc_expm1f:
case LibFunc_expm1l:
@@ -1052,20 +1120,29 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
case LibFunc_floorf:
case LibFunc_floorl:
case LibFunc_log10:
+ case LibFunc_log10_finite:
case LibFunc_log10f:
+ case LibFunc_log10f_finite:
case LibFunc_log10l:
+ case LibFunc_log10l_finite:
case LibFunc_log1p:
case LibFunc_log1pf:
case LibFunc_log1pl:
case LibFunc_log2:
+ case LibFunc_log2_finite:
case LibFunc_log2f:
+ case LibFunc_log2f_finite:
case LibFunc_log2l:
+ case LibFunc_log2l_finite:
case LibFunc_log:
+ case LibFunc_log_finite:
case LibFunc_logb:
case LibFunc_logbf:
case LibFunc_logbl:
case LibFunc_logf:
+ case LibFunc_logf_finite:
case LibFunc_logl:
+ case LibFunc_logl_finite:
case LibFunc_nearbyint:
case LibFunc_nearbyintf:
case LibFunc_nearbyintl:
@@ -1078,8 +1155,11 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
case LibFunc_sin:
case LibFunc_sinf:
case LibFunc_sinh:
+ case LibFunc_sinh_finite:
case LibFunc_sinhf:
+ case LibFunc_sinhf_finite:
case LibFunc_sinhl:
+ case LibFunc_sinhl_finite:
case LibFunc_sinl:
case LibFunc_sqrt:
case LibFunc_sqrt_finite:
@@ -1100,8 +1180,11 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
FTy.getReturnType() == FTy.getParamType(0));
case LibFunc_atan2:
+ case LibFunc_atan2_finite:
case LibFunc_atan2f:
+ case LibFunc_atan2f_finite:
case LibFunc_atan2l:
+ case LibFunc_atan2l_finite:
case LibFunc_fmin:
case LibFunc_fminf:
case LibFunc_fminl:
@@ -1115,8 +1198,11 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
case LibFunc_copysignf:
case LibFunc_copysignl:
case LibFunc_pow:
+ case LibFunc_pow_finite:
case LibFunc_powf:
+ case LibFunc_powf_finite:
case LibFunc_powl:
+ case LibFunc_powl_finite:
return (NumParams == 2 && FTy.getReturnType()->isFloatingPointTy() &&
FTy.getReturnType() == FTy.getParamType(0) &&
FTy.getReturnType() == FTy.getParamType(1));
@@ -1294,6 +1380,14 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib(
{"powf", "__svml_powf8", 8},
{"powf", "__svml_powf16", 16},
+ { "__pow_finite", "__svml_pow2", 2 },
+ { "__pow_finite", "__svml_pow4", 4 },
+ { "__pow_finite", "__svml_pow8", 8 },
+
+ { "__powf_finite", "__svml_powf4", 4 },
+ { "__powf_finite", "__svml_powf8", 8 },
+ { "__powf_finite", "__svml_powf16", 16 },
+
{"llvm.pow.f64", "__svml_pow2", 2},
{"llvm.pow.f64", "__svml_pow4", 4},
{"llvm.pow.f64", "__svml_pow8", 8},
@@ -1310,6 +1404,14 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib(
{"expf", "__svml_expf8", 8},
{"expf", "__svml_expf16", 16},
+ { "__exp_finite", "__svml_exp2", 2 },
+ { "__exp_finite", "__svml_exp4", 4 },
+ { "__exp_finite", "__svml_exp8", 8 },
+
+ { "__expf_finite", "__svml_expf4", 4 },
+ { "__expf_finite", "__svml_expf8", 8 },
+ { "__expf_finite", "__svml_expf16", 16 },
+
{"llvm.exp.f64", "__svml_exp2", 2},
{"llvm.exp.f64", "__svml_exp4", 4},
{"llvm.exp.f64", "__svml_exp8", 8},
@@ -1326,6 +1428,14 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib(
{"logf", "__svml_logf8", 8},
{"logf", "__svml_logf16", 16},
+ { "__log_finite", "__svml_log2", 2 },
+ { "__log_finite", "__svml_log4", 4 },
+ { "__log_finite", "__svml_log8", 8 },
+
+ { "__logf_finite", "__svml_logf4", 4 },
+ { "__logf_finite", "__svml_logf8", 8 },
+ { "__logf_finite", "__svml_logf16", 16 },
+
{"llvm.log.f64", "__svml_log2", 2},
{"llvm.log.f64", "__svml_log4", 4},
{"llvm.log.f64", "__svml_log8", 8},
diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp
index 26d606cce9bb..8a5d10473662 100644
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp
@@ -279,6 +279,10 @@ unsigned TargetTransformInfo::getRegisterBitWidth(bool Vector) const {
return TTIImpl->getRegisterBitWidth(Vector);
}
+unsigned TargetTransformInfo::getMinVectorRegisterBitWidth() const {
+ return TTIImpl->getMinVectorRegisterBitWidth();
+}
+
bool TargetTransformInfo::shouldConsiderAddressTypePromotion(
const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
return TTIImpl->shouldConsiderAddressTypePromotion(
@@ -500,6 +504,15 @@ unsigned TargetTransformInfo::getStoreVectorFactor(unsigned VF,
return TTIImpl->getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy);
}
+bool TargetTransformInfo::useReductionIntrinsic(unsigned Opcode,
+ Type *Ty, ReductionFlags Flags) const {
+ return TTIImpl->useReductionIntrinsic(Opcode, Ty, Flags);
+}
+
+bool TargetTransformInfo::shouldExpandReduction(const IntrinsicInst *II) const {
+ return TTIImpl->shouldExpandReduction(II);
+}
+
TargetTransformInfo::Concept::~Concept() {}
TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {}
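
The two new hooks give targets control over the experimental reduction intrinsics: useReductionIntrinsic asks whether the vectorizer should emit them at all, and shouldExpandReduction asks whether the generic expansion pass should lower one back to shuffles. A hypothetical target implementation (names and policy illustrative, not from the patch):

    #include "llvm/Analysis/TargetTransformInfo.h"
    #include "llvm/IR/Instruction.h"
    #include "llvm/IR/IntrinsicInst.h"
    using namespace llvm;

    // Sketch: prefer reduction intrinsics for vector integer adds and keep
    // them intact so the backend can lower them directly.
    struct MyTargetTTIImpl {
      bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
                                 TargetTransformInfo::ReductionFlags Flags) const {
        return Opcode == Instruction::Add && Ty->isVectorTy();
      }
      bool shouldExpandReduction(const IntrinsicInst *II) const {
        return false; // never fall back to the shuffle-based expansion
      }
    };
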
diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp
index a7f3ff672aef..cba7363a0afa 100644
--- a/lib/Analysis/ValueTracking.cpp
+++ b/lib/Analysis/ValueTracking.cpp
@@ -88,9 +88,8 @@ struct Query {
/// classic case of this is assume(x = y), which will attempt to determine
/// bits in x from bits in y, which will attempt to determine bits in y from
/// bits in x, etc. Regarding the mutual recursion, computeKnownBits can call
- /// isKnownNonZero, which calls computeKnownBits and ComputeSignBit and
- /// isKnownToBeAPowerOfTwo (all of which can call computeKnownBits), and so
- /// on.
+ /// isKnownNonZero, which calls computeKnownBits and isKnownToBeAPowerOfTwo
+ /// (both of which can call computeKnownBits), and so on.
std::array<const Value *, MaxDepth> Excluded;
unsigned NumExcluded;
@@ -143,6 +142,16 @@ void llvm::computeKnownBits(const Value *V, KnownBits &Known,
Query(DL, AC, safeCxtI(V, CxtI), DT, ORE));
}
+static KnownBits computeKnownBits(const Value *V, unsigned Depth,
+ const Query &Q);
+
+KnownBits llvm::computeKnownBits(const Value *V, const DataLayout &DL,
+ unsigned Depth, AssumptionCache *AC,
+ const Instruction *CxtI,
+ const DominatorTree *DT) {
+ return ::computeKnownBits(V, Depth, Query(DL, AC, safeCxtI(V, CxtI), DT));
+}
+
bool llvm::haveNoCommonBitsSet(const Value *LHS, const Value *RHS,
const DataLayout &DL,
AssumptionCache *AC, const Instruction *CxtI,
@@ -159,16 +168,6 @@ bool llvm::haveNoCommonBitsSet(const Value *LHS, const Value *RHS,
return (LHSKnown.Zero | RHSKnown.Zero).isAllOnesValue();
}
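
The new value-returning overload removes the declare-then-populate pattern at call sites, as the conversions below show. Typical use (a sketch; assumes the header supplies the usual default arguments for AC/CxtI/DT):

    #include "llvm/Analysis/ValueTracking.h"
    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/Value.h"
    #include "llvm/Support/KnownBits.h"
    using namespace llvm;

    // One expression instead of constructing a KnownBits and passing it
    // by reference.
    bool knownNonNegative(const Value *V, const DataLayout &DL) {
      KnownBits Known = computeKnownBits(V, DL, /*Depth=*/0);
      return Known.isNonNegative();
    }
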
-static void ComputeSignBit(const Value *V, bool &KnownZero, bool &KnownOne,
- unsigned Depth, const Query &Q);
-
-void llvm::ComputeSignBit(const Value *V, bool &KnownZero, bool &KnownOne,
- const DataLayout &DL, unsigned Depth,
- AssumptionCache *AC, const Instruction *CxtI,
- const DominatorTree *DT) {
- ::ComputeSignBit(V, KnownZero, KnownOne, Depth,
- Query(DL, AC, safeCxtI(V, CxtI), DT));
-}
static bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero, unsigned Depth,
const Query &Q);
@@ -194,9 +193,8 @@ bool llvm::isKnownNonNegative(const Value *V, const DataLayout &DL,
unsigned Depth,
AssumptionCache *AC, const Instruction *CxtI,
const DominatorTree *DT) {
- bool NonNegative, Negative;
- ComputeSignBit(V, NonNegative, Negative, DL, Depth, AC, CxtI, DT);
- return NonNegative;
+ KnownBits Known = computeKnownBits(V, DL, Depth, AC, CxtI, DT);
+ return Known.isNonNegative();
}
bool llvm::isKnownPositive(const Value *V, const DataLayout &DL, unsigned Depth,
@@ -214,9 +212,8 @@ bool llvm::isKnownPositive(const Value *V, const DataLayout &DL, unsigned Depth,
bool llvm::isKnownNegative(const Value *V, const DataLayout &DL, unsigned Depth,
AssumptionCache *AC, const Instruction *CxtI,
const DominatorTree *DT) {
- bool NonNegative, Negative;
- ComputeSignBit(V, NonNegative, Negative, DL, Depth, AC, CxtI, DT);
- return Negative;
+ KnownBits Known = computeKnownBits(V, DL, Depth, AC, CxtI, DT);
+ return Known.isNegative();
}
static bool isKnownNonEqual(const Value *V1, const Value *V2, const Query &Q);
@@ -342,10 +339,10 @@ static void computeKnownBitsMul(const Value *Op0, const Value *Op1, bool NSW,
// Also compute a conservative estimate for high known-0 bits.
// More trickiness is possible, but this is sufficient for the
// interesting case of alignment computation.
- unsigned TrailZ = Known.Zero.countTrailingOnes() +
- Known2.Zero.countTrailingOnes();
- unsigned LeadZ = std::max(Known.Zero.countLeadingOnes() +
- Known2.Zero.countLeadingOnes(),
+ unsigned TrailZ = Known.countMinTrailingZeros() +
+ Known2.countMinTrailingZeros();
+ unsigned LeadZ = std::max(Known.countMinLeadingZeros() +
+ Known2.countMinLeadingZeros(),
BitWidth) - BitWidth;
TrailZ = std::min(TrailZ, BitWidth);
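
The renamed KnownBits helpers state the min/max direction that the old Zero/One idioms left implicit: countMinTrailingZeros() is the old Zero.countTrailingOnes(), countMinLeadingOnes() the old One.countLeadingOnes(), and so on. A small worked instance (values illustrative):

    #include "llvm/Support/KnownBits.h"
    #include <cassert>
    using llvm::KnownBits;

    int main() {
      KnownBits Known(8);
      Known.Zero = 0x07; // bits 0..2 known zero
      Known.One  = 0x80; // bit 7 known one
      // At least three trailing zeros: Zero.countTrailingOnes() == 3.
      assert(Known.countMinTrailingZeros() == 3);
      // At least one leading one: One.countLeadingOnes() == 1.
      assert(Known.countMinLeadingOnes() == 1);
      // Bit 7 is known one, so no leading zero is possible at all.
      assert(Known.countMaxLeadingZeros() == 0);
    }
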
@@ -750,8 +747,8 @@ static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known,
computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
// Whatever high bits in c are zero are known to be zero.
- Known.Zero.setHighBits(RHSKnown.Zero.countLeadingOnes());
- // assume(v <_u c)
+ Known.Zero.setHighBits(RHSKnown.countMinLeadingZeros());
+ // assume(v <_u c)
} else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
Pred == ICmpInst::ICMP_ULT &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
@@ -761,9 +758,9 @@ static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known,
// Whatever high bits in c are zero are known to be zero (if c is a power
// of 2, then one more).
if (isKnownToBeAPowerOfTwo(A, false, Depth + 1, Query(Q, I)))
- Known.Zero.setHighBits(RHSKnown.Zero.countLeadingOnes()+1);
+ Known.Zero.setHighBits(RHSKnown.countMinLeadingZeros() + 1);
else
- Known.Zero.setHighBits(RHSKnown.Zero.countLeadingOnes());
+ Known.Zero.setHighBits(RHSKnown.countMinLeadingZeros());
}
}
@@ -916,7 +913,7 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
m_Value(Y))))) {
Known2.resetAll();
computeKnownBits(Y, Known2, Depth + 1, Q);
- if (Known2.One.countTrailingOnes() > 0)
+ if (Known2.countMinTrailingOnes() > 0)
Known.Zero.setBit(0);
}
break;
@@ -953,14 +950,13 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
// treat a udiv as a logical right shift by the power of 2 known to
// be less than the denominator.
computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
- unsigned LeadZ = Known2.Zero.countLeadingOnes();
+ unsigned LeadZ = Known2.countMinLeadingZeros();
Known2.resetAll();
computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q);
- unsigned RHSUnknownLeadingOnes = Known2.One.countLeadingZeros();
- if (RHSUnknownLeadingOnes != BitWidth)
- LeadZ = std::min(BitWidth,
- LeadZ + BitWidth - RHSUnknownLeadingOnes - 1);
+ unsigned RHSMaxLeadingZeros = Known2.countMaxLeadingZeros();
+ if (RHSMaxLeadingZeros != BitWidth)
+ LeadZ = std::min(BitWidth, LeadZ + BitWidth - RHSMaxLeadingZeros - 1);
Known.Zero.setHighBits(LeadZ);
break;
@@ -983,8 +979,8 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
if (Known.isNegative() && Known2.isNegative())
// We can derive a lower bound on the result by taking the max of the
// leading one bits.
- MaxHighOnes = std::max(Known.One.countLeadingOnes(),
- Known2.One.countLeadingOnes());
+ MaxHighOnes =
+ std::max(Known.countMinLeadingOnes(), Known2.countMinLeadingOnes());
// If either side is non-negative, the result is non-negative.
else if (Known.isNonNegative() || Known2.isNonNegative())
MaxHighZeros = 1;
@@ -993,8 +989,8 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
if (Known.isNonNegative() && Known2.isNonNegative())
// We can derive an upper bound on the result by taking the max of the
// leading zero bits.
- MaxHighZeros = std::max(Known.Zero.countLeadingOnes(),
- Known2.Zero.countLeadingOnes());
+ MaxHighZeros = std::max(Known.countMinLeadingZeros(),
+ Known2.countMinLeadingZeros());
// If either side is negative, the result is negative.
else if (Known.isNegative() || Known2.isNegative())
MaxHighOnes = 1;
@@ -1002,12 +998,12 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
// We can derive a lower bound on the result by taking the max of the
// leading one bits.
MaxHighOnes =
- std::max(Known.One.countLeadingOnes(), Known2.One.countLeadingOnes());
+ std::max(Known.countMinLeadingOnes(), Known2.countMinLeadingOnes());
} else if (SPF == SPF_UMIN) {
// We can derive an upper bound on the result by taking the max of the
// leading zero bits.
MaxHighZeros =
- std::max(Known.Zero.countLeadingOnes(), Known2.Zero.countLeadingOnes());
+ std::max(Known.countMinLeadingZeros(), Known2.countMinLeadingZeros());
}
// Only known if known in both the LHS and RHS.
@@ -1185,8 +1181,8 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q);
- unsigned Leaders = std::max(Known.Zero.countLeadingOnes(),
- Known2.Zero.countLeadingOnes());
+ unsigned Leaders =
+ std::max(Known.countMinLeadingZeros(), Known2.countMinLeadingZeros());
Known.resetAll();
Known.Zero.setHighBits(Leaders);
break;
@@ -1207,7 +1203,7 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
// to determine if we can prove known low zero bits.
KnownBits LocalKnown(BitWidth);
computeKnownBits(I->getOperand(0), LocalKnown, Depth + 1, Q);
- unsigned TrailZ = LocalKnown.Zero.countTrailingOnes();
+ unsigned TrailZ = LocalKnown.countMinTrailingZeros();
gep_type_iterator GTI = gep_type_begin(I);
for (unsigned i = 1, e = I->getNumOperands(); i != e; ++i, ++GTI) {
@@ -1241,7 +1237,7 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
computeKnownBits(Index, LocalKnown, Depth + 1, Q);
TrailZ = std::min(TrailZ,
unsigned(countTrailingZeros(TypeSize) +
- LocalKnown.Zero.countTrailingOnes()));
+ LocalKnown.countMinTrailingZeros()));
}
}
@@ -1286,8 +1282,8 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
KnownBits Known3(Known);
computeKnownBits(L, Known3, Depth + 1, Q);
- Known.Zero.setLowBits(std::min(Known2.Zero.countTrailingOnes(),
- Known3.Zero.countTrailingOnes()));
+ Known.Zero.setLowBits(std::min(Known2.countMinTrailingZeros(),
+ Known3.countMinTrailingZeros()));
if (DontImproveNonNegativePhiBits)
break;
@@ -1386,12 +1382,25 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
Known.Zero |= Known2.Zero.byteSwap();
Known.One |= Known2.One.byteSwap();
break;
- case Intrinsic::ctlz:
+ case Intrinsic::ctlz: {
+ computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
+ // If we have a known 1, its position is our upper bound.
+ unsigned PossibleLZ = Known2.One.countLeadingZeros();
+ // If this call is undefined for 0, the result will be less than 2^n.
+ if (II->getArgOperand(1) == ConstantInt::getTrue(II->getContext()))
+ PossibleLZ = std::min(PossibleLZ, BitWidth - 1);
+ unsigned LowBits = Log2_32(PossibleLZ)+1;
+ Known.Zero.setBitsFrom(LowBits);
+ break;
+ }
case Intrinsic::cttz: {
- unsigned LowBits = Log2_32(BitWidth)+1;
+ computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
+ // If we have a known 1, its position is our upper bound.
+ unsigned PossibleTZ = Known2.One.countTrailingZeros();
// If this call is undefined for 0, the result will be less than 2^n.
if (II->getArgOperand(1) == ConstantInt::getTrue(II->getContext()))
- LowBits -= 1;
+ PossibleTZ = std::min(PossibleTZ, BitWidth - 1);
+ unsigned LowBits = Log2_32(PossibleTZ)+1;
Known.Zero.setBitsFrom(LowBits);
break;
}
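
The tightened bound keys off where a set bit is already known rather than off the raw bit width: a known one at bit 5 caps cttz at 5, so only Log2_32(5)+1 = 3 low result bits can vary. A worked sketch of the cttz case (illustrative values):

    #include "llvm/Support/KnownBits.h"
    #include "llvm/Support/MathExtras.h"
    #include <cassert>
    using namespace llvm;

    int main() {
      unsigned BitWidth = 32;
      KnownBits Op(BitWidth);
      Op.One = 1u << 5;                 // bit 5 known one => cttz(x) <= 5
      unsigned PossibleTZ = Op.One.countTrailingZeros(); // 5
      unsigned LowBits = Log2_32(PossibleTZ) + 1;        // 3
      KnownBits Result(BitWidth);
      Result.Zero.setBitsFrom(LowBits); // bits 3..31 of the result are zero
      assert(PossibleTZ == 5 && LowBits == 3);
    }
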
@@ -1399,7 +1408,7 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
// We can bound the space the count needs. Also, bits known to be zero
// can't contribute to the population.
- unsigned BitsPossiblySet = BitWidth - Known2.Zero.countPopulation();
+ unsigned BitsPossiblySet = Known2.countMaxPopulation();
unsigned LowBits = Log2_32(BitsPossiblySet)+1;
Known.Zero.setBitsFrom(LowBits);
// TODO: we could bound KnownOne using the lower bound on the number
@@ -1450,6 +1459,14 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
}
/// Determine which bits of V are known to be either zero or one and return
+/// them.
+KnownBits computeKnownBits(const Value *V, unsigned Depth, const Query &Q) {
+ KnownBits Known(getBitWidth(V->getType(), Q.DL));
+ computeKnownBits(V, Known, Depth, Q);
+ return Known;
+}
+
+/// Determine which bits of V are known to be either zero or one and return
/// them in the Known bit set.
///
/// NOTE: we cannot consider 'undef' to be "IsZero" here. The problem is that
@@ -1568,16 +1585,6 @@ void computeKnownBits(const Value *V, KnownBits &Known, unsigned Depth,
assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?");
}
-/// Determine whether the sign bit is known to be zero or one.
-/// Convenience wrapper around computeKnownBits.
-void ComputeSignBit(const Value *V, bool &KnownZero, bool &KnownOne,
- unsigned Depth, const Query &Q) {
- KnownBits Bits(getBitWidth(V->getType(), Q.DL));
- computeKnownBits(V, Bits, Depth, Q);
- KnownOne = Bits.isNegative();
- KnownZero = Bits.isNonNegative();
-}
-
/// Return true if the given value is known to have exactly one
/// bit set when defined. For vectors return true if every element is known to
/// be a power of two when defined. Supports values with integer or pointer
@@ -1842,24 +1849,20 @@ bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) {
if (BO->isExact())
return isKnownNonZero(X, Depth, Q);
- bool XKnownNonNegative, XKnownNegative;
- ComputeSignBit(X, XKnownNonNegative, XKnownNegative, Depth, Q);
- if (XKnownNegative)
+ KnownBits Known = computeKnownBits(X, Depth, Q);
+ if (Known.isNegative())
return true;
// If the shifter operand is a constant, and all of the bits shifted
// out are known to be zero, and X is known non-zero then at least one
// non-zero bit must remain.
if (ConstantInt *Shift = dyn_cast<ConstantInt>(Y)) {
- KnownBits Known(BitWidth);
- computeKnownBits(X, Known, Depth, Q);
-
auto ShiftVal = Shift->getLimitedValue(BitWidth - 1);
// Is there a known one in the portion not shifted out?
- if (Known.One.countLeadingZeros() < BitWidth - ShiftVal)
+ if (Known.countMaxLeadingZeros() < BitWidth - ShiftVal)
return true;
// Are all the bits to be shifted out known zero?
- if (Known.Zero.countTrailingOnes() >= ShiftVal)
+ if (Known.countMinTrailingZeros() >= ShiftVal)
return isKnownNonZero(X, Depth, Q);
}
}
@@ -1869,39 +1872,34 @@ bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) {
}
// X + Y.
else if (match(V, m_Add(m_Value(X), m_Value(Y)))) {
- bool XKnownNonNegative, XKnownNegative;
- bool YKnownNonNegative, YKnownNegative;
- ComputeSignBit(X, XKnownNonNegative, XKnownNegative, Depth, Q);
- ComputeSignBit(Y, YKnownNonNegative, YKnownNegative, Depth, Q);
+ KnownBits XKnown = computeKnownBits(X, Depth, Q);
+ KnownBits YKnown = computeKnownBits(Y, Depth, Q);
// If X and Y are both non-negative (as signed values) then their sum is not
// zero unless both X and Y are zero.
- if (XKnownNonNegative && YKnownNonNegative)
+ if (XKnown.isNonNegative() && YKnown.isNonNegative())
if (isKnownNonZero(X, Depth, Q) || isKnownNonZero(Y, Depth, Q))
return true;
// If X and Y are both negative (as signed values) then their sum is not
// zero unless both X and Y equal INT_MIN.
- if (XKnownNegative && YKnownNegative) {
- KnownBits Known(BitWidth);
+ if (XKnown.isNegative() && YKnown.isNegative()) {
APInt Mask = APInt::getSignedMaxValue(BitWidth);
// The sign bit of X is set. If some other bit is set then X is not equal
// to INT_MIN.
- computeKnownBits(X, Known, Depth, Q);
- if (Known.One.intersects(Mask))
+ if (XKnown.One.intersects(Mask))
return true;
// The sign bit of Y is set. If some other bit is set then Y is not equal
// to INT_MIN.
- computeKnownBits(Y, Known, Depth, Q);
- if (Known.One.intersects(Mask))
+ if (YKnown.One.intersects(Mask))
return true;
}
// The sum of a non-negative number and a power of two is not zero.
- if (XKnownNonNegative &&
+ if (XKnown.isNonNegative() &&
isKnownToBeAPowerOfTwo(Y, /*OrZero*/ false, Depth, Q))
return true;
- if (YKnownNonNegative &&
+ if (YKnown.isNonNegative() &&
isKnownToBeAPowerOfTwo(X, /*OrZero*/ false, Depth, Q))
return true;
}
@@ -2276,14 +2274,7 @@ static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth,
// If we know that the sign bit is either zero or one, determine the number of
// identical bits in the top of the input value.
- if (Known.isNonNegative())
- return std::max(FirstAnswer, Known.Zero.countLeadingOnes());
-
- if (Known.isNegative())
- return std::max(FirstAnswer, Known.One.countLeadingOnes());
-
- // computeKnownBits gave us no extra information about the top bits.
- return FirstAnswer;
+ return std::max(FirstAnswer, Known.countMinSignBits());
}
/// This function computes the integer multiple of Base that equals V.
@@ -3441,8 +3432,8 @@ OverflowResult llvm::computeOverflowForUnsignedMul(const Value *LHS,
computeKnownBits(RHS, RHSKnown, DL, /*Depth=*/0, AC, CxtI, DT);
// Note that underestimating the number of zero bits gives a more
// conservative answer.
- unsigned ZeroBits = LHSKnown.Zero.countLeadingOnes() +
- RHSKnown.Zero.countLeadingOnes();
+ unsigned ZeroBits = LHSKnown.countMinLeadingZeros() +
+ RHSKnown.countMinLeadingZeros();
// First handle the easy case: if we have enough zero bits there's
// definitely no overflow.
if (ZeroBits >= BitWidth)
@@ -3475,21 +3466,17 @@ OverflowResult llvm::computeOverflowForUnsignedAdd(const Value *LHS,
AssumptionCache *AC,
const Instruction *CxtI,
const DominatorTree *DT) {
- bool LHSKnownNonNegative, LHSKnownNegative;
- ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, DL, /*Depth=*/0,
- AC, CxtI, DT);
- if (LHSKnownNonNegative || LHSKnownNegative) {
- bool RHSKnownNonNegative, RHSKnownNegative;
- ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, DL, /*Depth=*/0,
- AC, CxtI, DT);
-
- if (LHSKnownNegative && RHSKnownNegative) {
+ KnownBits LHSKnown = computeKnownBits(LHS, DL, /*Depth=*/0, AC, CxtI, DT);
+ if (LHSKnown.isNonNegative() || LHSKnown.isNegative()) {
+ KnownBits RHSKnown = computeKnownBits(RHS, DL, /*Depth=*/0, AC, CxtI, DT);
+
+ if (LHSKnown.isNegative() && RHSKnown.isNegative()) {
// The sign bit is set in both cases: this MUST overflow.
// Create a simple add instruction, and insert it into the struct.
return OverflowResult::AlwaysOverflows;
}
- if (LHSKnownNonNegative && RHSKnownNonNegative) {
+ if (LHSKnown.isNonNegative() && RHSKnown.isNonNegative()) {
// The sign bit is clear in both cases: this CANNOT overflow.
// Create a simple add instruction, and insert it into the struct.
return OverflowResult::NeverOverflows;
@@ -3499,6 +3486,51 @@ OverflowResult llvm::computeOverflowForUnsignedAdd(const Value *LHS,
return OverflowResult::MayOverflow;
}
+/// \brief Return true if we can prove that adding two values with the given
+/// KnownBits cannot overflow.
+/// Otherwise return false.
+static bool checkRippleForSignedAdd(const KnownBits &LHSKnown,
+ const KnownBits &RHSKnown) {
+ // Addition of two 2's complement numbers having opposite signs will never
+ // overflow.
+ if ((LHSKnown.isNegative() && RHSKnown.isNonNegative()) ||
+ (LHSKnown.isNonNegative() && RHSKnown.isNegative()))
+ return true;
+
+ // If either of the values is known to be non-negative, adding them can only
+ // overflow if the second is also non-negative, so we can assume that.
+ // Two non-negative numbers will only overflow if there is a carry into the
+ // sign bit, so we can check that, even when the values are as big as
+ // possible, there is no carry into the sign bit.
+ if (LHSKnown.isNonNegative() || RHSKnown.isNonNegative()) {
+ APInt MaxLHS = ~LHSKnown.Zero;
+ MaxLHS.clearSignBit();
+ APInt MaxRHS = ~RHSKnown.Zero;
+ MaxRHS.clearSignBit();
+ APInt Result = std::move(MaxLHS) + std::move(MaxRHS);
+ return Result.isSignBitClear();
+ }
+
+ // If either of the values is known to be negative, adding them can only
+ // overflow if the second is also negative, so we can assume that.
+ // Two negative numbers will only overflow if there is no carry into the
+ // sign bit, so we can check that, even when the values are as small as
+ // possible, there is still a carry into the sign bit.
+ if (LHSKnown.isNegative() || RHSKnown.isNegative()) {
+ APInt MinLHS = LHSKnown.One;
+ MinLHS.clearSignBit();
+ APInt MinRHS = RHSKnown.One;
+ MinRHS.clearSignBit();
+ APInt Result = std::move(MinLHS) + std::move(MinRHS);
+ return Result.isSignBitSet();
+ }
+
+ // If we reached here it means that we know nothing about the sign bits.
+ // In this case we can't know if there will be an overflow, since by
+ // changing the sign bits any two values can be made to overflow.
+ return false;
+}
+
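
The non-negative arm of checkRippleForSignedAdd can be exercised by hand: with both operands known non-negative and their top payload bits known zero, even the largest compatible values cannot carry into the sign bit. An 8-bit instance (standalone sketch, not patch code):

    #include "llvm/ADT/APInt.h"
    #include "llvm/Support/KnownBits.h"
    #include <cassert>
    using namespace llvm;

    int main() {
      KnownBits LHS(8), RHS(8);
      // Both known non-negative with bits 5..7 known zero: values <= 31.
      LHS.Zero = 0xE0;
      RHS.Zero = 0xE0;
      APInt MaxLHS = ~LHS.Zero; // 0x1F, the largest value LHS can hold
      MaxLHS.clearSignBit();
      APInt MaxRHS = ~RHS.Zero;
      MaxRHS.clearSignBit();
      // 31 + 31 = 62: no carry into bit 7, so the add cannot overflow.
      assert((MaxLHS + MaxRHS).isSignBitClear());
    }
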
static OverflowResult computeOverflowForSignedAdd(const Value *LHS,
const Value *RHS,
const AddOperator *Add,
@@ -3510,18 +3542,29 @@ static OverflowResult computeOverflowForSignedAdd(const Value *LHS,
return OverflowResult::NeverOverflows;
}
- bool LHSKnownNonNegative, LHSKnownNegative;
- bool RHSKnownNonNegative, RHSKnownNegative;
- ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, DL, /*Depth=*/0,
- AC, CxtI, DT);
- ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, DL, /*Depth=*/0,
- AC, CxtI, DT);
+ // If LHS and RHS each have at least two sign bits, the addition will look
+ // like
+ //
+ // XX..... +
+ // YY.....
+ //
+ // If the carry into the most significant position is 0, X and Y can't both
+ // be 1 and therefore the carry out of the addition is also 0.
+ //
+ // If the carry into the most significant position is 1, X and Y can't both
+ // be 0 and therefore the carry out of the addition is also 1.
+ //
+ // Since the carry into the most significant position is always equal to
+ // the carry out of the addition, there is no signed overflow.
+ if (ComputeNumSignBits(LHS, DL, 0, AC, CxtI, DT) > 1 &&
+ ComputeNumSignBits(RHS, DL, 0, AC, CxtI, DT) > 1)
+ return OverflowResult::NeverOverflows;
+
+ KnownBits LHSKnown = computeKnownBits(LHS, DL, /*Depth=*/0, AC, CxtI, DT);
+ KnownBits RHSKnown = computeKnownBits(RHS, DL, /*Depth=*/0, AC, CxtI, DT);
- if ((LHSKnownNonNegative && RHSKnownNegative) ||
- (LHSKnownNegative && RHSKnownNonNegative)) {
- // The sign bits are opposite: this CANNOT overflow.
+ if (checkRippleForSignedAdd(LHSKnown, RHSKnown))
return OverflowResult::NeverOverflows;
- }
// The remaining code needs Add to be available. Early returns if not so.
if (!Add)
@@ -3532,14 +3575,13 @@ static OverflowResult computeOverflowForSignedAdd(const Value *LHS,
// @llvm.assume'ed non-negative rather than proved so from analyzing its
// operands.
bool LHSOrRHSKnownNonNegative =
- (LHSKnownNonNegative || RHSKnownNonNegative);
- bool LHSOrRHSKnownNegative = (LHSKnownNegative || RHSKnownNegative);
+ (LHSKnown.isNonNegative() || RHSKnown.isNonNegative());
+ bool LHSOrRHSKnownNegative =
+ (LHSKnown.isNegative() || RHSKnown.isNegative());
if (LHSOrRHSKnownNonNegative || LHSOrRHSKnownNegative) {
- bool AddKnownNonNegative, AddKnownNegative;
- ComputeSignBit(Add, AddKnownNonNegative, AddKnownNegative, DL,
- /*Depth=*/0, AC, CxtI, DT);
- if ((AddKnownNonNegative && LHSOrRHSKnownNonNegative) ||
- (AddKnownNegative && LHSOrRHSKnownNegative)) {
+ KnownBits AddKnown = computeKnownBits(Add, DL, /*Depth=*/0, AC, CxtI, DT);
+ if ((AddKnown.isNonNegative() && LHSOrRHSKnownNonNegative) ||
+ (AddKnown.isNegative() && LHSOrRHSKnownNegative)) {
return OverflowResult::NeverOverflows;
}
}
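
The new early exit is the classic observation that two values with two or more sign bits each behave like narrower integers whose sum always fits. In 8-bit terms, two sign bits confine a value to [-64, 63], and even 63 + 63 = 126 is representable. A quick check (standalone sketch):

    #include "llvm/ADT/APInt.h"
    #include <cassert>
    using llvm::APInt;

    int main() {
      // Two sign bits in i8 confine the value to [-64, 63].
      APInt A(8, 63), B(8, 63);
      bool Overflow = false;
      APInt Sum = A.sadd_ov(B, Overflow); // 126, still fits in i8
      assert(!Overflow && Sum == 126);
    }
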
diff --git a/lib/Analysis/VectorUtils.cpp b/lib/Analysis/VectorUtils.cpp
index 722f17a8067e..2d2249da4e13 100644
--- a/lib/Analysis/VectorUtils.cpp
+++ b/lib/Analysis/VectorUtils.cpp
@@ -23,6 +23,7 @@
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
using namespace llvm;
using namespace llvm::PatternMatch;
diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp
index 97a567565b47..d7602c83435c 100644
--- a/lib/AsmParser/LLParser.cpp
+++ b/lib/AsmParser/LLParser.cpp
@@ -162,6 +162,10 @@ bool LLParser::ValidateEndOfModule() {
AS = AS.addAttributes(Context, AttributeList::FunctionIndex,
AttributeSet::get(Context, FnAttrs));
II->setAttributes(AS);
+ } else if (auto *GV = dyn_cast<GlobalVariable>(V)) {
+ AttrBuilder Attrs(GV->getAttributes());
+ Attrs.merge(B);
+ GV->setAttributes(AttributeSet::get(Context,Attrs));
} else {
llvm_unreachable("invalid object with forward attribute group reference");
}
@@ -832,10 +836,10 @@ bool LLParser::parseIndirectSymbol(
/// ParseGlobal
/// ::= GlobalVar '=' OptionalLinkage OptionalVisibility OptionalDLLStorageClass
/// OptionalThreadLocal OptionalUnnamedAddr OptionalAddrSpace
-/// OptionalExternallyInitialized GlobalType Type Const
+/// OptionalExternallyInitialized GlobalType Type Const OptionalAttrs
/// ::= OptionalLinkage OptionalVisibility OptionalDLLStorageClass
/// OptionalThreadLocal OptionalUnnamedAddr OptionalAddrSpace
-/// OptionalExternallyInitialized GlobalType Type Const
+/// OptionalExternallyInitialized GlobalType Type Const OptionalAttrs
///
/// Everything up to and including OptionalUnnamedAddr has been parsed
/// already.
@@ -950,6 +954,16 @@ bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc,
}
}
+ AttrBuilder Attrs;
+ LocTy BuiltinLoc;
+ std::vector<unsigned> FwdRefAttrGrps;
+ if (ParseFnAttributeValuePairs(Attrs, FwdRefAttrGrps, false, BuiltinLoc))
+ return true;
+ if (Attrs.hasAttributes() || !FwdRefAttrGrps.empty()) {
+ GV->setAttributes(AttributeSet::get(Context, Attrs));
+ ForwardRefAttrGroups[GV] = FwdRefAttrGrps;
+ }
+
return false;
}
diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp
index 580261a3b5e0..76298121566a 100644
--- a/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -93,13 +93,6 @@ static cl::opt<bool> PrintSummaryGUIDs(
cl::desc(
"Print the global id for each value when reading the module summary"));
-// FIXME: This flag should either be removed or moved to clang as a driver flag.
-static llvm::cl::opt<bool> IgnoreEmptyThinLTOIndexFile(
- "ignore-empty-index-file", llvm::cl::ZeroOrMore,
- llvm::cl::desc(
- "Ignore an empty index file and perform non-ThinLTO compilation"),
- llvm::cl::init(false));
-
namespace {
enum {
@@ -2750,7 +2743,7 @@ Error BitcodeReader::parseComdatRecord(ArrayRef<uint64_t> Record) {
Error BitcodeReader::parseGlobalVarRecord(ArrayRef<uint64_t> Record) {
// v1: [pointer type, isconst, initid, linkage, alignment, section,
// visibility, threadlocal, unnamed_addr, externally_initialized,
- // dllstorageclass, comdat] (name in VST)
+ // dllstorageclass, comdat, attributes] (name in VST)
// v2: [strtab_offset, strtab_size, v1]
StringRef Name;
std::tie(Name, Record) = readNameFromStrtab(Record);
@@ -2830,6 +2823,11 @@ Error BitcodeReader::parseGlobalVarRecord(ArrayRef<uint64_t> Record) {
} else if (hasImplicitComdat(RawLinkage)) {
NewGV->setComdat(reinterpret_cast<Comdat *>(1));
}
+
+ if (Record.size() > 12) {
+ auto AS = getAttributes(Record[12]).getFnAttributes();
+ NewGV->setAttributes(AS);
+ }
return Error::success();
}
@@ -5658,7 +5656,8 @@ Expected<bool> llvm::hasGlobalValueSummary(MemoryBufferRef Buffer) {
}
Expected<std::unique_ptr<ModuleSummaryIndex>>
-llvm::getModuleSummaryIndexForFile(StringRef Path) {
+llvm::getModuleSummaryIndexForFile(StringRef Path,
+ bool IgnoreEmptyThinLTOIndexFile) {
ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
MemoryBuffer::getFileOrSTDIN(Path);
if (!FileOrErr)
diff --git a/lib/Bitcode/Reader/MetadataLoader.cpp b/lib/Bitcode/Reader/MetadataLoader.cpp
index 42135e5949ce..d80e1da911ca 100644
--- a/lib/Bitcode/Reader/MetadataLoader.cpp
+++ b/lib/Bitcode/Reader/MetadataLoader.cpp
@@ -500,7 +500,7 @@ class MetadataLoader::MetadataLoaderImpl {
// Upgrade variables attached to globals.
for (auto &GV : TheModule.globals()) {
- SmallVector<MDNode *, 1> MDs, NewMDs;
+ SmallVector<MDNode *, 1> MDs;
GV.getMetadata(LLVMContext::MD_dbg, MDs);
GV.eraseMetadata(LLVMContext::MD_dbg);
for (auto *MD : MDs)
diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp
index 1b8d81a60201..1f8b50342c2d 100644
--- a/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -1109,7 +1109,7 @@ void ModuleBitcodeWriter::writeModuleInfo() {
// GLOBALVAR: [strtab offset, strtab size, type, isconst, initid,
// linkage, alignment, section, visibility, threadlocal,
// unnamed_addr, externally_initialized, dllstorageclass,
- // comdat]
+ // comdat, attributes]
Vals.push_back(StrtabBuilder.add(GV.getName()));
Vals.push_back(GV.getName().size());
Vals.push_back(VE.getTypeID(GV.getValueType()));
@@ -1124,13 +1124,17 @@ void ModuleBitcodeWriter::writeModuleInfo() {
GV.getUnnamedAddr() != GlobalValue::UnnamedAddr::None ||
GV.isExternallyInitialized() ||
GV.getDLLStorageClass() != GlobalValue::DefaultStorageClass ||
- GV.hasComdat()) {
+ GV.hasComdat() ||
+ GV.hasAttributes()) {
Vals.push_back(getEncodedVisibility(GV));
Vals.push_back(getEncodedThreadLocalMode(GV));
Vals.push_back(getEncodedUnnamedAddr(GV));
Vals.push_back(GV.isExternallyInitialized());
Vals.push_back(getEncodedDLLStorageClass(GV));
Vals.push_back(GV.hasComdat() ? VE.getComdatID(GV.getComdat()) : 0);
+
+ auto AL = GV.getAttributesAsList(AttributeList::FunctionIndex);
+ Vals.push_back(VE.getAttributeListID(AL));
} else {
AbbrevToUse = SimpleGVarAbbrev;
}
diff --git a/lib/Bitcode/Writer/ValueEnumerator.cpp b/lib/Bitcode/Writer/ValueEnumerator.cpp
index 861150766986..fd76400331d9 100644
--- a/lib/Bitcode/Writer/ValueEnumerator.cpp
+++ b/lib/Bitcode/Writer/ValueEnumerator.cpp
@@ -314,10 +314,13 @@ ValueEnumerator::ValueEnumerator(const Module &M,
// Remember what is the cutoff between globalvalue's and other constants.
unsigned FirstConstant = Values.size();
- // Enumerate the global variable initializers.
- for (const GlobalVariable &GV : M.globals())
+ // Enumerate the global variable initializers and attributes.
+ for (const GlobalVariable &GV : M.globals()) {
if (GV.hasInitializer())
EnumerateValue(GV.getInitializer());
+ if (GV.hasAttributes())
+ EnumerateAttributes(GV.getAttributesAsList(AttributeList::FunctionIndex));
+ }
// Enumerate the aliasees.
for (const GlobalAlias &GA : M.aliases())
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 76549540ce0f..73fc2b35fe4e 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -21,5 +21,5 @@ add_subdirectory(LineEditor)
add_subdirectory(ProfileData)
add_subdirectory(Fuzzer)
add_subdirectory(Passes)
-add_subdirectory(LibDriver)
+add_subdirectory(ToolDrivers)
add_subdirectory(XRay)
diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
index 87b45c001de4..98163bffb60b 100644
--- a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -767,7 +767,7 @@ void CodeViewDebug::emitDebugInfoForFunction(const Function *GV,
// If our DISubprogram name is empty, use the mangled name.
if (FuncName.empty())
- FuncName = GlobalValue::getRealLinkageName(GV->getName());
+ FuncName = GlobalValue::dropLLVMManglingEscape(GV->getName());
// Emit a symbol subsection, required by VS2012+ to find function boundaries.
OS.AddComment("Symbol subsection for " + Twine(FuncName));
@@ -888,13 +888,21 @@ void CodeViewDebug::collectVariableInfoFromMFTable(
if (!Scope)
continue;
+ // If the variable has an attached offset expression, extract it.
+ // FIXME: Try to handle DW_OP_deref as well.
+ int64_t ExprOffset = 0;
+ if (VI.Expr)
+ if (!VI.Expr->extractIfOffset(ExprOffset))
+ continue;
+
// Get the frame register used and the offset.
unsigned FrameReg = 0;
int FrameOffset = TFI->getFrameIndexReference(*Asm->MF, VI.Slot, FrameReg);
uint16_t CVReg = TRI->getCodeViewRegNum(FrameReg);
// Calculate the label ranges.
- LocalVarDefRange DefRange = createDefRangeMem(CVReg, FrameOffset);
+ LocalVarDefRange DefRange =
+ createDefRangeMem(CVReg, FrameOffset + ExprOffset);
for (const InsnRange &Range : Scope->getRanges()) {
const MCSymbol *Begin = getLabelBeforeInsn(Range.first);
const MCSymbol *End = getLabelAfterInsn(Range.second);
@@ -2194,7 +2202,7 @@ void CodeViewDebug::emitDebugInfoForGlobals() {
if (GV->hasComdat()) {
MCSymbol *GVSym = Asm->getSymbol(GV);
OS.AddComment("Symbol subsection for " +
- Twine(GlobalValue::getRealLinkageName(GV->getName())));
+ Twine(GlobalValue::dropLLVMManglingEscape(GV->getName())));
switchToDebugSectionForSymbol(GVSym);
EndLabel = beginCVSubsection(ModuleDebugFragmentKind::Symbols);
// FIXME: emitDebugInfoForGlobal() doesn't handle DIExpressions.
diff --git a/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
index 1d63e33a4d33..826162ad47c4 100644
--- a/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
+++ b/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
@@ -129,10 +129,9 @@ bool hasDebugInfo(const MachineModuleInfo *MMI, const MachineFunction *MF) {
}
void DebugHandlerBase::beginFunction(const MachineFunction *MF) {
- assert(Asm);
PrevInstBB = nullptr;
- if (!hasDebugInfo(MMI, MF)) {
+ if (!Asm || !hasDebugInfo(MMI, MF)) {
skippedNonDebugFunction();
return;
}
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index 738e062cb93f..e172712cf889 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -440,7 +440,7 @@ DIE *DwarfCompileUnit::constructInlinedScopeDIE(LexicalScope *Scope) {
auto *InlinedSP = getDISubprogram(DS);
// Find the subprogram's DwarfCompileUnit in the SPMap in case the subprogram
// was inlined from another compile unit.
- DIE *OriginDIE = DU->getAbstractSPDies()[InlinedSP];
+ DIE *OriginDIE = getAbstractSPDies()[InlinedSP];
assert(OriginDIE && "Unable to find original DIE for an inlined subprogram.");
auto ScopeDIE = DIE::get(DIEValueAllocator, dwarf::DW_TAG_inlined_subroutine);
@@ -634,7 +634,7 @@ DIE *DwarfCompileUnit::createAndAddScopeChildren(LexicalScope *Scope,
void DwarfCompileUnit::constructAbstractSubprogramScopeDIE(
LexicalScope *Scope) {
- DIE *&AbsDef = DU->getAbstractSPDies()[Scope->getScopeNode()];
+ DIE *&AbsDef = getAbstractSPDies()[Scope->getScopeNode()];
if (AbsDef)
return;
@@ -696,7 +696,7 @@ DIE *DwarfCompileUnit::constructImportedEntityDIE(
void DwarfCompileUnit::finishSubprogramDefinition(const DISubprogram *SP) {
DIE *D = getDIE(SP);
- if (DIE *AbsSPDIE = DU->getAbstractSPDies().lookup(SP)) {
+ if (DIE *AbsSPDIE = getAbstractSPDies().lookup(SP)) {
if (D)
// If this subprogram has an abstract definition, reference that
addDIEEntry(*D, dwarf::DW_AT_abstract_origin, *AbsSPDIE);
@@ -708,6 +708,42 @@ void DwarfCompileUnit::finishSubprogramDefinition(const DISubprogram *SP) {
}
}
+void DwarfCompileUnit::finishVariableDefinition(const DbgVariable &Var) {
+ DbgVariable *AbsVar = getExistingAbstractVariable(
+ InlinedVariable(Var.getVariable(), Var.getInlinedAt()));
+ auto *VariableDie = Var.getDIE();
+ if (AbsVar && AbsVar->getDIE()) {
+ addDIEEntry(*VariableDie, dwarf::DW_AT_abstract_origin,
+ *AbsVar->getDIE());
+ } else
+ applyVariableAttributes(Var, *VariableDie);
+}
+
+DbgVariable *DwarfCompileUnit::getExistingAbstractVariable(InlinedVariable IV) {
+ const DILocalVariable *Cleansed;
+ return getExistingAbstractVariable(IV, Cleansed);
+}
+
+// Find abstract variable, if any, associated with Var.
+DbgVariable *DwarfCompileUnit::getExistingAbstractVariable(
+ InlinedVariable IV, const DILocalVariable *&Cleansed) {
+ // More than one inlined variable corresponds to one abstract variable.
+ Cleansed = IV.first;
+ auto &AbstractVariables = getAbstractVariables();
+ auto I = AbstractVariables.find(Cleansed);
+ if (I != AbstractVariables.end())
+ return I->second.get();
+ return nullptr;
+}
+
+void DwarfCompileUnit::createAbstractVariable(const DILocalVariable *Var,
+ LexicalScope *Scope) {
+ assert(Scope && Scope->isAbstractScope());
+ auto AbsDbgVariable = make_unique<DbgVariable>(Var, /* IA */ nullptr);
+ DU->addScopeVariable(Scope, AbsDbgVariable.get());
+ getAbstractVariables()[Var] = std::move(AbsDbgVariable);
+}
+
void DwarfCompileUnit::emitHeader(bool UseOffsets) {
// Don't bother labeling the .dwo unit, as its offset isn't used.
if (!Skeleton) {
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
index 20a415150b4d..77e9e671529f 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
@@ -68,6 +68,9 @@ class DwarfCompileUnit final : public DwarfUnit {
// ranges/locs.
const MCSymbol *BaseAddress;
+ DenseMap<const MDNode *, DIE *> AbstractSPDies;
+ DenseMap<const MDNode *, std::unique_ptr<DbgVariable>> AbstractVariables;
+
/// \brief Construct a DIE for the given DbgVariable without initializing the
/// DbgVariable's DIE reference.
DIE *constructVariableDIEImpl(const DbgVariable &DV, bool Abstract);
@@ -76,6 +79,18 @@ class DwarfCompileUnit final : public DwarfUnit {
bool includeMinimalInlineScopes() const;
+ DenseMap<const MDNode *, DIE *> &getAbstractSPDies() {
+ if (isDwoUnit() && !DD->shareAcrossDWOCUs())
+ return AbstractSPDies;
+ return DU->getAbstractSPDies();
+ }
+
+ DenseMap<const MDNode *, std::unique_ptr<DbgVariable>> &getAbstractVariables() {
+ if (isDwoUnit() && !DD->shareAcrossDWOCUs())
+ return AbstractVariables;
+ return DU->getAbstractVariables();
+ }
+
public:
DwarfCompileUnit(unsigned UID, const DICompileUnit *Node, AsmPrinter *A,
DwarfDebug *DW, DwarfFile *DWU);
@@ -189,6 +204,13 @@ public:
DIE *constructImportedEntityDIE(const DIImportedEntity *Module);
void finishSubprogramDefinition(const DISubprogram *SP);
+ void finishVariableDefinition(const DbgVariable &Var);
+ /// Find abstract variable associated with Var.
+ typedef DbgValueHistoryMap::InlinedVariable InlinedVariable;
+ DbgVariable *getExistingAbstractVariable(InlinedVariable IV,
+ const DILocalVariable *&Cleansed);
+ DbgVariable *getExistingAbstractVariable(InlinedVariable IV);
+ void createAbstractVariable(const DILocalVariable *DV, LexicalScope *Scope);
/// Set the skeleton unit associated with this unit.
void setSkeleton(DwarfCompileUnit &Skel) { Skeleton = &Skel; }
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 6f442f5c3172..3410b98d7776 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -71,6 +71,10 @@ static cl::opt<bool> GenerateARangeSection("generate-arange-section",
cl::desc("Generate dwarf aranges"),
cl::init(false));
+static cl::opt<bool> SplitDwarfCrossCuReferences(
+ "split-dwarf-cross-cu-references", cl::Hidden,
+ cl::desc("Enable cross-cu references in DWO files"), cl::init(false));
+
namespace {
enum DefaultOnOff { Default, Enable, Disable };
}
@@ -362,21 +366,29 @@ template <typename Func> static void forBothCUs(DwarfCompileUnit &CU, Func F) {
F(*SkelCU);
}
-void DwarfDebug::constructAbstractSubprogramScopeDIE(LexicalScope *Scope) {
+bool DwarfDebug::shareAcrossDWOCUs() const {
+ return SplitDwarfCrossCuReferences;
+}
+
+void DwarfDebug::constructAbstractSubprogramScopeDIE(DwarfCompileUnit &SrcCU,
+ LexicalScope *Scope) {
assert(Scope && Scope->getScopeNode());
assert(Scope->isAbstractScope());
assert(!Scope->getInlinedAt());
auto *SP = cast<DISubprogram>(Scope->getScopeNode());
- ProcessedSPNodes.insert(SP);
-
// Find the subprogram's DwarfCompileUnit in the SPMap in case the subprogram
// was inlined from another compile unit.
auto &CU = *CUMap.lookup(SP->getUnit());
- forBothCUs(CU, [&](DwarfCompileUnit &CU) {
+ if (auto *SkelCU = CU.getSkeleton()) {
+ (shareAcrossDWOCUs() ? CU : SrcCU)
+ .constructAbstractSubprogramScopeDIE(Scope);
+ if (CU.getCUNode()->getSplitDebugInlining())
+ SkelCU->constructAbstractSubprogramScopeDIE(Scope);
+ } else {
CU.constructAbstractSubprogramScopeDIE(Scope);
- });
+ }
}
void DwarfDebug::addGnuPubAttributes(DwarfUnit &U, DIE &D) const {
@@ -564,13 +576,7 @@ void DwarfDebug::finishVariableDefinitions() {
// DIE::getUnit isn't simple - it walks parent pointers, etc.
DwarfCompileUnit *Unit = CUDieMap.lookup(VariableDie->getUnitDie());
assert(Unit);
- DbgVariable *AbsVar = getExistingAbstractVariable(
- InlinedVariable(Var->getVariable(), Var->getInlinedAt()));
- if (AbsVar && AbsVar->getDIE()) {
- Unit->addDIEEntry(*VariableDie, dwarf::DW_AT_abstract_origin,
- *AbsVar->getDIE());
- } else
- Unit->applyVariableAttributes(*Var, *VariableDie);
+ Unit->finishVariableDefinition(*Var);
}
}
@@ -718,58 +724,32 @@ void DwarfDebug::endModule() {
}
// clean up.
- AbstractVariables.clear();
+ // FIXME: AbstractVariables.clear();
}
-// Find abstract variable, if any, associated with Var.
-DbgVariable *
-DwarfDebug::getExistingAbstractVariable(InlinedVariable IV,
- const DILocalVariable *&Cleansed) {
- // More then one inlined variable corresponds to one abstract variable.
- Cleansed = IV.first;
- auto I = AbstractVariables.find(Cleansed);
- if (I != AbstractVariables.end())
- return I->second.get();
- return nullptr;
-}
-
-DbgVariable *DwarfDebug::getExistingAbstractVariable(InlinedVariable IV) {
- const DILocalVariable *Cleansed;
- return getExistingAbstractVariable(IV, Cleansed);
-}
-
-void DwarfDebug::createAbstractVariable(const DILocalVariable *Var,
- LexicalScope *Scope) {
- assert(Scope && Scope->isAbstractScope());
- auto AbsDbgVariable = make_unique<DbgVariable>(Var, /* IA */ nullptr);
- InfoHolder.addScopeVariable(Scope, AbsDbgVariable.get());
- AbstractVariables[Var] = std::move(AbsDbgVariable);
-}
-
-void DwarfDebug::ensureAbstractVariableIsCreated(InlinedVariable IV,
+void DwarfDebug::ensureAbstractVariableIsCreated(DwarfCompileUnit &CU, InlinedVariable IV,
const MDNode *ScopeNode) {
const DILocalVariable *Cleansed = nullptr;
- if (getExistingAbstractVariable(IV, Cleansed))
+ if (CU.getExistingAbstractVariable(IV, Cleansed))
return;
- createAbstractVariable(Cleansed, LScopes.getOrCreateAbstractScope(
+ CU.createAbstractVariable(Cleansed, LScopes.getOrCreateAbstractScope(
cast<DILocalScope>(ScopeNode)));
}
-void DwarfDebug::ensureAbstractVariableIsCreatedIfScoped(
+void DwarfDebug::ensureAbstractVariableIsCreatedIfScoped(DwarfCompileUnit &CU,
InlinedVariable IV, const MDNode *ScopeNode) {
const DILocalVariable *Cleansed = nullptr;
- if (getExistingAbstractVariable(IV, Cleansed))
+ if (CU.getExistingAbstractVariable(IV, Cleansed))
return;
if (LexicalScope *Scope =
LScopes.findAbstractScope(cast_or_null<DILocalScope>(ScopeNode)))
- createAbstractVariable(Cleansed, Scope);
+ CU.createAbstractVariable(Cleansed, Scope);
}
-
// Collect variable information from side table maintained by MF.
void DwarfDebug::collectVariableInfoFromMFTable(
- DenseSet<InlinedVariable> &Processed) {
+ DwarfCompileUnit &TheCU, DenseSet<InlinedVariable> &Processed) {
for (const auto &VI : Asm->MF->getVariableDbgInfo()) {
if (!VI.Var)
continue;
@@ -784,7 +764,7 @@ void DwarfDebug::collectVariableInfoFromMFTable(
if (!Scope)
continue;
- ensureAbstractVariableIsCreatedIfScoped(Var, Scope->getScopeNode());
+ ensureAbstractVariableIsCreatedIfScoped(TheCU, Var, Scope->getScopeNode());
auto RegVar = make_unique<DbgVariable>(Var.first, Var.second);
RegVar->initializeMMI(VI.Expr, VI.Slot);
if (InfoHolder.addScopeVariable(Scope, RegVar.get()))
@@ -955,9 +935,10 @@ DwarfDebug::buildLocationList(SmallVectorImpl<DebugLocEntry> &DebugLoc,
}
}
-DbgVariable *DwarfDebug::createConcreteVariable(LexicalScope &Scope,
+DbgVariable *DwarfDebug::createConcreteVariable(DwarfCompileUnit &TheCU,
+ LexicalScope &Scope,
InlinedVariable IV) {
- ensureAbstractVariableIsCreatedIfScoped(IV, Scope.getScopeNode());
+ ensureAbstractVariableIsCreatedIfScoped(TheCU, IV, Scope.getScopeNode());
ConcreteVariables.push_back(make_unique<DbgVariable>(IV.first, IV.second));
InfoHolder.addScopeVariable(&Scope, ConcreteVariables.back().get());
return ConcreteVariables.back().get();
@@ -980,7 +961,7 @@ void DwarfDebug::collectVariableInfo(DwarfCompileUnit &TheCU,
const DISubprogram *SP,
DenseSet<InlinedVariable> &Processed) {
// Grab the variable info that was squirreled away in the MMI side-table.
- collectVariableInfoFromMFTable(Processed);
+ collectVariableInfoFromMFTable(TheCU, Processed);
for (const auto &I : DbgValues) {
InlinedVariable IV = I.first;
@@ -1002,7 +983,7 @@ void DwarfDebug::collectVariableInfo(DwarfCompileUnit &TheCU,
continue;
Processed.insert(IV);
- DbgVariable *RegVar = createConcreteVariable(*Scope, IV);
+ DbgVariable *RegVar = createConcreteVariable(TheCU, *Scope, IV);
const MachineInstr *MInsn = Ranges.front().first;
assert(MInsn->isDebugValue() && "History must begin with debug value");
@@ -1038,7 +1019,7 @@ void DwarfDebug::collectVariableInfo(DwarfCompileUnit &TheCU,
for (const DILocalVariable *DV : SP->getVariables()) {
if (Processed.insert(InlinedVariable(DV, nullptr)).second)
if (LexicalScope *Scope = LScopes.findLexicalScope(DV->getScope()))
- createConcreteVariable(*Scope, InlinedVariable(DV, nullptr));
+ createConcreteVariable(TheCU, *Scope, InlinedVariable(DV, nullptr));
}
}
@@ -1229,12 +1210,12 @@ void DwarfDebug::endFunctionImpl(const MachineFunction *MF) {
for (const DILocalVariable *DV : SP->getVariables()) {
if (!ProcessedVars.insert(InlinedVariable(DV, nullptr)).second)
continue;
- ensureAbstractVariableIsCreated(InlinedVariable(DV, nullptr),
+ ensureAbstractVariableIsCreated(TheCU, InlinedVariable(DV, nullptr),
DV->getScope());
assert(LScopes.getAbstractScopesList().size() == NumAbstractScopes
&& "ensureAbstractVariableIsCreated inserted abstract scopes");
}
- constructAbstractSubprogramScopeDIE(AScope);
+ constructAbstractSubprogramScopeDIE(TheCU, AScope);
}
ProcessedSPNodes.insert(SP);
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h
index 8a96e7867b6e..b9c5aa9ffb23 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -210,7 +210,6 @@ class DwarfDebug : public DebugHandlerBase {
DenseMap<const MCSymbol *, uint64_t> SymSize;
/// Collection of abstract variables.
- DenseMap<const MDNode *, std::unique_ptr<DbgVariable>> AbstractVariables;
SmallVector<std::unique_ptr<DbgVariable>, 64> ConcreteVariables;
/// Collection of DebugLocEntry. Stored in a linked list so that DIELocLists
@@ -313,20 +312,16 @@ class DwarfDebug : public DebugHandlerBase {
typedef DbgValueHistoryMap::InlinedVariable InlinedVariable;
- /// Find abstract variable associated with Var.
- DbgVariable *getExistingAbstractVariable(InlinedVariable IV,
- const DILocalVariable *&Cleansed);
- DbgVariable *getExistingAbstractVariable(InlinedVariable IV);
- void createAbstractVariable(const DILocalVariable *DV, LexicalScope *Scope);
- void ensureAbstractVariableIsCreated(InlinedVariable Var,
+ void ensureAbstractVariableIsCreated(DwarfCompileUnit &CU, InlinedVariable Var,
const MDNode *Scope);
- void ensureAbstractVariableIsCreatedIfScoped(InlinedVariable Var,
+ void ensureAbstractVariableIsCreatedIfScoped(DwarfCompileUnit &CU, InlinedVariable Var,
const MDNode *Scope);
- DbgVariable *createConcreteVariable(LexicalScope &Scope, InlinedVariable IV);
+ DbgVariable *createConcreteVariable(DwarfCompileUnit &TheCU,
+ LexicalScope &Scope, InlinedVariable IV);
/// Construct a DIE for this abstract scope.
- void constructAbstractSubprogramScopeDIE(LexicalScope *Scope);
+ void constructAbstractSubprogramScopeDIE(DwarfCompileUnit &SrcCU, LexicalScope *Scope);
void finishVariableDefinitions();
@@ -446,7 +441,8 @@ class DwarfDebug : public DebugHandlerBase {
const DbgValueHistoryMap::InstrRanges &Ranges);
/// Collect variable information from the side table maintained by MF.
- void collectVariableInfoFromMFTable(DenseSet<InlinedVariable> &P);
+ void collectVariableInfoFromMFTable(DwarfCompileUnit &TheCU,
+ DenseSet<InlinedVariable> &P);
protected:
/// Gather pre-function debug information.
@@ -518,6 +514,8 @@ public:
/// split dwarf proposal support.
bool useSplitDwarf() const { return HasSplitDwarf; }
+ bool shareAcrossDWOCUs() const;
+
/// Returns the Dwarf Version.
uint16_t getDwarfVersion() const;
diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.h b/lib/CodeGen/AsmPrinter/DwarfFile.h
index d4d2ed277274..54924e9806ed 100644
--- a/lib/CodeGen/AsmPrinter/DwarfFile.h
+++ b/lib/CodeGen/AsmPrinter/DwarfFile.h
@@ -53,6 +53,7 @@ class DwarfFile {
// Collection of abstract subprogram DIEs.
DenseMap<const MDNode *, DIE *> AbstractSPDies;
+ DenseMap<const MDNode *, std::unique_ptr<DbgVariable>> AbstractVariables;
/// Maps MDNodes for type system with the corresponding DIEs. These DIEs can
/// be shared across CUs, that is why we keep the map here instead
@@ -105,6 +106,9 @@ public:
DenseMap<const MDNode *, DIE *> &getAbstractSPDies() {
return AbstractSPDies;
}
+ DenseMap<const MDNode *, std::unique_ptr<DbgVariable>> &getAbstractVariables() {
+ return AbstractVariables;
+ }
void insertDIE(const MDNode *TypeMD, DIE *Die) {
DITypeNodeToDieMap.insert(std::make_pair(TypeMD, Die));
diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index 8d25def7772c..667afbb450bd 100644
--- a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -173,7 +173,7 @@ int64_t DwarfUnit::getDefaultLowerBound() const {
}
/// Check whether the DIE for this MDNode can be shared across CUs.
-static bool isShareableAcrossCUs(const DINode *D) {
+bool DwarfUnit::isShareableAcrossCUs(const DINode *D) const {
// When the MDNode can be part of the type system, the DIE can be shared
// across CUs.
// Combining type units and cross-CU DIE sharing is lower value (since
@@ -181,6 +181,8 @@ static bool isShareableAcrossCUs(const DINode *D) {
// level already) but may be implementable for some value in projects
// building multiple independent libraries with LTO and then linking those
// together.
+ if (isDwoUnit() && !DD->shareAcrossDWOCUs())
+ return false;
return (isa<DIType>(D) ||
(isa<DISubprogram>(D) && !cast<DISubprogram>(D)->isDefinition())) &&
!GenerateDwarfTypeUnits;
@@ -645,7 +647,7 @@ void DwarfUnit::addLinkageName(DIE &Die, StringRef LinkageName) {
addString(Die,
DD->getDwarfVersion() >= 4 ? dwarf::DW_AT_linkage_name
: dwarf::DW_AT_MIPS_linkage_name,
- GlobalValue::getRealLinkageName(LinkageName));
+ GlobalValue::dropLLVMManglingEscape(LinkageName));
}
void DwarfUnit::addTemplateParams(DIE &Buffer, DINodeArray TParams) {
diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.h b/lib/CodeGen/AsmPrinter/DwarfUnit.h
index 8fc841703e23..7acad2cbd89f 100644
--- a/lib/CodeGen/AsmPrinter/DwarfUnit.h
+++ b/lib/CodeGen/AsmPrinter/DwarfUnit.h
@@ -65,7 +65,7 @@ public:
//===----------------------------------------------------------------------===//
/// This dwarf writer support class manages information associated with a
/// source file.
- class DwarfUnit : public DIEUnit {
+class DwarfUnit : public DIEUnit {
protected:
/// MDNode for the compile unit.
const DICompileUnit *CUNode;
@@ -103,6 +103,9 @@ protected:
bool applySubprogramDefinitionAttributes(const DISubprogram *SP, DIE &SPDie);
+ bool shareAcrossDWOCUs() const;
+ bool isShareableAcrossCUs(const DINode *D) const;
+
public:
// Accessors.
AsmPrinter* getAsmPrinter() const { return Asm; }
diff --git a/lib/CodeGen/AsmPrinter/WinException.cpp b/lib/CodeGen/AsmPrinter/WinException.cpp
index 704f0ac2f191..815658bfb637 100644
--- a/lib/CodeGen/AsmPrinter/WinException.cpp
+++ b/lib/CodeGen/AsmPrinter/WinException.cpp
@@ -101,7 +101,7 @@ void WinException::beginFunction(const MachineFunction *MF) {
// functions may still refer to it.
const WinEHFuncInfo &FuncInfo = *MF->getWinEHFuncInfo();
StringRef FLinkageName =
- GlobalValue::getRealLinkageName(MF->getFunction()->getName());
+ GlobalValue::dropLLVMManglingEscape(MF->getFunction()->getName());
emitEHRegistrationOffsetLabel(FuncInfo, FLinkageName);
}
shouldEmitLSDA = hasEHFunclets;
@@ -174,7 +174,7 @@ static MCSymbol *getMCSymbolForMBB(AsmPrinter *Asm,
// their funclet entry block's number.
const MachineFunction *MF = MBB->getParent();
const Function *F = MF->getFunction();
- StringRef FuncLinkageName = GlobalValue::getRealLinkageName(F->getName());
+ StringRef FuncLinkageName = GlobalValue::dropLLVMManglingEscape(F->getName());
MCContext &Ctx = MF->getContext();
StringRef HandlerPrefix = MBB->isCleanupFuncletEntry() ? "dtor" : "catch";
return Ctx.getOrCreateSymbol("?" + HandlerPrefix + "$" +
@@ -252,7 +252,7 @@ void WinException::endFunclet() {
!CurrentFuncletEntry->isCleanupFuncletEntry()) {
// If this is a C++ catch funclet (or the parent function),
// emit a reference to the LSDA for the parent function.
- StringRef FuncLinkageName = GlobalValue::getRealLinkageName(F->getName());
+ StringRef FuncLinkageName = GlobalValue::dropLLVMManglingEscape(F->getName());
MCSymbol *FuncInfoXData = Asm->OutContext.getOrCreateSymbol(
Twine("$cppxdata$", FuncLinkageName));
Asm->OutStreamer->EmitValue(create32bitRef(FuncInfoXData), 4);
@@ -536,7 +536,7 @@ void WinException::emitCSpecificHandlerTable(const MachineFunction *MF) {
// Emit a label assignment with the SEH frame offset so we can use it for
// llvm.x86.seh.recoverfp.
StringRef FLinkageName =
- GlobalValue::getRealLinkageName(MF->getFunction()->getName());
+ GlobalValue::dropLLVMManglingEscape(MF->getFunction()->getName());
MCSymbol *ParentFrameOffset =
Ctx.getOrCreateParentFrameOffsetSymbol(FLinkageName);
const MCExpr *MCOffset =
@@ -635,7 +635,7 @@ void WinException::emitCXXFrameHandler3Table(const MachineFunction *MF) {
auto &OS = *Asm->OutStreamer;
const WinEHFuncInfo &FuncInfo = *MF->getWinEHFuncInfo();
- StringRef FuncLinkageName = GlobalValue::getRealLinkageName(F->getName());
+ StringRef FuncLinkageName = GlobalValue::dropLLVMManglingEscape(F->getName());
SmallVector<std::pair<const MCExpr *, int>, 4> IPToStateTable;
MCSymbol *FuncInfoXData = nullptr;
@@ -942,7 +942,7 @@ void WinException::emitEHRegistrationOffsetLabel(const WinEHFuncInfo &FuncInfo,
void WinException::emitExceptHandlerTable(const MachineFunction *MF) {
MCStreamer &OS = *Asm->OutStreamer;
const Function *F = MF->getFunction();
- StringRef FLinkageName = GlobalValue::getRealLinkageName(F->getName());
+ StringRef FLinkageName = GlobalValue::dropLLVMManglingEscape(F->getName());
bool VerboseAsm = OS.isVerboseAsm();
auto AddComment = [&](const Twine &Comment) {
diff --git a/lib/CodeGen/AtomicExpandPass.cpp b/lib/CodeGen/AtomicExpandPass.cpp
index 9c19a4fd3c3e..17e6be05eb42 100644
--- a/lib/CodeGen/AtomicExpandPass.cpp
+++ b/lib/CodeGen/AtomicExpandPass.cpp
@@ -47,8 +47,7 @@ namespace {
bool runOnFunction(Function &F) override;
private:
- bool bracketInstWithFences(Instruction *I, AtomicOrdering Order,
- bool IsStore, bool IsLoad);
+ bool bracketInstWithFences(Instruction *I, AtomicOrdering Order);
IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL);
LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI);
bool tryExpandAtomicLoad(LoadInst *LI);
@@ -224,22 +223,16 @@ bool AtomicExpand::runOnFunction(Function &F) {
if (TLI->shouldInsertFencesForAtomic(I)) {
auto FenceOrdering = AtomicOrdering::Monotonic;
- bool IsStore, IsLoad;
if (LI && isAcquireOrStronger(LI->getOrdering())) {
FenceOrdering = LI->getOrdering();
LI->setOrdering(AtomicOrdering::Monotonic);
- IsStore = false;
- IsLoad = true;
} else if (SI && isReleaseOrStronger(SI->getOrdering())) {
FenceOrdering = SI->getOrdering();
SI->setOrdering(AtomicOrdering::Monotonic);
- IsStore = true;
- IsLoad = false;
} else if (RMWI && (isReleaseOrStronger(RMWI->getOrdering()) ||
isAcquireOrStronger(RMWI->getOrdering()))) {
FenceOrdering = RMWI->getOrdering();
RMWI->setOrdering(AtomicOrdering::Monotonic);
- IsStore = IsLoad = true;
} else if (CASI && !TLI->shouldExpandAtomicCmpXchgInIR(CASI) &&
(isReleaseOrStronger(CASI->getSuccessOrdering()) ||
isAcquireOrStronger(CASI->getSuccessOrdering()))) {
@@ -250,11 +243,10 @@ bool AtomicExpand::runOnFunction(Function &F) {
FenceOrdering = CASI->getSuccessOrdering();
CASI->setSuccessOrdering(AtomicOrdering::Monotonic);
CASI->setFailureOrdering(AtomicOrdering::Monotonic);
- IsStore = IsLoad = true;
}
if (FenceOrdering != AtomicOrdering::Monotonic) {
- MadeChange |= bracketInstWithFences(I, FenceOrdering, IsStore, IsLoad);
+ MadeChange |= bracketInstWithFences(I, FenceOrdering);
}
}
@@ -320,13 +312,12 @@ bool AtomicExpand::runOnFunction(Function &F) {
return MadeChange;
}
-bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order,
- bool IsStore, bool IsLoad) {
+bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order) {
IRBuilder<> Builder(I);
- auto LeadingFence = TLI->emitLeadingFence(Builder, Order, IsStore, IsLoad);
+ auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order);
- auto TrailingFence = TLI->emitTrailingFence(Builder, Order, IsStore, IsLoad);
+ auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order);
// The trailing fence is emitted before the instruction instead of after
// because there is no easy way of setting Builder insertion point after
// an instruction. So we must erase it from the BB, and insert it back
@@ -1048,8 +1039,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
std::prev(BB->end())->eraseFromParent();
Builder.SetInsertPoint(BB);
if (ShouldInsertFencesForAtomic && UseUnconditionalReleaseBarrier)
- TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true,
- /*IsLoad=*/true);
+ TLI->emitLeadingFence(Builder, CI, SuccessOrder);
Builder.CreateBr(StartBB);
// Start the main loop block now that we've taken care of the preliminaries.
@@ -1064,8 +1054,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
Builder.SetInsertPoint(ReleasingStoreBB);
if (ShouldInsertFencesForAtomic && !UseUnconditionalReleaseBarrier)
- TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true,
- /*IsLoad=*/true);
+ TLI->emitLeadingFence(Builder, CI, SuccessOrder);
Builder.CreateBr(TryStoreBB);
Builder.SetInsertPoint(TryStoreBB);
@@ -1094,8 +1083,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
// necessary.
Builder.SetInsertPoint(SuccessBB);
if (ShouldInsertFencesForAtomic)
- TLI->emitTrailingFence(Builder, SuccessOrder, /*IsStore=*/true,
- /*IsLoad=*/true);
+ TLI->emitTrailingFence(Builder, CI, SuccessOrder);
Builder.CreateBr(ExitBB);
Builder.SetInsertPoint(NoStoreBB);
@@ -1107,8 +1095,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
Builder.SetInsertPoint(FailureBB);
if (ShouldInsertFencesForAtomic)
- TLI->emitTrailingFence(Builder, FailureOrder, /*IsStore=*/true,
- /*IsLoad=*/true);
+ TLI->emitTrailingFence(Builder, CI, FailureOrder);
Builder.CreateBr(ExitBB);
// Finally, we have control-flow based knowledge of whether the cmpxchg
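With the IsStore/IsLoad flags gone, a target's fence hooks receive the atomic instruction itself and can classify it directly. Below is a minimal free-standing sketch of the policy an override of TargetLoweringBase::emitLeadingFence can now express; the function name and the policy are hypothetical, while the parameter shape follows this diff.

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/Support/AtomicOrdering.h"
    using namespace llvm;

    // Hypothetical policy: only stores need a leading release fence; loads
    // and all other atomic operations get no leading fence at all.
    static Instruction *emitLeadingFenceSketch(IRBuilder<> &Builder,
                                               Instruction *Inst,
                                               AtomicOrdering Ord) {
      if (isa<StoreInst>(Inst) && isReleaseOrStronger(Ord))
        return Builder.CreateFence(AtomicOrdering::Release);
      return nullptr;
    }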
diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt
index 26da748fa244..55a27e2fb79e 100644
--- a/lib/CodeGen/CMakeLists.txt
+++ b/lib/CodeGen/CMakeLists.txt
@@ -23,6 +23,7 @@ add_llvm_library(LLVMCodeGen
ExecutionDepsFix.cpp
ExpandISelPseudos.cpp
ExpandPostRAPseudos.cpp
+ ExpandReductions.cpp
FaultMaps.cpp
FEntryInserter.cpp
FuncletLayout.cpp
@@ -48,6 +49,7 @@ add_llvm_library(LLVMCodeGen
LivePhysRegs.cpp
LiveRangeCalc.cpp
LiveRangeEdit.cpp
+ LiveRangeShrink.cpp
LiveRegMatrix.cpp
LiveRegUnits.cpp
LiveStackAnalysis.cpp
@@ -118,6 +120,7 @@ add_llvm_library(LLVMCodeGen
SafeStack.cpp
SafeStackColoring.cpp
SafeStackLayout.cpp
+ ScalarizeMaskedMemIntrin.cpp
ScheduleDAG.cpp
ScheduleDAGInstrs.cpp
ScheduleDAGPrinter.cpp
diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp
index 3fc12ccc3b60..4d30c6574b12 100644
--- a/lib/CodeGen/CodeGen.cpp
+++ b/lib/CodeGen/CodeGen.cpp
@@ -43,6 +43,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeLiveDebugValuesPass(Registry);
initializeLiveDebugVariablesPass(Registry);
initializeLiveIntervalsPass(Registry);
+ initializeLiveRangeShrinkPass(Registry);
initializeLiveStacksPass(Registry);
initializeLiveVariablesPass(Registry);
initializeLocalStackSlotPassPass(Registry);
@@ -79,7 +80,8 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeRAGreedyPass(Registry);
initializeRegisterCoalescerPass(Registry);
initializeRenameIndependentSubregsPass(Registry);
- initializeSafeStackPass(Registry);
+ initializeSafeStackLegacyPassPass(Registry);
+ initializeScalarizeMaskedMemIntrinPass(Registry);
initializeShrinkWrapPass(Registry);
initializeSlotIndexesPass(Registry);
initializeStackColoringPass(Registry);
diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index c6c93811a0f9..f2e024c5e3bd 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -295,7 +295,7 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
if (PSI->isFunctionHotInCallGraph(&F))
F.setSectionPrefix(".hot");
else if (PSI->isFunctionColdInCallGraph(&F))
- F.setSectionPrefix(".cold");
+ F.setSectionPrefix(".unlikely");
}
/// This optimization identifies DIV instructions that can be
@@ -1549,519 +1549,6 @@ static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI,
return MadeChange;
}
-// Translate a masked load intrinsic like
-// <16 x i32 > @llvm.masked.load( <16 x i32>* %addr, i32 align,
-// <16 x i1> %mask, <16 x i32> %passthru)
-// to a chain of basic blocks, loading each element individually if
-// the appropriate mask bit is set
-//
-// %1 = bitcast i8* %addr to i32*
-// %2 = extractelement <16 x i1> %mask, i32 0
-// %3 = icmp eq i1 %2, true
-// br i1 %3, label %cond.load, label %else
-//
-//cond.load: ; preds = %0
-// %4 = getelementptr i32* %1, i32 0
-// %5 = load i32* %4
-// %6 = insertelement <16 x i32> undef, i32 %5, i32 0
-// br label %else
-//
-//else: ; preds = %0, %cond.load
-// %res.phi.else = phi <16 x i32> [ %6, %cond.load ], [ undef, %0 ]
-// %7 = extractelement <16 x i1> %mask, i32 1
-// %8 = icmp eq i1 %7, true
-// br i1 %8, label %cond.load1, label %else2
-//
-//cond.load1: ; preds = %else
-// %9 = getelementptr i32* %1, i32 1
-// %10 = load i32* %9
-// %11 = insertelement <16 x i32> %res.phi.else, i32 %10, i32 1
-// br label %else2
-//
-//else2: ; preds = %else, %cond.load1
-// %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
-// %12 = extractelement <16 x i1> %mask, i32 2
-// %13 = icmp eq i1 %12, true
-// br i1 %13, label %cond.load4, label %else5
-//
-static void scalarizeMaskedLoad(CallInst *CI) {
- Value *Ptr = CI->getArgOperand(0);
- Value *Alignment = CI->getArgOperand(1);
- Value *Mask = CI->getArgOperand(2);
- Value *Src0 = CI->getArgOperand(3);
-
- unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
- VectorType *VecType = dyn_cast<VectorType>(CI->getType());
- assert(VecType && "Unexpected return type of masked load intrinsic");
-
- Type *EltTy = CI->getType()->getVectorElementType();
-
- IRBuilder<> Builder(CI->getContext());
- Instruction *InsertPt = CI;
- BasicBlock *IfBlock = CI->getParent();
- BasicBlock *CondBlock = nullptr;
- BasicBlock *PrevIfBlock = CI->getParent();
-
- Builder.SetInsertPoint(InsertPt);
- Builder.SetCurrentDebugLocation(CI->getDebugLoc());
-
- // Short-cut if the mask is all-true.
- bool IsAllOnesMask = isa<Constant>(Mask) &&
- cast<Constant>(Mask)->isAllOnesValue();
-
- if (IsAllOnesMask) {
- Value *NewI = Builder.CreateAlignedLoad(Ptr, AlignVal);
- CI->replaceAllUsesWith(NewI);
- CI->eraseFromParent();
- return;
- }
-
- // Adjust alignment for the scalar instruction.
- AlignVal = std::min(AlignVal, VecType->getScalarSizeInBits()/8);
- // Bitcast %addr from i8* to EltTy*
- Type *NewPtrType =
- EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
- Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
- unsigned VectorWidth = VecType->getNumElements();
-
- Value *UndefVal = UndefValue::get(VecType);
-
- // The result vector
- Value *VResult = UndefVal;
-
- if (isa<ConstantVector>(Mask)) {
- for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
- if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
- continue;
- Value *Gep =
- Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
- LoadInst* Load = Builder.CreateAlignedLoad(Gep, AlignVal);
- VResult = Builder.CreateInsertElement(VResult, Load,
- Builder.getInt32(Idx));
- }
- Value *NewI = Builder.CreateSelect(Mask, VResult, Src0);
- CI->replaceAllUsesWith(NewI);
- CI->eraseFromParent();
- return;
- }
-
- PHINode *Phi = nullptr;
- Value *PrevPhi = UndefVal;
-
- for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
-
- // Fill the "else" block, created in the previous iteration
- //
- // %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
- // %mask_1 = extractelement <16 x i1> %mask, i32 Idx
- // %to_load = icmp eq i1 %mask_1, true
- // br i1 %to_load, label %cond.load, label %else
- //
- if (Idx > 0) {
- Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
- Phi->addIncoming(VResult, CondBlock);
- Phi->addIncoming(PrevPhi, PrevIfBlock);
- PrevPhi = Phi;
- VResult = Phi;
- }
-
- Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
- Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
- ConstantInt::get(Predicate->getType(), 1));
-
- // Create "cond" block
- //
- // %EltAddr = getelementptr i32* %1, i32 0
- // %Elt = load i32* %EltAddr
- // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
- //
- CondBlock = IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.load");
- Builder.SetInsertPoint(InsertPt);
-
- Value *Gep =
- Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
- LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal);
- VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx));
-
- // Create "else" block, fill it in the next iteration
- BasicBlock *NewIfBlock =
- CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
- Builder.SetInsertPoint(InsertPt);
- Instruction *OldBr = IfBlock->getTerminator();
- BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
- OldBr->eraseFromParent();
- PrevIfBlock = IfBlock;
- IfBlock = NewIfBlock;
- }
-
- Phi = Builder.CreatePHI(VecType, 2, "res.phi.select");
- Phi->addIncoming(VResult, CondBlock);
- Phi->addIncoming(PrevPhi, PrevIfBlock);
- Value *NewI = Builder.CreateSelect(Mask, Phi, Src0);
- CI->replaceAllUsesWith(NewI);
- CI->eraseFromParent();
-}
-
-// Translate a masked store intrinsic, like
-// void @llvm.masked.store(<16 x i32> %src, <16 x i32>* %addr, i32 align,
-// <16 x i1> %mask)
-// to a chain of basic blocks that store each element individually if
-// the appropriate mask bit is set
-//
-// %1 = bitcast i8* %addr to i32*
-// %2 = extractelement <16 x i1> %mask, i32 0
-// %3 = icmp eq i1 %2, true
-// br i1 %3, label %cond.store, label %else
-//
-// cond.store: ; preds = %0
-// %4 = extractelement <16 x i32> %val, i32 0
-// %5 = getelementptr i32* %1, i32 0
-// store i32 %4, i32* %5
-// br label %else
-//
-// else: ; preds = %0, %cond.store
-// %6 = extractelement <16 x i1> %mask, i32 1
-// %7 = icmp eq i1 %6, true
-// br i1 %7, label %cond.store1, label %else2
-//
-// cond.store1: ; preds = %else
-// %8 = extractelement <16 x i32> %val, i32 1
-// %9 = getelementptr i32* %1, i32 1
-// store i32 %8, i32* %9
-// br label %else2
-// . . .
-static void scalarizeMaskedStore(CallInst *CI) {
- Value *Src = CI->getArgOperand(0);
- Value *Ptr = CI->getArgOperand(1);
- Value *Alignment = CI->getArgOperand(2);
- Value *Mask = CI->getArgOperand(3);
-
- unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
- VectorType *VecType = dyn_cast<VectorType>(Src->getType());
- assert(VecType && "Unexpected data type in masked store intrinsic");
-
- Type *EltTy = VecType->getElementType();
-
- IRBuilder<> Builder(CI->getContext());
- Instruction *InsertPt = CI;
- BasicBlock *IfBlock = CI->getParent();
- Builder.SetInsertPoint(InsertPt);
- Builder.SetCurrentDebugLocation(CI->getDebugLoc());
-
- // Short-cut if the mask is all-true.
- bool IsAllOnesMask = isa<Constant>(Mask) &&
- cast<Constant>(Mask)->isAllOnesValue();
-
- if (IsAllOnesMask) {
- Builder.CreateAlignedStore(Src, Ptr, AlignVal);
- CI->eraseFromParent();
- return;
- }
-
- // Adjust alignment for the scalar instruction.
- AlignVal = std::max(AlignVal, VecType->getScalarSizeInBits()/8);
- // Bitcast %addr from i8* to EltTy*
- Type *NewPtrType =
- EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
- Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
- unsigned VectorWidth = VecType->getNumElements();
-
- if (isa<ConstantVector>(Mask)) {
- for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
- if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
- continue;
- Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx));
- Value *Gep =
- Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
- Builder.CreateAlignedStore(OneElt, Gep, AlignVal);
- }
- CI->eraseFromParent();
- return;
- }
-
- for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
-
- // Fill the "else" block, created in the previous iteration
- //
- // %mask_1 = extractelement <16 x i1> %mask, i32 Idx
- // %to_store = icmp eq i1 %mask_1, true
- // br i1 %to_store, label %cond.store, label %else
- //
- Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
- Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
- ConstantInt::get(Predicate->getType(), 1));
-
- // Create "cond" block
- //
- // %OneElt = extractelement <16 x i32> %Src, i32 Idx
- // %EltAddr = getelementptr i32* %1, i32 0
- // store i32 %OneElt, i32* %EltAddr
- //
- BasicBlock *CondBlock =
- IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.store");
- Builder.SetInsertPoint(InsertPt);
-
- Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx));
- Value *Gep =
- Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
- Builder.CreateAlignedStore(OneElt, Gep, AlignVal);
-
- // Create "else" block, fill it in the next iteration
- BasicBlock *NewIfBlock =
- CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
- Builder.SetInsertPoint(InsertPt);
- Instruction *OldBr = IfBlock->getTerminator();
- BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
- OldBr->eraseFromParent();
- IfBlock = NewIfBlock;
- }
- CI->eraseFromParent();
-}
-
-// Translate a masked gather intrinsic like
-// <16 x i32 > @llvm.masked.gather.v16i32( <16 x i32*> %Ptrs, i32 4,
-// <16 x i1> %Mask, <16 x i32> %Src)
-// to a chain of basic blocks, loading each element individually if
-// the appropriate mask bit is set
-//
-// % Ptrs = getelementptr i32, i32* %base, <16 x i64> %ind
-// % Mask0 = extractelement <16 x i1> %Mask, i32 0
-// % ToLoad0 = icmp eq i1 % Mask0, true
-// br i1 % ToLoad0, label %cond.load, label %else
-//
-// cond.load:
-// % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0
-// % Load0 = load i32, i32* % Ptr0, align 4
-// % Res0 = insertelement <16 x i32> undef, i32 % Load0, i32 0
-// br label %else
-//
-// else:
-// %res.phi.else = phi <16 x i32>[% Res0, %cond.load], [undef, % 0]
-// % Mask1 = extractelement <16 x i1> %Mask, i32 1
-// % ToLoad1 = icmp eq i1 % Mask1, true
-// br i1 % ToLoad1, label %cond.load1, label %else2
-//
-// cond.load1:
-// % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
-// % Load1 = load i32, i32* % Ptr1, align 4
-// % Res1 = insertelement <16 x i32> %res.phi.else, i32 % Load1, i32 1
-// br label %else2
-// . . .
-// % Result = select <16 x i1> %Mask, <16 x i32> %res.phi.select, <16 x i32> %Src
-// ret <16 x i32> %Result
-static void scalarizeMaskedGather(CallInst *CI) {
- Value *Ptrs = CI->getArgOperand(0);
- Value *Alignment = CI->getArgOperand(1);
- Value *Mask = CI->getArgOperand(2);
- Value *Src0 = CI->getArgOperand(3);
-
- VectorType *VecType = dyn_cast<VectorType>(CI->getType());
-
- assert(VecType && "Unexpected return type of masked gather intrinsic");
-
- IRBuilder<> Builder(CI->getContext());
- Instruction *InsertPt = CI;
- BasicBlock *IfBlock = CI->getParent();
- BasicBlock *CondBlock = nullptr;
- BasicBlock *PrevIfBlock = CI->getParent();
- Builder.SetInsertPoint(InsertPt);
- unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
-
- Builder.SetCurrentDebugLocation(CI->getDebugLoc());
-
- Value *UndefVal = UndefValue::get(VecType);
-
- // The result vector
- Value *VResult = UndefVal;
- unsigned VectorWidth = VecType->getNumElements();
-
- // Take a short-cut if the mask is a vector of constants.
- bool IsConstMask = isa<ConstantVector>(Mask);
-
- if (IsConstMask) {
- for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
- if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
- continue;
- Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
- "Ptr" + Twine(Idx));
- LoadInst *Load = Builder.CreateAlignedLoad(Ptr, AlignVal,
- "Load" + Twine(Idx));
- VResult = Builder.CreateInsertElement(VResult, Load,
- Builder.getInt32(Idx),
- "Res" + Twine(Idx));
- }
- Value *NewI = Builder.CreateSelect(Mask, VResult, Src0);
- CI->replaceAllUsesWith(NewI);
- CI->eraseFromParent();
- return;
- }
-
- PHINode *Phi = nullptr;
- Value *PrevPhi = UndefVal;
-
- for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
-
- // Fill the "else" block, created in the previous iteration
- //
- // %Mask1 = extractelement <16 x i1> %Mask, i32 1
- // %ToLoad1 = icmp eq i1 %Mask1, true
- // br i1 %ToLoad1, label %cond.load, label %else
- //
- if (Idx > 0) {
- Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
- Phi->addIncoming(VResult, CondBlock);
- Phi->addIncoming(PrevPhi, PrevIfBlock);
- PrevPhi = Phi;
- VResult = Phi;
- }
-
- Value *Predicate = Builder.CreateExtractElement(Mask,
- Builder.getInt32(Idx),
- "Mask" + Twine(Idx));
- Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
- ConstantInt::get(Predicate->getType(), 1),
- "ToLoad" + Twine(Idx));
-
- // Create "cond" block
- //
- // %EltAddr = getelementptr i32* %1, i32 0
- // %Elt = load i32* %EltAddr
- // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
- //
- CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load");
- Builder.SetInsertPoint(InsertPt);
-
- Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
- "Ptr" + Twine(Idx));
- LoadInst *Load = Builder.CreateAlignedLoad(Ptr, AlignVal,
- "Load" + Twine(Idx));
- VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx),
- "Res" + Twine(Idx));
-
- // Create "else" block, fill it in the next iteration
- BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
- Builder.SetInsertPoint(InsertPt);
- Instruction *OldBr = IfBlock->getTerminator();
- BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
- OldBr->eraseFromParent();
- PrevIfBlock = IfBlock;
- IfBlock = NewIfBlock;
- }
-
- Phi = Builder.CreatePHI(VecType, 2, "res.phi.select");
- Phi->addIncoming(VResult, CondBlock);
- Phi->addIncoming(PrevPhi, PrevIfBlock);
- Value *NewI = Builder.CreateSelect(Mask, Phi, Src0);
- CI->replaceAllUsesWith(NewI);
- CI->eraseFromParent();
-}
-
-// Translate a masked scatter intrinsic, like
-// void @llvm.masked.scatter.v16i32(<16 x i32> %Src, <16 x i32*>* %Ptrs, i32 4,
-// <16 x i1> %Mask)
-// to a chain of basic blocks that store each element individually if
-// the appropriate mask bit is set.
-//
-// % Ptrs = getelementptr i32, i32* %ptr, <16 x i64> %ind
-// % Mask0 = extractelement <16 x i1> % Mask, i32 0
-// % ToStore0 = icmp eq i1 % Mask0, true
-// br i1 %ToStore0, label %cond.store, label %else
-//
-// cond.store:
-// % Elt0 = extractelement <16 x i32> %Src, i32 0
-// % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0
-// store i32 %Elt0, i32* % Ptr0, align 4
-// br label %else
-//
-// else:
-// % Mask1 = extractelement <16 x i1> % Mask, i32 1
-// % ToStore1 = icmp eq i1 % Mask1, true
-// br i1 % ToStore1, label %cond.store1, label %else2
-//
-// cond.store1:
-// % Elt1 = extractelement <16 x i32> %Src, i32 1
-// % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
-// store i32 % Elt1, i32* % Ptr1, align 4
-// br label %else2
-// . . .
-static void scalarizeMaskedScatter(CallInst *CI) {
- Value *Src = CI->getArgOperand(0);
- Value *Ptrs = CI->getArgOperand(1);
- Value *Alignment = CI->getArgOperand(2);
- Value *Mask = CI->getArgOperand(3);
-
- assert(isa<VectorType>(Src->getType()) &&
- "Unexpected data type in masked scatter intrinsic");
- assert(isa<VectorType>(Ptrs->getType()) &&
- isa<PointerType>(Ptrs->getType()->getVectorElementType()) &&
- "Vector of pointers is expected in masked scatter intrinsic");
-
- IRBuilder<> Builder(CI->getContext());
- Instruction *InsertPt = CI;
- BasicBlock *IfBlock = CI->getParent();
- Builder.SetInsertPoint(InsertPt);
- Builder.SetCurrentDebugLocation(CI->getDebugLoc());
-
- unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
- unsigned VectorWidth = Src->getType()->getVectorNumElements();
-
- // Take a short-cut if the mask is a vector of constants.
- bool IsConstMask = isa<ConstantVector>(Mask);
-
- if (IsConstMask) {
- for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
- if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
- continue;
- Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx),
- "Elt" + Twine(Idx));
- Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
- "Ptr" + Twine(Idx));
- Builder.CreateAlignedStore(OneElt, Ptr, AlignVal);
- }
- CI->eraseFromParent();
- return;
- }
- for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
- // Fill the "else" block, created in the previous iteration
- //
- // % Mask1 = extractelement <16 x i1> % Mask, i32 Idx
- // % ToStore = icmp eq i1 % Mask1, true
- // br i1 % ToStore, label %cond.store, label %else
- //
- Value *Predicate = Builder.CreateExtractElement(Mask,
- Builder.getInt32(Idx),
- "Mask" + Twine(Idx));
- Value *Cmp =
- Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
- ConstantInt::get(Predicate->getType(), 1),
- "ToStore" + Twine(Idx));
-
- // Create "cond" block
- //
- // % Elt1 = extractelement <16 x i32> %Src, i32 1
- // % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
- // store i32 %Elt1, i32* %Ptr1
- //
- BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
- Builder.SetInsertPoint(InsertPt);
-
- Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx),
- "Elt" + Twine(Idx));
- Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
- "Ptr" + Twine(Idx));
- Builder.CreateAlignedStore(OneElt, Ptr, AlignVal);
-
- // Create "else" block, fill it in the next iteration
- BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
- Builder.SetInsertPoint(InsertPt);
- Instruction *OldBr = IfBlock->getTerminator();
- BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
- OldBr->eraseFromParent();
- IfBlock = NewIfBlock;
- }
- CI->eraseFromParent();
-}
-
/// If counting leading or trailing zeros is an expensive operation and a zero
/// input is defined, add a check for zero to avoid calling the intrinsic.
///
@@ -2242,39 +1729,6 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
}
return true;
}
- case Intrinsic::masked_load: {
- // Scalarize unsupported vector masked load
- if (!TTI->isLegalMaskedLoad(CI->getType())) {
- scalarizeMaskedLoad(CI);
- ModifiedDT = true;
- return true;
- }
- return false;
- }
- case Intrinsic::masked_store: {
- if (!TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType())) {
- scalarizeMaskedStore(CI);
- ModifiedDT = true;
- return true;
- }
- return false;
- }
- case Intrinsic::masked_gather: {
- if (!TTI->isLegalMaskedGather(CI->getType())) {
- scalarizeMaskedGather(CI);
- ModifiedDT = true;
- return true;
- }
- return false;
- }
- case Intrinsic::masked_scatter: {
- if (!TTI->isLegalMaskedScatter(CI->getArgOperand(0)->getType())) {
- scalarizeMaskedScatter(CI);
- ModifiedDT = true;
- return true;
- }
- return false;
- }
case Intrinsic::aarch64_stlxr:
case Intrinsic::aarch64_stxr: {
ZExtInst *ExtVal = dyn_cast<ZExtInst>(CI->getArgOperand(0));
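The masked load/store/gather/scatter scalarization deleted above is not lost: per the diffstat it moves into the new standalone ScalarizeMaskedMemIntrin pass, registered in the CodeGen.cpp hunk earlier. A sketch of scheduling that pass by hand in a legacy pipeline follows; the factory spelling is inferred from the initializeScalarizeMaskedMemIntrinPass registration and should be treated as an assumption.

    #include "llvm/CodeGen/Passes.h"
    #include "llvm/IR/LegacyPassManager.h"
    using namespace llvm;

    // Assumed factory name, matching the pass registration above. The pass
    // now performs the scalarization that CodeGenPrepare used to do inline.
    static void addScalarizePass(legacy::PassManagerBase &PM) {
      PM.add(createScalarizeMaskedMemIntrinPass());
    }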
diff --git a/lib/CodeGen/ExpandPostRAPseudos.cpp b/lib/CodeGen/ExpandPostRAPseudos.cpp
index ab2382e2db6d..e860906043dd 100644
--- a/lib/CodeGen/ExpandPostRAPseudos.cpp
+++ b/lib/CodeGen/ExpandPostRAPseudos.cpp
@@ -142,8 +142,9 @@ bool ExpandPostRA::LowerCopy(MachineInstr *MI) {
MachineOperand &DstMO = MI->getOperand(0);
MachineOperand &SrcMO = MI->getOperand(1);
- if (SrcMO.getReg() == DstMO.getReg()) {
- DEBUG(dbgs() << "identity copy: " << *MI);
+ bool IdentityCopy = (SrcMO.getReg() == DstMO.getReg());
+ if (IdentityCopy || SrcMO.isUndef()) {
+ DEBUG(dbgs() << (IdentityCopy ? "identity copy: " : "undef copy: ") << *MI);
// No need to insert an identity copy instruction, but replace with a KILL
// if liveness is changed.
if (SrcMO.isUndef() || MI->getNumOperands() > 2) {
diff --git a/lib/CodeGen/ExpandReductions.cpp b/lib/CodeGen/ExpandReductions.cpp
new file mode 100644
index 000000000000..a40ea28056dd
--- /dev/null
+++ b/lib/CodeGen/ExpandReductions.cpp
@@ -0,0 +1,167 @@
+//===--- ExpandReductions.cpp - Expand experimental reduction intrinsics --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements IR expansion for reduction intrinsics, allowing targets
+// to enable the experimental intrinsics until just before codegen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/ExpandReductions.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+namespace {
+
+unsigned getOpcode(Intrinsic::ID ID) {
+ switch (ID) {
+ case Intrinsic::experimental_vector_reduce_fadd:
+ return Instruction::FAdd;
+ case Intrinsic::experimental_vector_reduce_fmul:
+ return Instruction::FMul;
+ case Intrinsic::experimental_vector_reduce_add:
+ return Instruction::Add;
+ case Intrinsic::experimental_vector_reduce_mul:
+ return Instruction::Mul;
+ case Intrinsic::experimental_vector_reduce_and:
+ return Instruction::And;
+ case Intrinsic::experimental_vector_reduce_or:
+ return Instruction::Or;
+ case Intrinsic::experimental_vector_reduce_xor:
+ return Instruction::Xor;
+ case Intrinsic::experimental_vector_reduce_smax:
+ case Intrinsic::experimental_vector_reduce_smin:
+ case Intrinsic::experimental_vector_reduce_umax:
+ case Intrinsic::experimental_vector_reduce_umin:
+ return Instruction::ICmp;
+ case Intrinsic::experimental_vector_reduce_fmax:
+ case Intrinsic::experimental_vector_reduce_fmin:
+ return Instruction::FCmp;
+ default:
+ llvm_unreachable("Unexpected ID");
+ }
+}
+
+RecurrenceDescriptor::MinMaxRecurrenceKind getMRK(Intrinsic::ID ID) {
+ switch (ID) {
+ case Intrinsic::experimental_vector_reduce_smax:
+ return RecurrenceDescriptor::MRK_SIntMax;
+ case Intrinsic::experimental_vector_reduce_smin:
+ return RecurrenceDescriptor::MRK_SIntMin;
+ case Intrinsic::experimental_vector_reduce_umax:
+ return RecurrenceDescriptor::MRK_UIntMax;
+ case Intrinsic::experimental_vector_reduce_umin:
+ return RecurrenceDescriptor::MRK_UIntMin;
+ case Intrinsic::experimental_vector_reduce_fmax:
+ return RecurrenceDescriptor::MRK_FloatMax;
+ case Intrinsic::experimental_vector_reduce_fmin:
+ return RecurrenceDescriptor::MRK_FloatMin;
+ default:
+ return RecurrenceDescriptor::MRK_Invalid;
+ }
+}
+
+bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
+ bool Changed = false;
+ SmallVector<IntrinsicInst*, 4> Worklist;
+ for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)
+ if (auto II = dyn_cast<IntrinsicInst>(&*I))
+ Worklist.push_back(II);
+
+ for (auto *II : Worklist) {
+ IRBuilder<> Builder(II);
+ Value *Vec = nullptr;
+ auto ID = II->getIntrinsicID();
+ auto MRK = RecurrenceDescriptor::MRK_Invalid;
+ switch (ID) {
+ case Intrinsic::experimental_vector_reduce_fadd:
+ case Intrinsic::experimental_vector_reduce_fmul:
+ // FMFs must be attached to the call; otherwise it is an ordered reduction
+ // and it can't be handled by generating this shuffle sequence.
+ // TODO: Implement scalarization of ordered reductions here for targets
+ // without native support.
+ if (!II->getFastMathFlags().unsafeAlgebra())
+ continue;
+ Vec = II->getArgOperand(1);
+ break;
+ case Intrinsic::experimental_vector_reduce_add:
+ case Intrinsic::experimental_vector_reduce_mul:
+ case Intrinsic::experimental_vector_reduce_and:
+ case Intrinsic::experimental_vector_reduce_or:
+ case Intrinsic::experimental_vector_reduce_xor:
+ case Intrinsic::experimental_vector_reduce_smax:
+ case Intrinsic::experimental_vector_reduce_smin:
+ case Intrinsic::experimental_vector_reduce_umax:
+ case Intrinsic::experimental_vector_reduce_umin:
+ case Intrinsic::experimental_vector_reduce_fmax:
+ case Intrinsic::experimental_vector_reduce_fmin:
+ Vec = II->getArgOperand(0);
+ MRK = getMRK(ID);
+ break;
+ default:
+ continue;
+ }
+ if (!TTI->shouldExpandReduction(II))
+ continue;
+ auto Rdx = getShuffleReduction(Builder, Vec, getOpcode(ID), MRK);
+ II->replaceAllUsesWith(Rdx);
+ II->eraseFromParent();
+ Changed = true;
+ }
+ return Changed;
+}
+
+class ExpandReductions : public FunctionPass {
+public:
+ static char ID;
+ ExpandReductions() : FunctionPass(ID) {
+ initializeExpandReductionsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ const auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ return expandReductions(F, TTI);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.setPreservesCFG();
+ }
+};
+}
+
+char ExpandReductions::ID;
+INITIALIZE_PASS_BEGIN(ExpandReductions, "expand-reductions",
+ "Expand reduction intrinsics", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(ExpandReductions, "expand-reductions",
+ "Expand reduction intrinsics", false, false)
+
+FunctionPass *llvm::createExpandReductionsPass() {
+ return new ExpandReductions();
+}
+
+PreservedAnalyses ExpandReductionsPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ const auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ if (!expandReductions(F, &TTI))
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
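The expansion itself is delegated to getShuffleReduction from Transforms/Utils/LoopUtils (included above): for a power-of-two vector it boils down to log2(N) halving shuffles followed by an extract of lane 0. The sketch below restates that idea for plain binary opcodes, ignoring the min/max recurrence kinds selected via getMRK; it is an illustration, not the LoopUtils implementation.

    #include <cassert>
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/IRBuilder.h"
    #include "llvm/Support/MathExtras.h"
    using namespace llvm;

    static Value *shuffleReduceSketch(IRBuilder<> &B, Value *Vec,
                                      Instruction::BinaryOps Opc) {
      unsigned N = Vec->getType()->getVectorNumElements();
      assert(isPowerOf2_32(N) && "sketch assumes a power-of-two width");
      for (unsigned Width = N / 2; Width >= 1; Width /= 2) {
        // Move the upper Width lanes down; the remaining lanes are
        // don't-cares, so any in-range index will do.
        SmallVector<uint32_t, 16> Mask(N, 0);
        for (unsigned I = 0; I != Width; ++I)
          Mask[I] = Width + I;
        Value *Hi = B.CreateShuffleVector(
            Vec, UndefValue::get(Vec->getType()), Mask, "rdx.shuf");
        Vec = B.CreateBinOp(Opc, Vec, Hi, "rdx");
      }
      // After log2(N) halvings, lane 0 holds the fully reduced value.
      return B.CreateExtractElement(Vec, B.getInt32(0));
    }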
diff --git a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
index eaf4056e47ea..4d4591042296 100644
--- a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
+++ b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
@@ -162,7 +162,7 @@ bool LegalizerInfo::isLegal(const MachineInstr &MI,
return std::get<0>(getAction(MI, MRI)) == Legal;
}
-LLT LegalizerInfo::findLegalType(const InstrAspect &Aspect,
+Optional<LLT> LegalizerInfo::findLegalType(const InstrAspect &Aspect,
LegalizeAction Action) const {
switch(Action) {
default:
@@ -174,20 +174,20 @@ LLT LegalizerInfo::findLegalType(const InstrAspect &Aspect,
return Aspect.Type;
case NarrowScalar: {
return findLegalType(Aspect,
- [&](LLT Ty) -> LLT { return Ty.halfScalarSize(); });
+ [](LLT Ty) -> LLT { return Ty.halfScalarSize(); });
}
case WidenScalar: {
- return findLegalType(Aspect, [&](LLT Ty) -> LLT {
+ return findLegalType(Aspect, [](LLT Ty) -> LLT {
return Ty.getSizeInBits() < 8 ? LLT::scalar(8) : Ty.doubleScalarSize();
});
}
case FewerElements: {
return findLegalType(Aspect,
- [&](LLT Ty) -> LLT { return Ty.halfElements(); });
+ [](LLT Ty) -> LLT { return Ty.halfElements(); });
}
case MoreElements: {
return findLegalType(Aspect,
- [&](LLT Ty) -> LLT { return Ty.doubleElements(); });
+ [](LLT Ty) -> LLT { return Ty.doubleElements(); });
}
}
}
diff --git a/lib/CodeGen/GlobalISel/RegBankSelect.cpp b/lib/CodeGen/GlobalISel/RegBankSelect.cpp
index 7248f50945d0..2eb3cdee694d 100644
--- a/lib/CodeGen/GlobalISel/RegBankSelect.cpp
+++ b/lib/CodeGen/GlobalISel/RegBankSelect.cpp
@@ -204,12 +204,8 @@ uint64_t RegBankSelect::getRepairCost(
// TODO: use a dedicated constant for ImpossibleCost.
if (Cost != UINT_MAX)
return Cost;
- assert(!TPC->isGlobalISelAbortEnabled() &&
- "Legalization not available yet");
// Return the legalization cost of that repairing.
}
- assert(!TPC->isGlobalISelAbortEnabled() &&
- "Complex repairing not implemented yet");
return UINT_MAX;
}
@@ -452,6 +448,11 @@ RegBankSelect::MappingCost RegBankSelect::computeMapping(
// Sums up the repairing cost of MO at each insertion point.
uint64_t RepairCost = getRepairCost(MO, ValMapping);
+
+ // This is an impossible-to-repair cost.
+ if (RepairCost == UINT_MAX)
+ continue;
+
// Bias used for splitting: 5%.
const uint64_t PercentageForBias = 5;
uint64_t Bias = (RepairCost * PercentageForBias + 99) / 100;
diff --git a/lib/CodeGen/GlobalISel/Utils.cpp b/lib/CodeGen/GlobalISel/Utils.cpp
index 3c93f8123b0d..254bdf10d804 100644
--- a/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/lib/CodeGen/GlobalISel/Utils.cpp
@@ -110,3 +110,11 @@ Optional<int64_t> llvm::getConstantVRegVal(unsigned VReg,
return None;
}
+
+const llvm::ConstantFP* llvm::getConstantFPVRegVal(unsigned VReg,
+ const MachineRegisterInfo &MRI) {
+ MachineInstr *MI = MRI.getVRegDef(VReg);
+ if (TargetOpcode::G_FCONSTANT != MI->getOpcode())
+ return nullptr;
+ return MI->getOperand(1).getFPImm();
+}
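A short usage sketch for the new helper; the wrapper name and the zero-folding use case are hypothetical, only the getConstantFPVRegVal call is from this diff.

    #include "llvm/CodeGen/GlobalISel/Utils.h"
    #include "llvm/CodeGen/MachineRegisterInfo.h"
    #include "llvm/IR/Constants.h"
    using namespace llvm;

    // Returns true when the vreg is defined by a G_FCONSTANT of zero (of
    // either sign), e.g. to let a selector fold fadd-with-zero patterns.
    static bool isFPZeroVReg(unsigned VReg, const MachineRegisterInfo &MRI) {
      if (const ConstantFP *CFP = getConstantFPVRegVal(VReg, MRI))
        return CFP->isZero();
      return false;
    }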
diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp
index 37fe41582333..628d599a3cc7 100644
--- a/lib/CodeGen/IfConversion.cpp
+++ b/lib/CodeGen/IfConversion.cpp
@@ -1318,7 +1318,8 @@ static bool canFallThroughTo(MachineBasicBlock &MBB, MachineBasicBlock &ToMBB) {
return false;
PI = I++;
}
- return true;
+ // Finally, check whether the last I is indeed a successor of PI.
+ return PI->isSuccessor(&*I);
}
/// Invalidate predecessor BB info so it would be re-analyzed to determine if it
@@ -1587,22 +1588,32 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) {
BBCvt = MBPI->getEdgeProbability(BBI.BB, &CvtMBB);
}
+ // To be able to insert code freely at the end of BBI we sometimes remove
+ // the branch from BBI to NextMBB temporarily. Remember if this happened.
+ bool RemovedBranchToNextMBB = false;
if (CvtMBB.pred_size() > 1) {
BBI.NonPredSize -= TII->removeBranch(*BBI.BB);
// Copy instructions in the true block, predicate them, and add them to
// the entry block.
CopyAndPredicateBlock(BBI, *CvtBBI, Cond, true);
- // RemoveExtraEdges won't work if the block has an unanalyzable branch, so
- // explicitly remove CvtBBI as a successor.
+ // Keep the CFG updated.
BBI.BB->removeSuccessor(&CvtMBB, true);
} else {
// Predicate the 'true' block after removing its branch.
CvtBBI->NonPredSize -= TII->removeBranch(CvtMBB);
PredicateBlock(*CvtBBI, CvtMBB.end(), Cond);
- // Now merge the entry of the triangle with the true block.
+ // Remove the branch from the entry of the triangle to NextMBB to be able to
+ // do the merge below. Keep the CFG updated, but remember we removed the
+ // branch since we do want to execute NextMBB, either by introducing a
+ // branch to it again, or merging it into the entry block.
+ // How it's handled is decided further down.
BBI.NonPredSize -= TII->removeBranch(*BBI.BB);
+ BBI.BB->removeSuccessor(&NextMBB, true);
+ RemovedBranchToNextMBB = true;
+
+ // Now merge the entry of the triangle with the true block.
MergeBlocks(BBI, *CvtBBI, false);
}
@@ -1640,12 +1651,19 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) {
// block. By not merging them, we make it possible to iteratively
// ifcvt the blocks.
if (!HasEarlyExit &&
- NextMBB.pred_size() == 1 && !NextBBI->HasFallThrough &&
+ // We might have removed BBI from NextMBB's predecessor list above but
+ // we want it to be there, so consider that too.
+ (NextMBB.pred_size() == (RemovedBranchToNextMBB ? 0 : 1)) &&
+ !NextBBI->HasFallThrough &&
!NextMBB.hasAddressTaken()) {
+ // We will merge NextBBI into BBI, and thus remove the current
+ // fallthrough from BBI into CvtBBI.
+ BBI.BB->removeSuccessor(&CvtMBB, true);
MergeBlocks(BBI, *NextBBI);
FalseBBDead = true;
} else {
InsertUncondBranch(*BBI.BB, NextMBB, TII);
+ BBI.BB->addSuccessor(&NextMBB);
BBI.HasFallThrough = false;
}
// Mixed predicated and unpredicated code. This cannot be iteratively
@@ -1653,8 +1671,6 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) {
IterIfcvt = false;
}
- RemoveExtraEdges(BBI);
-
// Update block info. BB can be iteratively if-converted.
if (!IterIfcvt)
BBI.IsDone = true;
diff --git a/lib/CodeGen/LiveRangeShrink.cpp b/lib/CodeGen/LiveRangeShrink.cpp
new file mode 100644
index 000000000000..00182e2c779f
--- /dev/null
+++ b/lib/CodeGen/LiveRangeShrink.cpp
@@ -0,0 +1,211 @@
+//===-- LiveRangeShrink.cpp - Move instructions to shrink live range ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This pass moves an instruction close to the definitions of its operands to
+/// shrink the live ranges of those defs. The code motion is limited to within
+/// a basic block. A moved instruction must have a single def and more than one
+/// used register, each of which is the only use of its defining instruction.
+///
+//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "lrshrink"
+
+STATISTIC(NumInstrsHoistedToShrinkLiveRange,
+ "Number of instructions hoisted to shrink live ranges.");
+
+using namespace llvm;
+
+namespace {
+class LiveRangeShrink : public MachineFunctionPass {
+public:
+ static char ID;
+
+ LiveRangeShrink() : MachineFunctionPass(ID) {
+ initializeLiveRangeShrinkPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ StringRef getPassName() const override { return "Live Range Shrink"; }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // End anonymous namespace.
+
+char LiveRangeShrink::ID = 0;
+char &llvm::LiveRangeShrinkID = LiveRangeShrink::ID;
+
+INITIALIZE_PASS(LiveRangeShrink, "lrshrink", "Live Range Shrink Pass", false,
+ false)
+namespace {
+typedef DenseMap<MachineInstr *, unsigned> InstOrderMap;
+
+/// Returns \p New if it is dominated by \p Old, otherwise returns \p Old.
+/// \p M maps each instruction to its dominating order number, such that
+/// M[A] > M[B] guarantees that A is dominated by B.
+/// If \p New is not in \p M, returns \p Old. Otherwise, if \p Old is null,
+/// returns \p New.
+MachineInstr *FindDominatedInstruction(MachineInstr &New, MachineInstr *Old,
+ const InstOrderMap &M) {
+ auto NewIter = M.find(&New);
+ if (NewIter == M.end())
+ return Old;
+ if (Old == nullptr)
+ return &New;
+ unsigned OrderOld = M.find(Old)->second;
+ unsigned OrderNew = NewIter->second;
+ if (OrderOld != OrderNew)
+ return OrderOld < OrderNew ? &New : Old;
+ // OrderOld == OrderNew: iterate down from Old to see whether it can reach
+ // New; if it can, New is dominated by Old.
+ for (MachineInstr *I = Old->getNextNode(); M.find(I)->second == OrderNew;
+ I = I->getNextNode())
+ if (I == &New)
+ return &New;
+ return Old;
+}
+
+/// Builds the map \p M from each instruction to its dominating order number,
+/// traversing forward from instruction \p Start.
+void BuildInstOrderMap(MachineBasicBlock::iterator Start, InstOrderMap &M) {
+ M.clear();
+ unsigned i = 0;
+ for (MachineInstr &I : make_range(Start, Start->getParent()->end()))
+ M[&I] = i++;
+}
+} // end anonymous namespace
+
+bool LiveRangeShrink::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ DEBUG(dbgs() << "**** Analysing " << MF.getName() << '\n');
+
+ InstOrderMap IOM;
+ // Map from a register to the instruction order (the value in IOM) at which
+ // the register is last used. When moving an instruction up, we must make
+ // sure that none of its defs (including dead defs) is moved above that
+ // def's last use.
+ DenseMap<unsigned, unsigned> UseMap;
+
+ for (MachineBasicBlock &MBB : MF) {
+ if (MBB.empty())
+ continue;
+ bool SawStore = false;
+ BuildInstOrderMap(MBB.begin(), IOM);
+ UseMap.clear();
+
+ for (MachineBasicBlock::iterator Next = MBB.begin(); Next != MBB.end();) {
+ MachineInstr &MI = *Next;
+ ++Next;
+ if (MI.isPHI() || MI.isDebugValue())
+ continue;
+ if (MI.mayStore())
+ SawStore = true;
+
+ unsigned CurrentOrder = IOM[&MI];
+ unsigned Barrier = 0;
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg() || MO.isDebug())
+ continue;
+ if (MO.isUse())
+ UseMap[MO.getReg()] = CurrentOrder;
+ else if (MO.isDead() && UseMap.count(MO.getReg()))
+ // Barrier is the last instruction where MO gets used. MI should not
+ // be moved above Barrier.
+ Barrier = std::max(Barrier, UseMap[MO.getReg()]);
+ }
+
+ if (!MI.isSafeToMove(nullptr, SawStore)) {
+ // If MI has side effects, it should become a barrier for code motion.
+ // IOM is rebuilt from the next instruction to prevent later
+ // instructions from being moved before this MI.
+ if (MI.hasUnmodeledSideEffects() && Next != MBB.end()) {
+ BuildInstOrderMap(Next, IOM);
+ SawStore = false;
+ }
+ continue;
+ }
+
+ const MachineOperand *DefMO = nullptr;
+ MachineInstr *Insert = nullptr;
+
+ // Number of live-ranges that will be shortened. We do not count
+ // live-ranges that are defined by a COPY as it could be coalesced later.
+ unsigned NumEligibleUse = 0;
+
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg() || MO.isDead() || MO.isDebug())
+ continue;
+ unsigned Reg = MO.getReg();
+ // Do not move the instruction if it def/uses a physical register,
+ // unless it is a constant physical register.
+ if (TargetRegisterInfo::isPhysicalRegister(Reg) &&
+ !MRI.isConstantPhysReg(Reg)) {
+ Insert = nullptr;
+ break;
+ }
+ if (MO.isDef()) {
+ // Do not move if there is more than one def.
+ if (DefMO) {
+ Insert = nullptr;
+ break;
+ }
+ DefMO = &MO;
+ } else if (MRI.hasOneNonDBGUse(Reg) && MRI.hasOneDef(Reg)) {
+ MachineInstr &DefInstr = *MRI.def_instr_begin(Reg);
+ if (!DefInstr.isCopy())
+ NumEligibleUse++;
+ Insert = FindDominatedInstruction(DefInstr, Insert, IOM);
+ } else {
+ Insert = nullptr;
+ break;
+ }
+ }
+ // Move the instruction only when more than one live range will be shrunk.
+ if (DefMO && Insert && NumEligibleUse > 1 && Barrier <= IOM[Insert]) {
+ MachineBasicBlock::iterator I = std::next(Insert->getIterator());
+ // Skip all the PHI and debug instructions.
+ while (I != MBB.end() && (I->isPHI() || I->isDebugValue()))
+ I = std::next(I);
+ if (I == MI.getIterator())
+ continue;
+
+ // Update the dominator order to be the same as the insertion point.
+ // We do this to maintain a non-decreasing order without needing to update
+ // all instruction orders after the insertion point.
+ unsigned NewOrder = IOM[&*I];
+ IOM[&MI] = NewOrder;
+ NumInstrsHoistedToShrinkLiveRange++;
+
+ // Find MI's debug value following MI.
+ MachineBasicBlock::iterator EndIter = std::next(MI.getIterator());
+ if (MI.getOperand(0).isReg())
+ for (; EndIter != MBB.end() && EndIter->isDebugValue() &&
+ EndIter->getOperand(0).isReg() &&
+ EndIter->getOperand(0).getReg() == MI.getOperand(0).getReg();
+ ++EndIter, ++Next)
+ IOM[&*EndIter] = NewOrder;
+ MBB.splice(I, &MBB, MI.getIterator(), EndIter);
+ }
+ }
+ }
+ return false;
+}
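The hoisting guard near the end of runOnMachineFunction combines three conditions: a single def, more than one eligible single-use operand, and a barrier no later than the insertion point's order number. A toy restatement with plain integers standing in for order numbers, entirely illustrative:

    #include <cassert>

    // Mirrors: DefMO && Insert && NumEligibleUse > 1 && Barrier <= IOM[Insert]
    static bool shouldHoist(unsigned InsertPointOrder, unsigned BarrierOrder,
                            unsigned NumEligibleUse) {
      return NumEligibleUse > 1 && BarrierOrder <= InsertPointOrder;
    }

    int main() {
      // Two single-use operands whose later def sits at order 5; no barrier.
      assert(shouldHoist(/*InsertPointOrder=*/5, /*BarrierOrder=*/0, 2));
      // A dead def last used at order 7 forbids hoisting up to order 5.
      assert(!shouldHoist(5, 7, 2));
      return 0;
    }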
diff --git a/lib/CodeGen/LiveVariables.cpp b/lib/CodeGen/LiveVariables.cpp
index 3568b0294ad9..a9aec926115a 100644
--- a/lib/CodeGen/LiveVariables.cpp
+++ b/lib/CodeGen/LiveVariables.cpp
@@ -767,7 +767,7 @@ void LiveVariables::addNewBlock(MachineBasicBlock *BB,
MachineBasicBlock *SuccBB) {
const unsigned NumNew = BB->getNumber();
- SmallSet<unsigned, 16> Defs, Kills;
+ DenseSet<unsigned> Defs, Kills;
MachineBasicBlock::iterator BBI = SuccBB->begin(), BBE = SuccBB->end();
for (; BBI != BBE && BBI->isPHI(); ++BBI) {
diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp
index 4cfc128a8c1d..5003115a770f 100644
--- a/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/lib/CodeGen/MachineBlockPlacement.cpp
@@ -133,6 +133,14 @@ static cl::opt<unsigned> TailDupPlacementThreshold(
"that won't conflict."), cl::init(2),
cl::Hidden);
+// Heuristic for aggressive tail duplication.
+static cl::opt<unsigned> TailDupPlacementAggressiveThreshold(
+ "tail-dup-placement-aggressive-threshold",
+ cl::desc("Instruction cutoff for aggressive tail duplication during "
+ "layout. Used at -O3. Tail merging during layout is forced to "
+ "have a threshold that won't conflict."), cl::init(3),
+ cl::Hidden);
+
// Heuristic for tail duplication.
static cl::opt<unsigned> TailDupPlacementPenalty(
"tail-dup-placement-penalty",
@@ -2646,9 +2654,26 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
assert(BlockToChain.empty());
assert(ComputedEdges.empty());
+ unsigned TailDupSize = TailDupPlacementThreshold;
+ // If only the aggressive threshold is explicitly set, use it.
+ if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 &&
+ TailDupPlacementThreshold.getNumOccurrences() == 0)
+ TailDupSize = TailDupPlacementAggressiveThreshold;
+
+ TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
+ // For aggressive optimization, we can adjust some thresholds to be less
+ // conservative.
+ if (PassConfig->getOptLevel() >= CodeGenOpt::Aggressive) {
+ // At O3 we should be more willing to copy blocks for tail duplication. This
+ // increases size pressure, so we only do it at O3.
+ // Do this unless only the regular threshold is explicitly set.
+ if (TailDupPlacementThreshold.getNumOccurrences() == 0 ||
+ TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0)
+ TailDupSize = TailDupPlacementAggressiveThreshold;
+ }
+
if (TailDupPlacement) {
MPDT = &getAnalysis<MachinePostDominatorTree>();
- unsigned TailDupSize = TailDupPlacementThreshold;
if (MF.getFunction()->optForSize())
TailDupSize = 1;
TailDup.initMF(MF, MBPI, /* LayoutMode */ true, TailDupSize);
@@ -2658,7 +2683,6 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
buildCFGChains();
// Changing the layout can create new tail merging opportunities.
- TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
// TailMerge can create jump into if branches that make CFG irreducible for
// HW that requires structured CFG.
bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
@@ -2666,7 +2690,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
BranchFoldPlacement;
// No tail merging opportunities if the block number is less than four.
if (MF.size() > 3 && EnableTailMerge) {
- unsigned TailMergeSize = TailDupPlacementThreshold + 1;
+ unsigned TailMergeSize = TailDupSize + 1;
BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI,
*MBPI, TailMergeSize);
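The interplay of the two thresholds and the optimization level reduces to a small decision: an explicitly set aggressive threshold beats a defaulted regular one, and O3 prefers the aggressive value unless only the regular threshold was set by hand. Restated as a straight-line helper mirroring the two hunks above; illustrative only, not a proposed refactor.

    // RegularSet/AggressiveSet correspond to getNumOccurrences() != 0 for
    // -tail-dup-placement-threshold and its aggressive counterpart.
    static unsigned pickTailDupSize(bool RegularSet, bool AggressiveSet,
                                    bool IsO3, unsigned Regular,
                                    unsigned Aggressive) {
      unsigned Size = Regular;
      if (AggressiveSet && !RegularSet)
        Size = Aggressive; // explicit aggressive flag, defaulted regular flag
      if (IsO3 && (!RegularSet || AggressiveSet))
        Size = Aggressive; // O3 defaults to the aggressive value
      return Size;
    }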
diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp
index bfb2cde030dc..ab433273b189 100644
--- a/lib/CodeGen/MachineVerifier.cpp
+++ b/lib/CodeGen/MachineVerifier.cpp
@@ -2063,12 +2063,12 @@ void MachineVerifier::verifyStackFrame() {
if (I.getOpcode() == FrameSetupOpcode) {
if (BBState.ExitIsSetup)
report("FrameSetup is after another FrameSetup", &I);
- BBState.ExitValue -= TII->getFrameSize(I);
+ BBState.ExitValue -= TII->getFrameTotalSize(I);
BBState.ExitIsSetup = true;
}
if (I.getOpcode() == FrameDestroyOpcode) {
- int Size = TII->getFrameSize(I);
+ int Size = TII->getFrameTotalSize(I);
if (!BBState.ExitIsSetup)
report("FrameDestroy is not after a FrameSetup", &I);
int AbsSPAdj = BBState.ExitValue < 0 ? -BBState.ExitValue :
diff --git a/lib/CodeGen/PHIElimination.cpp b/lib/CodeGen/PHIElimination.cpp
index c67a25b888bf..db2264b2439d 100644
--- a/lib/CodeGen/PHIElimination.cpp
+++ b/lib/CodeGen/PHIElimination.cpp
@@ -34,7 +34,7 @@
#include <algorithm>
using namespace llvm;
-#define DEBUG_TYPE "phielim"
+#define DEBUG_TYPE "phi-node-elimination"
static cl::opt<bool>
DisableEdgeSplitting("disable-phi-elim-edge-splitting", cl::init(false),
diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp
index bf44ee8453b6..1803ea2b9249 100644
--- a/lib/CodeGen/RegisterCoalescer.cpp
+++ b/lib/CodeGen/RegisterCoalescer.cpp
@@ -3214,7 +3214,7 @@ RegisterCoalescer::copyCoalesceInMBB(MachineBasicBlock *MBB) {
CurrList(WorkList.begin() + PrevSize, WorkList.end());
if (copyCoalesceWorkList(CurrList))
WorkList.erase(std::remove(WorkList.begin() + PrevSize, WorkList.end(),
- (MachineInstr*)nullptr), WorkList.end());
+ nullptr), WorkList.end());
}
void RegisterCoalescer::coalesceLocals() {
diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp
index 35db30f89976..0635e5c0a63c 100644
--- a/lib/CodeGen/RegisterScavenging.cpp
+++ b/lib/CodeGen/RegisterScavenging.cpp
@@ -62,10 +62,9 @@ void RegScavenger::init(MachineBasicBlock &MBB) {
}
this->MBB = &MBB;
- for (SmallVectorImpl<ScavengedInfo>::iterator I = Scavenged.begin(),
- IE = Scavenged.end(); I != IE; ++I) {
- I->Reg = 0;
- I->Restore = nullptr;
+ for (ScavengedInfo &SI : Scavenged) {
+ SI.Reg = 0;
+ SI.Restore = nullptr;
}
Tracking = false;
diff --git a/lib/CodeGen/SafeStack.cpp b/lib/CodeGen/SafeStack.cpp
index 7fa379d80c6c..08b3d345f689 100644
--- a/lib/CodeGen/SafeStack.cpp
+++ b/lib/CodeGen/SafeStack.cpp
@@ -19,6 +19,7 @@
#include "SafeStackLayout.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
@@ -92,11 +93,11 @@ public:
/// determined statically), and the unsafe stack, which contains all
/// local variables that are accessed in ways that we can't prove to
/// be safe.
-class SafeStack : public FunctionPass {
- const TargetMachine *TM;
- const TargetLoweringBase *TL;
- const DataLayout *DL;
- ScalarEvolution *SE;
+class SafeStack {
+ Function &F;
+ const TargetLoweringBase &TL;
+ const DataLayout &DL;
+ ScalarEvolution &SE;
Type *StackPtrTy;
Type *IntPtrTy;
@@ -171,33 +172,21 @@ class SafeStack : public FunctionPass {
uint64_t AllocaSize);
public:
- static char ID; // Pass identification, replacement for typeid.
- SafeStack(const TargetMachine *TM)
- : FunctionPass(ID), TM(TM), TL(nullptr), DL(nullptr) {
- initializeSafeStackPass(*PassRegistry::getPassRegistry());
- }
- SafeStack() : SafeStack(nullptr) {}
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<ScalarEvolutionWrapperPass>();
- }
-
- bool doInitialization(Module &M) override {
- DL = &M.getDataLayout();
-
- StackPtrTy = Type::getInt8PtrTy(M.getContext());
- IntPtrTy = DL->getIntPtrType(M.getContext());
- Int32Ty = Type::getInt32Ty(M.getContext());
- Int8Ty = Type::getInt8Ty(M.getContext());
-
- return false;
- }
-
- bool runOnFunction(Function &F) override;
-}; // class SafeStack
+ SafeStack(Function &F, const TargetLoweringBase &TL, const DataLayout &DL,
+ ScalarEvolution &SE)
+ : F(F), TL(TL), DL(DL), SE(SE),
+ StackPtrTy(Type::getInt8PtrTy(F.getContext())),
+ IntPtrTy(DL.getIntPtrType(F.getContext())),
+ Int32Ty(Type::getInt32Ty(F.getContext())),
+ Int8Ty(Type::getInt8Ty(F.getContext())) {}
+
+ // Run the transformation on the associated function.
+ // Returns whether the function was changed.
+ bool run();
+};
uint64_t SafeStack::getStaticAllocaAllocationSize(const AllocaInst* AI) {
- uint64_t Size = DL->getTypeAllocSize(AI->getAllocatedType());
+ uint64_t Size = DL.getTypeAllocSize(AI->getAllocatedType());
if (AI->isArrayAllocation()) {
auto C = dyn_cast<ConstantInt>(AI->getArraySize());
if (!C)
@@ -209,11 +198,11 @@ uint64_t SafeStack::getStaticAllocaAllocationSize(const AllocaInst* AI) {
bool SafeStack::IsAccessSafe(Value *Addr, uint64_t AccessSize,
const Value *AllocaPtr, uint64_t AllocaSize) {
- AllocaOffsetRewriter Rewriter(*SE, AllocaPtr);
- const SCEV *Expr = Rewriter.visit(SE->getSCEV(Addr));
+ AllocaOffsetRewriter Rewriter(SE, AllocaPtr);
+ const SCEV *Expr = Rewriter.visit(SE.getSCEV(Addr));
- uint64_t BitWidth = SE->getTypeSizeInBits(Expr->getType());
- ConstantRange AccessStartRange = SE->getUnsignedRange(Expr);
+ uint64_t BitWidth = SE.getTypeSizeInBits(Expr->getType());
+ ConstantRange AccessStartRange = SE.getUnsignedRange(Expr);
ConstantRange SizeRange =
ConstantRange(APInt(BitWidth, 0), APInt(BitWidth, AccessSize));
ConstantRange AccessRange = AccessStartRange.add(SizeRange);
@@ -226,8 +215,8 @@ bool SafeStack::IsAccessSafe(Value *Addr, uint64_t AccessSize,
<< *AllocaPtr << "\n"
<< " Access " << *Addr << "\n"
<< " SCEV " << *Expr
- << " U: " << SE->getUnsignedRange(Expr)
- << ", S: " << SE->getSignedRange(Expr) << "\n"
+ << " U: " << SE.getUnsignedRange(Expr)
+ << ", S: " << SE.getSignedRange(Expr) << "\n"
<< " Range " << AccessRange << "\n"
<< " AllocaRange " << AllocaRange << "\n"
<< " " << (Safe ? "safe" : "unsafe") << "\n");
@@ -266,7 +255,7 @@ bool SafeStack::IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize) {
switch (I->getOpcode()) {
case Instruction::Load: {
- if (!IsAccessSafe(UI, DL->getTypeStoreSize(I->getType()), AllocaPtr,
+ if (!IsAccessSafe(UI, DL.getTypeStoreSize(I->getType()), AllocaPtr,
AllocaSize))
return false;
break;
@@ -282,7 +271,7 @@ bool SafeStack::IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize) {
return false;
}
- if (!IsAccessSafe(UI, DL->getTypeStoreSize(I->getOperand(0)->getType()),
+ if (!IsAccessSafe(UI, DL.getTypeStoreSize(I->getOperand(0)->getType()),
AllocaPtr, AllocaSize))
return false;
break;
@@ -343,7 +332,7 @@ bool SafeStack::IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize) {
}
Value *SafeStack::getStackGuard(IRBuilder<> &IRB, Function &F) {
- Value *StackGuardVar = TL->getIRStackGuard(IRB);
+ Value *StackGuardVar = TL.getIRStackGuard(IRB);
if (!StackGuardVar)
StackGuardVar =
F.getParent()->getOrInsertGlobal("__stack_chk_guard", StackPtrTy);
@@ -390,7 +379,7 @@ void SafeStack::findInsts(Function &F,
if (!Arg.hasByValAttr())
continue;
uint64_t Size =
- DL->getTypeStoreSize(Arg.getType()->getPointerElementType());
+ DL.getTypeStoreSize(Arg.getType()->getPointerElementType());
if (IsSafeStackAlloca(&Arg, Size))
continue;
@@ -476,19 +465,19 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack(
if (StackGuardSlot) {
Type *Ty = StackGuardSlot->getAllocatedType();
unsigned Align =
- std::max(DL->getPrefTypeAlignment(Ty), StackGuardSlot->getAlignment());
+ std::max(DL.getPrefTypeAlignment(Ty), StackGuardSlot->getAlignment());
SSL.addObject(StackGuardSlot, getStaticAllocaAllocationSize(StackGuardSlot),
Align, SSC.getFullLiveRange());
}
for (Argument *Arg : ByValArguments) {
Type *Ty = Arg->getType()->getPointerElementType();
- uint64_t Size = DL->getTypeStoreSize(Ty);
+ uint64_t Size = DL.getTypeStoreSize(Ty);
if (Size == 0)
Size = 1; // Don't create zero-sized stack objects.
// Ensure the object is properly aligned.
- unsigned Align = std::max((unsigned)DL->getPrefTypeAlignment(Ty),
+ unsigned Align = std::max((unsigned)DL.getPrefTypeAlignment(Ty),
Arg->getParamAlignment());
SSL.addObject(Arg, Size, Align, SSC.getFullLiveRange());
}
@@ -501,7 +490,7 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack(
// Ensure the object is properly aligned.
unsigned Align =
- std::max((unsigned)DL->getPrefTypeAlignment(Ty), AI->getAlignment());
+ std::max((unsigned)DL.getPrefTypeAlignment(Ty), AI->getAlignment());
SSL.addObject(AI, Size, Align, SSC.getLiveRange(AI));
}
@@ -539,7 +528,7 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack(
unsigned Offset = SSL.getObjectOffset(Arg);
Type *Ty = Arg->getType()->getPointerElementType();
- uint64_t Size = DL->getTypeStoreSize(Ty);
+ uint64_t Size = DL.getTypeStoreSize(Ty);
if (Size == 0)
Size = 1; // Don't create zero-sized stack objects.
@@ -630,7 +619,7 @@ void SafeStack::moveDynamicAllocasToUnsafeStack(
ArraySize = IRB.CreateIntCast(ArraySize, IntPtrTy, false);
Type *Ty = AI->getAllocatedType();
- uint64_t TySize = DL->getTypeAllocSize(Ty);
+ uint64_t TySize = DL.getTypeAllocSize(Ty);
Value *Size = IRB.CreateMul(ArraySize, ConstantInt::get(IntPtrTy, TySize));
Value *SP = IRB.CreatePtrToInt(IRB.CreateLoad(UnsafeStackPtr), IntPtrTy);
@@ -638,7 +627,7 @@ void SafeStack::moveDynamicAllocasToUnsafeStack(
// Align the SP value to satisfy the AllocaInst, type and stack alignments.
unsigned Align = std::max(
- std::max((unsigned)DL->getPrefTypeAlignment(Ty), AI->getAlignment()),
+ std::max((unsigned)DL.getPrefTypeAlignment(Ty), AI->getAlignment()),
(unsigned)StackAlignment);
assert(isPowerOf2_32(Align));
@@ -685,25 +674,10 @@ void SafeStack::moveDynamicAllocasToUnsafeStack(
}
}
-bool SafeStack::runOnFunction(Function &F) {
- DEBUG(dbgs() << "[SafeStack] Function: " << F.getName() << "\n");
-
- if (!F.hasFnAttribute(Attribute::SafeStack)) {
- DEBUG(dbgs() << "[SafeStack] safestack is not requested"
- " for this function\n");
- return false;
- }
-
- if (F.isDeclaration()) {
- DEBUG(dbgs() << "[SafeStack] function definition"
- " is not available\n");
- return false;
- }
-
- if (!TM)
- report_fatal_error("Target machine is required");
- TL = TM->getSubtargetImpl(F)->getTargetLowering();
- SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+bool SafeStack::run() {
+ assert(F.hasFnAttribute(Attribute::SafeStack) &&
+ "Can't run SafeStack on a function without the attribute");
+ assert(!F.isDeclaration() && "Can't run SafeStack on a function declaration");
++NumFunctions;
@@ -736,7 +710,7 @@ bool SafeStack::runOnFunction(Function &F) {
++NumUnsafeStackRestorePointsFunctions;
IRBuilder<> IRB(&F.front(), F.begin()->getFirstInsertionPt());
- UnsafeStackPtr = TL->getSafeStackPointerLocation(IRB);
+ UnsafeStackPtr = TL.getSafeStackPointerLocation(IRB);
// Load the current stack pointer (we'll also use it as a base pointer).
// FIXME: use a dedicated register for it ?
@@ -788,14 +762,70 @@ bool SafeStack::runOnFunction(Function &F) {
return true;
}
+class SafeStackLegacyPass : public FunctionPass {
+ const TargetMachine *TM;
+
+public:
+ static char ID; // Pass identification, replacement for typeid.
+ SafeStackLegacyPass(const TargetMachine *TM) : FunctionPass(ID), TM(TM) {
+ initializeSafeStackLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ SafeStackLegacyPass() : SafeStackLegacyPass(nullptr) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<AssumptionCacheTracker>();
+ }
+
+ bool runOnFunction(Function &F) override {
+ DEBUG(dbgs() << "[SafeStack] Function: " << F.getName() << "\n");
+
+ if (!F.hasFnAttribute(Attribute::SafeStack)) {
+ DEBUG(dbgs() << "[SafeStack] safestack is not requested"
+ " for this function\n");
+ return false;
+ }
+
+ if (F.isDeclaration()) {
+ DEBUG(dbgs() << "[SafeStack] function definition"
+ " is not available\n");
+ return false;
+ }
+
+ if (!TM)
+ report_fatal_error("Target machine is required");
+ auto *TL = TM->getSubtargetImpl(F)->getTargetLowering();
+ if (!TL)
+ report_fatal_error("TargetLowering instance is required");
+
+ auto *DL = &F.getParent()->getDataLayout();
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ auto &ACT = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+
+ // Compute DT and LI only for functions that have the attribute.
+ // This is only useful because the legacy pass manager doesn't let us
+ // compute analyses lazily.
+ // In the backend pipeline, nothing preserves DT before SafeStack, so we
+ // would otherwise always compute it wastefully, even if there is no
+ // function with the safestack attribute.
+ DominatorTree DT(F);
+ LoopInfo LI(DT);
+
+ ScalarEvolution SE(F, TLI, ACT, DT, LI);
+
+ return SafeStack(F, *TL, *DL, SE).run();
+ }
+};
+
} // anonymous namespace
-char SafeStack::ID = 0;
-INITIALIZE_TM_PASS_BEGIN(SafeStack, "safe-stack",
+char SafeStackLegacyPass::ID = 0;
+INITIALIZE_TM_PASS_BEGIN(SafeStackLegacyPass, "safe-stack",
"Safe Stack instrumentation pass", false, false)
-INITIALIZE_TM_PASS_END(SafeStack, "safe-stack",
+INITIALIZE_TM_PASS_END(SafeStackLegacyPass, "safe-stack",
"Safe Stack instrumentation pass", false, false)
FunctionPass *llvm::createSafeStackPass(const llvm::TargetMachine *TM) {
- return new SafeStack(TM);
+ return new SafeStackLegacyPass(TM);
}
diff --git a/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp b/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
new file mode 100644
index 000000000000..dab5b91f50ad
--- /dev/null
+++ b/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
@@ -0,0 +1,660 @@
+//=== ScalarizeMaskedMemIntrin.cpp - Scalarize unsupported masked mem ===//
+//=== intrinsics ===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass replaces masked memory intrinsics - when unsupported by the target
+// - with a chain of basic blocks that deal with the elements one by one if the
+// appropriate mask bit is set.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "scalarize-masked-mem-intrin"
+
+namespace {
+
+class ScalarizeMaskedMemIntrin : public FunctionPass {
+ const TargetTransformInfo *TTI;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit ScalarizeMaskedMemIntrin() : FunctionPass(ID), TTI(nullptr) {
+ initializeScalarizeMaskedMemIntrinPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnFunction(Function &F) override;
+
+ StringRef getPassName() const override {
+ return "Scalarize Masked Memory Intrinsics";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ }
+
+private:
+ bool optimizeBlock(BasicBlock &BB, bool &ModifiedDT);
+ bool optimizeCallInst(CallInst *CI, bool &ModifiedDT);
+};
+} // namespace
+
+char ScalarizeMaskedMemIntrin::ID = 0;
+INITIALIZE_PASS_BEGIN(ScalarizeMaskedMemIntrin, "scalarize-masked-mem-intrin",
+ "Scalarize unsupported masked memory intrinsics", false,
+ false)
+INITIALIZE_PASS_END(ScalarizeMaskedMemIntrin, "scalarize-masked-mem-intrin",
+ "Scalarize unsupported masked memory intrinsics", false,
+ false)
+
+FunctionPass *llvm::createScalarizeMaskedMemIntrinPass() {
+ return new ScalarizeMaskedMemIntrin();
+}
+
+// Translate a masked load intrinsic like
+// <16 x i32> @llvm.masked.load(<16 x i32>* %addr, i32 align,
+// <16 x i1> %mask, <16 x i32> %passthru)
+// to a chain of basic blocks, loading the elements one by one if
+// the appropriate mask bit is set
+//
+// %1 = bitcast i8* %addr to i32*
+// %2 = extractelement <16 x i1> %mask, i32 0
+// %3 = icmp eq i1 %2, true
+// br i1 %3, label %cond.load, label %else
+//
+// cond.load: ; preds = %0
+// %4 = getelementptr i32* %1, i32 0
+// %5 = load i32* %4
+// %6 = insertelement <16 x i32> undef, i32 %5, i32 0
+// br label %else
+//
+// else: ; preds = %0, %cond.load
+// %res.phi.else = phi <16 x i32> [ %6, %cond.load ], [ undef, %0 ]
+// %7 = extractelement <16 x i1> %mask, i32 1
+// %8 = icmp eq i1 %7, true
+// br i1 %8, label %cond.load1, label %else2
+//
+// cond.load1: ; preds = %else
+// %9 = getelementptr i32* %1, i32 1
+// %10 = load i32* %9
+// %11 = insertelement <16 x i32> %res.phi.else, i32 %10, i32 1
+// br label %else2
+//
+// else2: ; preds = %else, %cond.load1
+// %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
+// %12 = extractelement <16 x i1> %mask, i32 2
+// %13 = icmp eq i1 %12, true
+// br i1 %13, label %cond.load4, label %else5
+//
+static void scalarizeMaskedLoad(CallInst *CI) {
+ Value *Ptr = CI->getArgOperand(0);
+ Value *Alignment = CI->getArgOperand(1);
+ Value *Mask = CI->getArgOperand(2);
+ Value *Src0 = CI->getArgOperand(3);
+
+ unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
+ VectorType *VecType = dyn_cast<VectorType>(CI->getType());
+ assert(VecType && "Unexpected return type of masked load intrinsic");
+
+ Type *EltTy = CI->getType()->getVectorElementType();
+
+ IRBuilder<> Builder(CI->getContext());
+ Instruction *InsertPt = CI;
+ BasicBlock *IfBlock = CI->getParent();
+ BasicBlock *CondBlock = nullptr;
+ BasicBlock *PrevIfBlock = CI->getParent();
+
+ Builder.SetInsertPoint(InsertPt);
+ Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+ // Short-cut if the mask is all-true.
+ bool IsAllOnesMask =
+ isa<Constant>(Mask) && cast<Constant>(Mask)->isAllOnesValue();
+
+ if (IsAllOnesMask) {
+ Value *NewI = Builder.CreateAlignedLoad(Ptr, AlignVal);
+ CI->replaceAllUsesWith(NewI);
+ CI->eraseFromParent();
+ return;
+ }
+
+ // Adjust alignment for the scalar instruction.
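+ // A vector access aligned to AlignVal only guarantees
+ // min(AlignVal, element size) alignment for each individual element.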
+ AlignVal = std::min(AlignVal, VecType->getScalarSizeInBits() / 8);
+ // Bitcast %addr from i8* to EltTy*.
+ Type *NewPtrType =
+ EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
+ Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
+ unsigned VectorWidth = VecType->getNumElements();
+
+ Value *UndefVal = UndefValue::get(VecType);
+
+ // The result vector
+ Value *VResult = UndefVal;
+
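+ // If the mask is a compile-time constant vector, no control flow is
+ // needed: load each known-set lane directly and let the final select
+ // pick the pass-through value for the unset lanes.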
+ if (isa<ConstantVector>(Mask)) {
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
+ continue;
+ Value *Gep =
+ Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
+ LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal);
+ VResult =
+ Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx));
+ }
+ Value *NewI = Builder.CreateSelect(Mask, VResult, Src0);
+ CI->replaceAllUsesWith(NewI);
+ CI->eraseFromParent();
+ return;
+ }
+
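+ // Otherwise emit one conditional load per lane, threading the partial
+ // result vector through a PHI in each "else" block.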
+ PHINode *Phi = nullptr;
+ Value *PrevPhi = UndefVal;
+
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+
+ // Fill the "else" block, created in the previous iteration
+ //
+ // %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
+ // %mask_1 = extractelement <16 x i1> %mask, i32 Idx
+ // %to_load = icmp eq i1 %mask_1, true
+ // br i1 %to_load, label %cond.load, label %else
+ //
+ if (Idx > 0) {
+ Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
+ Phi->addIncoming(VResult, CondBlock);
+ Phi->addIncoming(PrevPhi, PrevIfBlock);
+ PrevPhi = Phi;
+ VResult = Phi;
+ }
+
+ Value *Predicate =
+ Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
+ Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+ ConstantInt::get(Predicate->getType(), 1));
+
+ // Create "cond" block
+ //
+ // %EltAddr = getelementptr i32* %1, i32 0
+ // %Elt = load i32* %EltAddr
+ // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
+ //
+ CondBlock = IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.load");
+ Builder.SetInsertPoint(InsertPt);
+
+ Value *Gep =
+ Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
+ LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal);
+ VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx));
+
+ // Create "else" block, fill it in the next iteration
+ BasicBlock *NewIfBlock =
+ CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
+ Builder.SetInsertPoint(InsertPt);
+ Instruction *OldBr = IfBlock->getTerminator();
+ BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+ OldBr->eraseFromParent();
+ PrevIfBlock = IfBlock;
+ IfBlock = NewIfBlock;
+ }
+
+ Phi = Builder.CreatePHI(VecType, 2, "res.phi.select");
+ Phi->addIncoming(VResult, CondBlock);
+ Phi->addIncoming(PrevPhi, PrevIfBlock);
+ Value *NewI = Builder.CreateSelect(Mask, Phi, Src0);
+ CI->replaceAllUsesWith(NewI);
+ CI->eraseFromParent();
+}
+
+// Translate a masked store intrinsic, like
+// void @llvm.masked.store(<16 x i32> %src, <16 x i32>* %addr, i32 align,
+// <16 x i1> %mask)
+// to a chain of basic blocks that store the elements one by one if
+// the appropriate mask bit is set
+//
+// %1 = bitcast i8* %addr to i32*
+// %2 = extractelement <16 x i1> %mask, i32 0
+// %3 = icmp eq i1 %2, true
+// br i1 %3, label %cond.store, label %else
+//
+// cond.store: ; preds = %0
+// %4 = extractelement <16 x i32> %val, i32 0
+// %5 = getelementptr i32* %1, i32 0
+// store i32 %4, i32* %5
+// br label %else
+//
+// else: ; preds = %0, %cond.store
+// %6 = extractelement <16 x i1> %mask, i32 1
+// %7 = icmp eq i1 %6, true
+// br i1 %7, label %cond.store1, label %else2
+//
+// cond.store1: ; preds = %else
+// %8 = extractelement <16 x i32> %val, i32 1
+// %9 = getelementptr i32* %1, i32 1
+// store i32 %8, i32* %9
+// br label %else2
+// . . .
+static void scalarizeMaskedStore(CallInst *CI) {
+ Value *Src = CI->getArgOperand(0);
+ Value *Ptr = CI->getArgOperand(1);
+ Value *Alignment = CI->getArgOperand(2);
+ Value *Mask = CI->getArgOperand(3);
+
+ unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
+ VectorType *VecType = dyn_cast<VectorType>(Src->getType());
+ assert(VecType && "Unexpected data type in masked store intrinsic");
+
+ Type *EltTy = VecType->getElementType();
+
+ IRBuilder<> Builder(CI->getContext());
+ Instruction *InsertPt = CI;
+ BasicBlock *IfBlock = CI->getParent();
+ Builder.SetInsertPoint(InsertPt);
+ Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+ // Short-cut if the mask is all-true.
+ bool IsAllOnesMask =
+ isa<Constant>(Mask) && cast<Constant>(Mask)->isAllOnesValue();
+
+ if (IsAllOnesMask) {
+ Builder.CreateAlignedStore(Src, Ptr, AlignVal);
+ CI->eraseFromParent();
+ return;
+ }
+
+ // Adjust alignment for the scalar instruction.
+ AlignVal = std::min(AlignVal, VecType->getScalarSizeInBits() / 8);
+ // Bitcast %addr from i8* to EltTy*.
+ Type *NewPtrType =
+ EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
+ Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
+ unsigned VectorWidth = VecType->getNumElements();
+
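+ // If the mask is a compile-time constant vector, no control flow is
+ // needed: just store each known-set lane directly.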
+ if (isa<ConstantVector>(Mask)) {
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
+ continue;
+ Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx));
+ Value *Gep =
+ Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
+ Builder.CreateAlignedStore(OneElt, Gep, AlignVal);
+ }
+ CI->eraseFromParent();
+ return;
+ }
+
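+ // A store produces no value, so unlike the load case no PHI chain is
+ // needed; each lane simply gets its own conditional store block.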
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+
+ // Fill the "else" block, created in the previous iteration
+ //
+ // %mask_1 = extractelement <16 x i1> %mask, i32 Idx
+ // %to_store = icmp eq i1 %mask_1, true
+ // br i1 %to_store, label %cond.store, label %else
+ //
+ Value *Predicate =
+ Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
+ Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+ ConstantInt::get(Predicate->getType(), 1));
+
+ // Create "cond" block
+ //
+ // %OneElt = extractelement <16 x i32> %Src, i32 Idx
+ // %EltAddr = getelementptr i32* %1, i32 0
+ // store i32 %OneElt, i32* %EltAddr
+ //
+ BasicBlock *CondBlock =
+ IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.store");
+ Builder.SetInsertPoint(InsertPt);
+
+ Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx));
+ Value *Gep =
+ Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
+ Builder.CreateAlignedStore(OneElt, Gep, AlignVal);
+
+ // Create "else" block, fill it in the next iteration
+ BasicBlock *NewIfBlock =
+ CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
+ Builder.SetInsertPoint(InsertPt);
+ Instruction *OldBr = IfBlock->getTerminator();
+ BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+ OldBr->eraseFromParent();
+ IfBlock = NewIfBlock;
+ }
+ CI->eraseFromParent();
+}
+
+// Translate a masked gather intrinsic like
+// <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %Ptrs, i32 4,
+// <16 x i1> %Mask, <16 x i32> %Src)
+// to a chain of basic blocks, loading the elements one by one if
+// the appropriate mask bit is set
+//
+// %Ptrs = getelementptr i32, i32* %base, <16 x i64> %ind
+// %Mask0 = extractelement <16 x i1> %Mask, i32 0
+// %ToLoad0 = icmp eq i1 %Mask0, true
+// br i1 %ToLoad0, label %cond.load, label %else
+//
+// cond.load:
+// %Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0
+// %Load0 = load i32, i32* %Ptr0, align 4
+// %Res0 = insertelement <16 x i32> undef, i32 %Load0, i32 0
+// br label %else
+//
+// else:
+// %res.phi.else = phi <16 x i32> [ %Res0, %cond.load ], [ undef, %0 ]
+// %Mask1 = extractelement <16 x i1> %Mask, i32 1
+// %ToLoad1 = icmp eq i1 %Mask1, true
+// br i1 %ToLoad1, label %cond.load1, label %else2
+//
+// cond.load1:
+// %Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
+// %Load1 = load i32, i32* %Ptr1, align 4
+// %Res1 = insertelement <16 x i32> %res.phi.else, i32 %Load1, i32 1
+// br label %else2
+// . . .
+// %Result = select <16 x i1> %Mask, <16 x i32> %res.phi.select, <16 x i32> %Src
+// ret <16 x i32> %Result
+static void scalarizeMaskedGather(CallInst *CI) {
+ Value *Ptrs = CI->getArgOperand(0);
+ Value *Alignment = CI->getArgOperand(1);
+ Value *Mask = CI->getArgOperand(2);
+ Value *Src0 = CI->getArgOperand(3);
+
+ VectorType *VecType = dyn_cast<VectorType>(CI->getType());
+
+ assert(VecType && "Unexpected return type of masked gather intrinsic");
+
+ IRBuilder<> Builder(CI->getContext());
+ Instruction *InsertPt = CI;
+ BasicBlock *IfBlock = CI->getParent();
+ BasicBlock *CondBlock = nullptr;
+ BasicBlock *PrevIfBlock = CI->getParent();
+ Builder.SetInsertPoint(InsertPt);
+ unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
+
+ Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+ Value *UndefVal = UndefValue::get(VecType);
+
+ // The result vector
+ Value *VResult = UndefVal;
+ unsigned VectorWidth = VecType->getNumElements();
+
+ // Short-cut if the mask is a vector of constants.
+ bool IsConstMask = isa<ConstantVector>(Mask);
+
+ if (IsConstMask) {
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
+ continue;
+ Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
+ "Ptr" + Twine(Idx));
+ LoadInst *Load =
+ Builder.CreateAlignedLoad(Ptr, AlignVal, "Load" + Twine(Idx));
+ VResult = Builder.CreateInsertElement(
+ VResult, Load, Builder.getInt32(Idx), "Res" + Twine(Idx));
+ }
+ Value *NewI = Builder.CreateSelect(Mask, VResult, Src0);
+ CI->replaceAllUsesWith(NewI);
+ CI->eraseFromParent();
+ return;
+ }
+
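+ // As in the masked-load case, emit one conditional load per lane and
+ // thread the partial result through a PHI in each "else" block.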
+ PHINode *Phi = nullptr;
+ Value *PrevPhi = UndefVal;
+
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+
+ // Fill the "else" block, created in the previous iteration
+ //
+ // %Mask1 = extractelement <16 x i1> %Mask, i32 1
+ // %ToLoad1 = icmp eq i1 %Mask1, true
+ // br i1 %ToLoad1, label %cond.load, label %else
+ //
+ if (Idx > 0) {
+ Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
+ Phi->addIncoming(VResult, CondBlock);
+ Phi->addIncoming(PrevPhi, PrevIfBlock);
+ PrevPhi = Phi;
+ VResult = Phi;
+ }
+
+ Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx),
+ "Mask" + Twine(Idx));
+ Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+ ConstantInt::get(Predicate->getType(), 1),
+ "ToLoad" + Twine(Idx));
+
+ // Create "cond" block
+ //
+ // %EltAddr = getelementptr i32* %1, i32 0
+ // %Elt = load i32* %EltAddr
+ // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
+ //
+ CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load");
+ Builder.SetInsertPoint(InsertPt);
+
+ Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
+ "Ptr" + Twine(Idx));
+ LoadInst *Load =
+ Builder.CreateAlignedLoad(Ptr, AlignVal, "Load" + Twine(Idx));
+ VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx),
+ "Res" + Twine(Idx));
+
+ // Create "else" block, fill it in the next iteration
+ BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+ Builder.SetInsertPoint(InsertPt);
+ Instruction *OldBr = IfBlock->getTerminator();
+ BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+ OldBr->eraseFromParent();
+ PrevIfBlock = IfBlock;
+ IfBlock = NewIfBlock;
+ }
+
+ Phi = Builder.CreatePHI(VecType, 2, "res.phi.select");
+ Phi->addIncoming(VResult, CondBlock);
+ Phi->addIncoming(PrevPhi, PrevIfBlock);
+ Value *NewI = Builder.CreateSelect(Mask, Phi, Src0);
+ CI->replaceAllUsesWith(NewI);
+ CI->eraseFromParent();
+}
+
+// Translate a masked scatter intrinsic, like
+// void @llvm.masked.scatter.v16i32(<16 x i32> %Src, <16 x i32*> %Ptrs, i32 4,
+// <16 x i1> %Mask)
+// to a chain of basic blocks that store the elements one by one if
+// the appropriate mask bit is set.
+//
+// %Ptrs = getelementptr i32, i32* %ptr, <16 x i64> %ind
+// %Mask0 = extractelement <16 x i1> %Mask, i32 0
+// %ToStore0 = icmp eq i1 %Mask0, true
+// br i1 %ToStore0, label %cond.store, label %else
+//
+// cond.store:
+// %Elt0 = extractelement <16 x i32> %Src, i32 0
+// %Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0
+// store i32 %Elt0, i32* %Ptr0, align 4
+// br label %else
+//
+// else:
+// %Mask1 = extractelement <16 x i1> %Mask, i32 1
+// %ToStore1 = icmp eq i1 %Mask1, true
+// br i1 %ToStore1, label %cond.store1, label %else2
+//
+// cond.store1:
+// %Elt1 = extractelement <16 x i32> %Src, i32 1
+// %Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
+// store i32 %Elt1, i32* %Ptr1, align 4
+// br label %else2
+// . . .
+static void scalarizeMaskedScatter(CallInst *CI) {
+ Value *Src = CI->getArgOperand(0);
+ Value *Ptrs = CI->getArgOperand(1);
+ Value *Alignment = CI->getArgOperand(2);
+ Value *Mask = CI->getArgOperand(3);
+
+ assert(isa<VectorType>(Src->getType()) &&
+ "Unexpected data type in masked scatter intrinsic");
+ assert(isa<VectorType>(Ptrs->getType()) &&
+ isa<PointerType>(Ptrs->getType()->getVectorElementType()) &&
+ "Vector of pointers is expected in masked scatter intrinsic");
+
+ IRBuilder<> Builder(CI->getContext());
+ Instruction *InsertPt = CI;
+ BasicBlock *IfBlock = CI->getParent();
+ Builder.SetInsertPoint(InsertPt);
+ Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+ unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
+ unsigned VectorWidth = Src->getType()->getVectorNumElements();
+
+ // Short-cut if the mask is a vector of constants.
+ bool IsConstMask = isa<ConstantVector>(Mask);
+
+ if (IsConstMask) {
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
+ continue;
+ Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx),
+ "Elt" + Twine(Idx));
+ Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
+ "Ptr" + Twine(Idx));
+ Builder.CreateAlignedStore(OneElt, Ptr, AlignVal);
+ }
+ CI->eraseFromParent();
+ return;
+ }
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ // Fill the "else" block, created in the previous iteration
+ //
+ // %Mask1 = extractelement <16 x i1> %Mask, i32 Idx
+ // %ToStore = icmp eq i1 %Mask1, true
+ // br i1 %ToStore, label %cond.store, label %else
+ //
+ Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx),
+ "Mask" + Twine(Idx));
+ Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+ ConstantInt::get(Predicate->getType(), 1),
+ "ToStore" + Twine(Idx));
+
+ // Create "cond" block
+ //
+ // %Elt1 = extractelement <16 x i32> %Src, i32 1
+ // %Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
+ // store i32 %Elt1, i32* %Ptr1
+ //
+ BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
+ Builder.SetInsertPoint(InsertPt);
+
+ Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx),
+ "Elt" + Twine(Idx));
+ Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
+ "Ptr" + Twine(Idx));
+ Builder.CreateAlignedStore(OneElt, Ptr, AlignVal);
+
+ // Create "else" block, fill it in the next iteration
+ BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+ Builder.SetInsertPoint(InsertPt);
+ Instruction *OldBr = IfBlock->getTerminator();
+ BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+ OldBr->eraseFromParent();
+ IfBlock = NewIfBlock;
+ }
+ CI->eraseFromParent();
+}
+
+bool ScalarizeMaskedMemIntrin::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ bool EverMadeChange = false;
+
+ TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+ bool MadeChange = true;
+ while (MadeChange) {
+ MadeChange = false;
+ for (Function::iterator I = F.begin(); I != F.end();) {
+ BasicBlock *BB = &*I++;
+ bool ModifiedDTOnIteration = false;
+ MadeChange |= optimizeBlock(*BB, ModifiedDTOnIteration);
+
+ // Restart BB iteration if the dominator tree of the Function was changed
+ if (ModifiedDTOnIteration)
+ break;
+ }
+
+ EverMadeChange |= MadeChange;
+ }
+
+ return EverMadeChange;
+}
+
+bool ScalarizeMaskedMemIntrin::optimizeBlock(BasicBlock &BB, bool &ModifiedDT) {
+ bool MadeChange = false;
+
+ BasicBlock::iterator CurInstIterator = BB.begin();
+ while (CurInstIterator != BB.end()) {
+ if (CallInst *CI = dyn_cast<CallInst>(&*CurInstIterator++))
+ MadeChange |= optimizeCallInst(CI, ModifiedDT);
+ if (ModifiedDT)
+ return true;
+ }
+
+ return MadeChange;
+}
+
+bool ScalarizeMaskedMemIntrin::optimizeCallInst(CallInst *CI,
+ bool &ModifiedDT) {
+
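+ // Each scalarize* helper below splits the current basic block into a
+ // chain of new blocks, so signal ModifiedDT and let the caller restart
+ // its iteration.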
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
+ if (II) {
+ switch (II->getIntrinsicID()) {
+ default:
+ break;
+ case Intrinsic::masked_load: {
+ // Scalarize unsupported vector masked load
+ if (!TTI->isLegalMaskedLoad(CI->getType())) {
+ scalarizeMaskedLoad(CI);
+ ModifiedDT = true;
+ return true;
+ }
+ return false;
+ }
+ case Intrinsic::masked_store: {
+ if (!TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType())) {
+ scalarizeMaskedStore(CI);
+ ModifiedDT = true;
+ return true;
+ }
+ return false;
+ }
+ case Intrinsic::masked_gather: {
+ if (!TTI->isLegalMaskedGather(CI->getType())) {
+ scalarizeMaskedGather(CI);
+ ModifiedDT = true;
+ return true;
+ }
+ return false;
+ }
+ case Intrinsic::masked_scatter: {
+ if (!TTI->isLegalMaskedScatter(CI->getArgOperand(0)->getType())) {
+ scalarizeMaskedScatter(CI);
+ ModifiedDT = true;
+ return true;
+ }
+ return false;
+ }
+ }
+ }
+
+ return false;
+}
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index c77046fdfaf5..caf5cb497a71 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -114,7 +114,7 @@ namespace {
SmallPtrSet<SDNode *, 32> CombinedNodes;
// AA - Used for DAG load/store alias analysis.
- AliasAnalysis &AA;
+ AliasAnalysis *AA;
/// When an instruction is simplified, add all users of the instruction to
/// the work lists because they might get more simplified now.
@@ -496,9 +496,9 @@ namespace {
SDValue distributeTruncateThroughAnd(SDNode *N);
public:
- DAGCombiner(SelectionDAG &D, AliasAnalysis &A, CodeGenOpt::Level OL)
+ DAGCombiner(SelectionDAG &D, AliasAnalysis *AA, CodeGenOpt::Level OL)
: DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
- OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) {
+ OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(AA) {
ForCodeSize = DAG.getMachineFunction().getFunction()->optForSize();
MaximumLegalStoreInBits = 0;
@@ -1729,10 +1729,9 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) {
NumLeftToConsider--;
}
- SDValue Result;
-
// If we've changed things around then replace token factor.
if (Changed) {
+ SDValue Result;
if (Ops.empty()) {
// The entry token is the only possible outcome.
Result = DAG.getEntryNode();
@@ -1749,13 +1748,9 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) {
Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Ops);
}
}
-
- // Add users to worklist, since we may introduce a lot of new
- // chained token factors while removing memory deps.
- return CombineTo(N, Result, true /*add to worklist*/);
+ return Result;
}
-
- return Result;
+ return SDValue();
}
/// MERGE_VALUES can always be eliminated.
@@ -2131,17 +2126,17 @@ SDValue DAGCombiner::visitADDCARRY(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue CarryIn = N->getOperand(2);
+ SDLoc DL(N);
// canonicalize constant to RHS
ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
if (N0C && !N1C)
- return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(),
- N1, N0, CarryIn);
+ return DAG.getNode(ISD::ADDCARRY, DL, N->getVTList(), N1, N0, CarryIn);
// fold (addcarry x, y, false) -> (uaddo x, y)
if (isNullConstant(CarryIn))
- return DAG.getNode(ISD::UADDO, SDLoc(N), N->getVTList(), N0, N1);
+ return DAG.getNode(ISD::UADDO, DL, N->getVTList(), N0, N1);
if (SDValue Combined = visitADDCARRYLike(N0, N1, CarryIn, N))
return Combined;
@@ -5313,17 +5308,6 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
}
}
- // If the target supports masking y in (shl, y),
- // fold (shl x, (and y, ((1 << numbits(x)) - 1))) -> (shl x, y)
- if (TLI.isOperationLegal(ISD::SHL, VT) &&
- TLI.supportsModuloShift(ISD::SHL, VT) && N1->getOpcode() == ISD::AND) {
- if (ConstantSDNode *Mask = isConstOrConstSplat(N1->getOperand(1))) {
- if (Mask->getZExtValue() == OpSizeInBits - 1) {
- return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, N1->getOperand(0));
- }
- }
- }
-
ConstantSDNode *N1C = isConstOrConstSplat(N1);
// fold (shl c1, c2) -> c1<<c2
@@ -5331,7 +5315,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
if (N0C && N1C && !N1C->isOpaque())
return DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, N0C, N1C);
// fold (shl 0, x) -> 0
- if (isNullConstant(N0))
+ if (isNullConstantOrNullSplatConstant(N0))
return N0;
// fold (shl x, c >= size(x)) -> undef
if (N1C && N1C->getAPIntValue().uge(OpSizeInBits))
@@ -5522,18 +5506,9 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
EVT VT = N0.getValueType();
unsigned OpSizeInBits = VT.getScalarSizeInBits();
- // If the target supports masking y in (sra, y),
- // fold (sra x, (and y, ((1 << numbits(x)) - 1))) -> (sra x, y)
- if (TLI.isOperationLegal(ISD::SRA, VT) &&
- TLI.supportsModuloShift(ISD::SRA, VT) && N1->getOpcode() == ISD::AND) {
- if (ConstantSDNode *Mask = isConstOrConstSplat(N1->getOperand(1))) {
- if (Mask->getZExtValue() == OpSizeInBits - 1) {
- return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0, N1->getOperand(0));
- }
- }
- }
-
// Arithmetic shifting an all-sign-bit value is a no-op.
+ // fold (sra 0, x) -> 0
+ // fold (sra -1, x) -> -1
if (DAG.ComputeNumSignBits(N0) == OpSizeInBits)
return N0;
@@ -5548,12 +5523,6 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
if (N0C && N1C && !N1C->isOpaque())
return DAG.FoldConstantArithmetic(ISD::SRA, SDLoc(N), VT, N0C, N1C);
- // fold (sra 0, x) -> 0
- if (isNullConstant(N0))
- return N0;
- // fold (sra -1, x) -> -1
- if (isAllOnesConstant(N0))
- return N0;
// fold (sra x, c >= size(x)) -> undef
if (N1C && N1C->getAPIntValue().uge(OpSizeInBits))
return DAG.getUNDEF(VT);
@@ -5691,17 +5660,6 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
EVT VT = N0.getValueType();
unsigned OpSizeInBits = VT.getScalarSizeInBits();
- // If the target supports masking y in (srl, y),
- // fold (srl x, (and y, ((1 << numbits(x)) - 1))) -> (srl x, y)
- if (TLI.isOperationLegal(ISD::SRL, VT) &&
- TLI.supportsModuloShift(ISD::SRL, VT) && N1->getOpcode() == ISD::AND) {
- if (ConstantSDNode *Mask = isConstOrConstSplat(N1->getOperand(1))) {
- if (Mask->getZExtValue() == OpSizeInBits - 1) {
- return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, N1->getOperand(0));
- }
- }
- }
-
// fold vector ops
if (VT.isVector())
if (SDValue FoldedVOp = SimplifyVBinOp(N))
@@ -5714,7 +5672,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
if (N0C && N1C && !N1C->isOpaque())
return DAG.FoldConstantArithmetic(ISD::SRL, SDLoc(N), VT, N0C, N1C);
// fold (srl 0, x) -> 0
- if (isNullConstant(N0))
+ if (isNullConstantOrNullSplatConstant(N0))
return N0;
// fold (srl x, c >= size(x)) -> undef
if (N1C && N1C->getAPIntValue().uge(OpSizeInBits))
@@ -7365,14 +7323,8 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
N0.getValueSizeInBits(),
std::min(Op.getValueSizeInBits(),
VT.getSizeInBits()));
- if (TruncatedBits.isSubsetOf(Known.Zero)) {
- if (VT.bitsGT(Op.getValueType()))
- return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Op);
- if (VT.bitsLT(Op.getValueType()))
- return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op);
-
- return Op;
- }
+ if (TruncatedBits.isSubsetOf(Known.Zero))
+ return DAG.getZExtOrTrunc(Op, SDLoc(N), VT);
}
// fold (zext (truncate (load x))) -> (zext (smaller load x))
@@ -7419,14 +7371,8 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
}
if (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT)) {
- SDValue Op = N0.getOperand(0);
- if (SrcVT.bitsLT(VT)) {
- Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, Op);
- AddToWorklist(Op.getNode());
- } else if (SrcVT.bitsGT(VT)) {
- Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op);
- AddToWorklist(Op.getNode());
- }
+ SDValue Op = DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT);
+ AddToWorklist(Op.getNode());
return DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType());
}
}
@@ -7440,11 +7386,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
N0.getValueType()) ||
!TLI.isZExtFree(N0.getValueType(), VT))) {
SDValue X = N0.getOperand(0).getOperand(0);
- if (X.getValueType().bitsLT(VT)) {
- X = DAG.getNode(ISD::ANY_EXTEND, SDLoc(X), VT, X);
- } else if (X.getValueType().bitsGT(VT)) {
- X = DAG.getNode(ISD::TRUNCATE, SDLoc(X), VT, X);
- }
+ X = DAG.getAnyExtOrTrunc(X, SDLoc(X), VT);
APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
Mask = Mask.zext(VT.getSizeInBits());
SDLoc DL(N);
@@ -7669,14 +7611,8 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
}
// fold (aext (truncate x))
- if (N0.getOpcode() == ISD::TRUNCATE) {
- SDValue TruncOp = N0.getOperand(0);
- if (TruncOp.getValueType() == VT)
- return TruncOp; // x iff x size == zext size.
- if (TruncOp.getValueType().bitsGT(VT))
- return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, TruncOp);
- return DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, TruncOp);
- }
+ if (N0.getOpcode() == ISD::TRUNCATE)
+ return DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT);
// Fold (aext (and (trunc x), cst)) -> (and x, cst)
// if the trunc is not free.
@@ -7687,11 +7623,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
N0.getValueType())) {
SDLoc DL(N);
SDValue X = N0.getOperand(0).getOperand(0);
- if (X.getValueType().bitsLT(VT)) {
- X = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X);
- } else if (X.getValueType().bitsGT(VT)) {
- X = DAG.getNode(ISD::TRUNCATE, DL, VT, X);
- }
+ X = DAG.getAnyExtOrTrunc(X, DL, VT);
APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
Mask = Mask.zext(VT.getSizeInBits());
return DAG.getNode(ISD::AND, DL, VT,
@@ -14868,6 +14800,55 @@ SDValue combineTruncationShuffle(ShuffleVectorSDNode *SVN, SelectionDAG &DAG) {
return SDValue();
}
+// Combine shuffles of splat-shuffles of the form:
+// shuffle (shuffle V, undef, splat-mask), undef, M
+// If splat-mask contains undef elements, we need to be careful about
+// introducing undef's in the folded mask which are not the result of composing
+// the masks of the shuffles.
+static SDValue combineShuffleOfSplat(ArrayRef<int> UserMask,
+ ShuffleVectorSDNode *Splat,
+ SelectionDAG &DAG) {
+ ArrayRef<int> SplatMask = Splat->getMask();
+ assert(UserMask.size() == SplatMask.size() && "Mask length mismatch");
+
+ // Prefer simplifying to the splat-shuffle, if possible. This is legal if
+ // every undef mask element in the splat-shuffle has a corresponding undef
+ // element in the user-shuffle's mask or if the composition of mask elements
+ // would result in undef.
+ // Examples for (shuffle (shuffle v, undef, SplatMask), undef, UserMask):
+ // * UserMask=[0,2,u,u], SplatMask=[2,u,2,u] -> [2,2,u,u]
+ // In this case it is not legal to simplify to the splat-shuffle because we
+ // may be exposing to the users of the shuffle an undef element at index 1
+ // that was not there before the combine.
+ // * UserMask=[0,u,2,u], SplatMask=[2,u,2,u] -> [2,u,2,u]
+ // In this case the composition of masks yields SplatMask, so it's ok to
+ // simplify to the splat-shuffle.
+ // * UserMask=[3,u,2,u], SplatMask=[2,u,2,u] -> [u,u,2,u]
+ // In this case the composed mask includes all undef elements of SplatMask
+ // and in addition sets element zero to undef. It is safe to simplify to
+ // the splat-shuffle.
+ auto CanSimplifyToExistingSplat = [](ArrayRef<int> UserMask,
+ ArrayRef<int> SplatMask) {
+ for (unsigned i = 0, e = UserMask.size(); i != e; ++i)
+ if (UserMask[i] != -1 && SplatMask[i] == -1 &&
+ SplatMask[UserMask[i]] != -1)
+ return false;
+ return true;
+ };
+ if (CanSimplifyToExistingSplat(UserMask, SplatMask))
+ return SDValue(Splat, 0);
+
+ // Create a new shuffle with a mask that is composed of the two shuffles'
+ // masks.
+ SmallVector<int, 32> NewMask;
+ for (int Idx : UserMask)
+ NewMask.push_back(Idx == -1 ? -1 : SplatMask[Idx]);
+
+ return DAG.getVectorShuffle(Splat->getValueType(0), SDLoc(Splat),
+ Splat->getOperand(0), Splat->getOperand(1),
+ NewMask);
+}
+
SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
EVT VT = N->getValueType(0);
unsigned NumElts = VT.getVectorNumElements();
@@ -14914,6 +14895,11 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
return DAG.getVectorShuffle(VT, SDLoc(N), N0, N1, NewMask);
}
+ // A shuffle of a single vector that is a splat can always be folded.
+ if (auto *N0Shuf = dyn_cast<ShuffleVectorSDNode>(N0))
+ if (N1->isUndef() && N0Shuf->isSplat())
+ return combineShuffleOfSplat(SVN->getMask(), N0Shuf, DAG);
+
// If it is a splat, check if the argument vector is another splat or a
// build_vector.
if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) {
@@ -16381,17 +16367,17 @@ bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const {
UseAA = false;
#endif
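+ // AA may now be null (no alias analysis available), in which case we
+ // skip the query and conservatively assume the operations may alias.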
- if (UseAA &&
+ if (UseAA && AA &&
Op0->getMemOperand()->getValue() && Op1->getMemOperand()->getValue()) {
// Use alias analysis information.
int64_t MinOffset = std::min(SrcValOffset0, SrcValOffset1);
int64_t Overlap0 = NumBytes0 + SrcValOffset0 - MinOffset;
int64_t Overlap1 = NumBytes1 + SrcValOffset1 - MinOffset;
AliasResult AAResult =
- AA.alias(MemoryLocation(Op0->getMemOperand()->getValue(), Overlap0,
- UseTBAA ? Op0->getAAInfo() : AAMDNodes()),
- MemoryLocation(Op1->getMemOperand()->getValue(), Overlap1,
- UseTBAA ? Op1->getAAInfo() : AAMDNodes()));
+ AA->alias(MemoryLocation(Op0->getMemOperand()->getValue(), Overlap0,
+ UseTBAA ? Op0->getAAInfo() : AAMDNodes()),
+ MemoryLocation(Op1->getMemOperand()->getValue(), Overlap1,
+ UseTBAA ? Op1->getAAInfo() : AAMDNodes()) );
if (AAResult == NoAlias)
return false;
}
@@ -16605,7 +16591,7 @@ bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) {
}
/// This is the entry point for the file.
-void SelectionDAG::Combine(CombineLevel Level, AliasAnalysis &AA,
+void SelectionDAG::Combine(CombineLevel Level, AliasAnalysis *AA,
CodeGenOpt::Level OptLevel) {
/// This is the main entry point to this class.
DAGCombiner(*this, AA, OptLevel).Run(Level);
diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp
index 8c98e3740f6d..5003b79974eb 100644
--- a/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -622,7 +622,7 @@ bool FastISel::selectStackmap(const CallInst *I) {
// have to worry about calling conventions and target-specific lowering code.
// Instead we perform the call lowering right here.
//
- // CALLSEQ_START(0...)
+ // CALLSEQ_START(0, 0...)
// STACKMAP(id, nbytes, ...)
// CALLSEQ_END(0, 0)
//
@@ -1150,16 +1150,16 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
return true;
}
- unsigned Offset = 0;
+ // Byval arguments with frame indices were already handled after argument
+ // lowering and before isel.
+ const auto *Arg =
+ dyn_cast<Argument>(Address->stripInBoundsConstantOffsets());
+ if (Arg && FuncInfo.getArgumentFrameIndex(Arg) != INT_MAX)
+ return true;
+
Optional<MachineOperand> Op;
- if (const auto *Arg = dyn_cast<Argument>(Address))
- // Some arguments' frame index is recorded during argument lowering.
- Offset = FuncInfo.getArgumentFrameIndex(Arg);
- if (Offset)
- Op = MachineOperand::CreateFI(Offset);
- if (!Op)
- if (unsigned Reg = lookUpRegForValue(Address))
- Op = MachineOperand::CreateReg(Reg, false);
+ if (unsigned Reg = lookUpRegForValue(Address))
+ Op = MachineOperand::CreateReg(Reg, false);
// If we have a VLA that has a "use" in a metadata node that's then used
// here but it has no other uses, then we have a problem. E.g.,
diff --git a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
index cdf4d3a8b4e5..606b8952f3c1 100644
--- a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -85,7 +85,6 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf,
MF = &mf;
TLI = MF->getSubtarget().getTargetLowering();
RegInfo = &MF->getRegInfo();
- MachineModuleInfo &MMI = MF->getMMI();
const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
unsigned StackAlign = TFI->getStackAlignment();
@@ -214,33 +213,6 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf,
if (!isa<AllocaInst>(I) || !StaticAllocaMap.count(cast<AllocaInst>(&I)))
InitializeRegForValue(&I);
- // Collect llvm.dbg.declare information. This is done now instead of
- // during the initial isel pass through the IR so that it is done
- // in a predictable order.
- if (const DbgDeclareInst *DI = dyn_cast<DbgDeclareInst>(&I)) {
- assert(DI->getVariable() && "Missing variable");
- assert(DI->getDebugLoc() && "Missing location");
- if (MMI.hasDebugInfo()) {
- // Don't handle byval struct arguments or VLAs, for example.
- // Non-byval arguments are handled here (they refer to the stack
- // temporary alloca at this point).
- const Value *Address = DI->getAddress();
- if (Address) {
- if (const BitCastInst *BCI = dyn_cast<BitCastInst>(Address))
- Address = BCI->getOperand(0);
- if (const AllocaInst *AI = dyn_cast<AllocaInst>(Address)) {
- DenseMap<const AllocaInst *, int>::iterator SI =
- StaticAllocaMap.find(AI);
- if (SI != StaticAllocaMap.end()) { // Check for VLAs.
- int FI = SI->second;
- MF->setVariableDbgInfo(DI->getVariable(), DI->getExpression(),
- FI, DI->getDebugLoc());
- }
- }
- }
- }
- }
-
// Decide the preferred extend type for a value.
PreferredExtendType[&I] = getPreferredExtendForValue(&I);
}
@@ -510,12 +482,11 @@ void FunctionLoweringInfo::setArgumentFrameIndex(const Argument *A,
-/// If the argument does not have any assigned frame index then 0 is
-/// returned.
+/// If the argument does not have any assigned frame index then INT_MAX is
+/// returned.
int FunctionLoweringInfo::getArgumentFrameIndex(const Argument *A) {
- DenseMap<const Argument *, int>::iterator I =
- ByValArgFrameIndexMap.find(A);
+ auto I = ByValArgFrameIndexMap.find(A);
if (I != ByValArgFrameIndexMap.end())
return I->second;
DEBUG(dbgs() << "Argument does not have assigned frame index!\n");
- return 0;
+ return INT_MAX;
}
unsigned FunctionLoweringInfo::getCatchPadExceptionPointerVReg(
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 2654b3ad7a62..9a47a914df91 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1493,7 +1493,7 @@ void SelectionDAGLegalize::ExpandDYNAMIC_STACKALLOC(SDNode* Node,
// Chain the dynamic stack allocation so that it doesn't modify the stack
// pointer when other instructions are using the stack.
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl);
+ Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
SDValue Size = Tmp2.getOperand(1);
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
@@ -4187,6 +4187,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
ReplacedNode(Node);
break;
}
+ case ISD::MUL:
case ISD::SDIV:
case ISD::SREM:
case ISD::UDIV:
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index cde4331cc42d..4c3b514856b7 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -675,6 +675,7 @@ private:
// Vector Operand Splitting: <128 x ty> -> 2 x <64 x ty>.
bool SplitVectorOperand(SDNode *N, unsigned OpNo);
SDValue SplitVecOp_VSELECT(SDNode *N, unsigned OpNo);
+ SDValue SplitVecOp_VECREDUCE(SDNode *N, unsigned OpNo);
SDValue SplitVecOp_UnaryOp(SDNode *N);
SDValue SplitVecOp_TruncateHelper(SDNode *N);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 97a7fab6efd0..ff0e609803d8 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1513,6 +1513,22 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
case ISD::ZERO_EXTEND_VECTOR_INREG:
Res = SplitVecOp_ExtVecInRegOp(N);
break;
+
+ case ISD::VECREDUCE_FADD:
+ case ISD::VECREDUCE_FMUL:
+ case ISD::VECREDUCE_ADD:
+ case ISD::VECREDUCE_MUL:
+ case ISD::VECREDUCE_AND:
+ case ISD::VECREDUCE_OR:
+ case ISD::VECREDUCE_XOR:
+ case ISD::VECREDUCE_SMAX:
+ case ISD::VECREDUCE_SMIN:
+ case ISD::VECREDUCE_UMAX:
+ case ISD::VECREDUCE_UMIN:
+ case ISD::VECREDUCE_FMAX:
+ case ISD::VECREDUCE_FMIN:
+ Res = SplitVecOp_VECREDUCE(N, OpNo);
+ break;
}
}
@@ -1565,6 +1581,48 @@ SDValue DAGTypeLegalizer::SplitVecOp_VSELECT(SDNode *N, unsigned OpNo) {
return DAG.getNode(ISD::CONCAT_VECTORS, DL, Src0VT, LoSelect, HiSelect);
}
+SDValue DAGTypeLegalizer::SplitVecOp_VECREDUCE(SDNode *N, unsigned OpNo) {
+ EVT ResVT = N->getValueType(0);
+ SDValue Lo, Hi;
+ SDLoc dl(N);
+
+ SDValue VecOp = N->getOperand(OpNo);
+ EVT VecVT = VecOp.getValueType();
+ assert(VecVT.isVector() && "Can only split reduce vector operand");
+ GetSplitVector(VecOp, Lo, Hi);
+ EVT LoOpVT, HiOpVT;
+ std::tie(LoOpVT, HiOpVT) = DAG.GetSplitDestVTs(VecVT);
+
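+ // Map the reduction opcode to the binary opcode used to combine the two
+ // halves; FMIN/FMAX keep NaN-propagating semantics unless the nonans
+ // flag is present.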
+ bool NoNaN = N->getFlags().hasNoNaNs();
+ unsigned CombineOpc = 0;
+ switch (N->getOpcode()) {
+ case ISD::VECREDUCE_FADD: CombineOpc = ISD::FADD; break;
+ case ISD::VECREDUCE_FMUL: CombineOpc = ISD::FMUL; break;
+ case ISD::VECREDUCE_ADD: CombineOpc = ISD::ADD; break;
+ case ISD::VECREDUCE_MUL: CombineOpc = ISD::MUL; break;
+ case ISD::VECREDUCE_AND: CombineOpc = ISD::AND; break;
+ case ISD::VECREDUCE_OR: CombineOpc = ISD::OR; break;
+ case ISD::VECREDUCE_XOR: CombineOpc = ISD::XOR; break;
+ case ISD::VECREDUCE_SMAX: CombineOpc = ISD::SMAX; break;
+ case ISD::VECREDUCE_SMIN: CombineOpc = ISD::SMIN; break;
+ case ISD::VECREDUCE_UMAX: CombineOpc = ISD::UMAX; break;
+ case ISD::VECREDUCE_UMIN: CombineOpc = ISD::UMIN; break;
+ case ISD::VECREDUCE_FMAX:
+ CombineOpc = NoNaN ? ISD::FMAXNUM : ISD::FMAXNAN;
+ break;
+ case ISD::VECREDUCE_FMIN:
+ CombineOpc = NoNaN ? ISD::FMINNUM : ISD::FMINNAN;
+ break;
+ default:
+ llvm_unreachable("Unexpected reduce ISD node");
+ }
+
+ // Combine the two split halves with the corresponding vector binary
+ // operation, then reduce the resulting half-width vector.
+ SDValue Partial = DAG.getNode(CombineOpc, dl, LoOpVT, Lo, Hi);
+ return DAG.getNode(N->getOpcode(), dl, ResVT, Partial);
+}
+
SDValue DAGTypeLegalizer::SplitVecOp_UnaryOp(SDNode *N) {
// The result has a legal vector type, but the input needs splitting.
EVT ResVT = N->getValueType(0);
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index d605a1dc1c20..057badcd6b74 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2217,10 +2217,10 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
// Also compute a conservative estimate for high known-0 bits.
// More trickiness is possible, but this is sufficient for the
// interesting case of alignment computation.
- unsigned TrailZ = Known.Zero.countTrailingOnes() +
- Known2.Zero.countTrailingOnes();
- unsigned LeadZ = std::max(Known.Zero.countLeadingOnes() +
- Known2.Zero.countLeadingOnes(),
+ unsigned TrailZ = Known.countMinTrailingZeros() +
+ Known2.countMinTrailingZeros();
+ unsigned LeadZ = std::max(Known.countMinLeadingZeros() +
+ Known2.countMinLeadingZeros(),
BitWidth) - BitWidth;
Known.resetAll();
@@ -2233,13 +2233,12 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
// treat a udiv as a logical right shift by the power of 2 known to
// be less than the denominator.
computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
- unsigned LeadZ = Known2.Zero.countLeadingOnes();
+ unsigned LeadZ = Known2.countMinLeadingZeros();
computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1);
- unsigned RHSUnknownLeadingOnes = Known2.One.countLeadingZeros();
- if (RHSUnknownLeadingOnes != BitWidth)
- LeadZ = std::min(BitWidth,
- LeadZ + BitWidth - RHSUnknownLeadingOnes - 1);
+ unsigned RHSMaxLeadingZeros = Known2.countMaxLeadingZeros();
+ if (RHSMaxLeadingZeros != BitWidth)
+ LeadZ = std::min(BitWidth, LeadZ + BitWidth - RHSMaxLeadingZeros - 1);
Known.Zero.setHighBits(LeadZ);
break;
@@ -2359,7 +2358,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
case ISD::CTTZ_ZERO_UNDEF: {
computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
// If we have a known 1, its position is our upper bound.
- unsigned PossibleTZ = Known2.One.countTrailingZeros();
+ unsigned PossibleTZ = Known2.countMaxTrailingZeros();
unsigned LowBits = Log2_32(PossibleTZ) + 1;
Known.Zero.setBitsFrom(LowBits);
break;
@@ -2368,7 +2367,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
case ISD::CTLZ_ZERO_UNDEF: {
computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
// If we have a known 1, its position is our upper bound.
- unsigned PossibleLZ = Known2.One.countLeadingZeros();
+ unsigned PossibleLZ = Known2.countMaxLeadingZeros();
unsigned LowBits = Log2_32(PossibleLZ) + 1;
Known.Zero.setBitsFrom(LowBits);
break;
@@ -2376,7 +2375,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
case ISD::CTPOP: {
computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
// If we know some of the bits are zero, they can't be one.
- unsigned PossibleOnes = BitWidth - Known2.Zero.countPopulation();
+ unsigned PossibleOnes = Known2.countMaxPopulation();
Known.Zero.setBitsFrom(Log2_32(PossibleOnes) + 1);
break;
}
@@ -2493,13 +2492,12 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
// going to be 0 in the result. Both addition and complement operations
// preserve the low zero bits.
computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
- unsigned KnownZeroLow = Known2.Zero.countTrailingOnes();
+ unsigned KnownZeroLow = Known2.countMinTrailingZeros();
if (KnownZeroLow == 0)
break;
computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1);
- KnownZeroLow = std::min(KnownZeroLow,
- Known2.Zero.countTrailingOnes());
+ KnownZeroLow = std::min(KnownZeroLow, Known2.countMinTrailingZeros());
Known.Zero.setLowBits(KnownZeroLow);
break;
}
@@ -2526,15 +2524,13 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
// and the other has the top 8 bits clear, we know the top 7 bits of the
// output must be clear.
computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
- unsigned KnownZeroHigh = Known2.Zero.countLeadingOnes();
- unsigned KnownZeroLow = Known2.Zero.countTrailingOnes();
+ unsigned KnownZeroHigh = Known2.countMinLeadingZeros();
+ unsigned KnownZeroLow = Known2.countMinTrailingZeros();
computeKnownBits(Op.getOperand(1), Known2, DemandedElts,
Depth + 1);
- KnownZeroHigh = std::min(KnownZeroHigh,
- Known2.Zero.countLeadingOnes());
- KnownZeroLow = std::min(KnownZeroLow,
- Known2.Zero.countTrailingOnes());
+ KnownZeroHigh = std::min(KnownZeroHigh, Known2.countMinLeadingZeros());
+ KnownZeroLow = std::min(KnownZeroLow, Known2.countMinTrailingZeros());
if (Opcode == ISD::ADDE || Opcode == ISD::ADDCARRY) {
// With ADDE and ADDCARRY, a carry bit may be added in, so we can only
@@ -2594,8 +2590,8 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1);
- uint32_t Leaders = std::max(Known.Zero.countLeadingOnes(),
- Known2.Zero.countLeadingOnes());
+ uint32_t Leaders =
+ std::max(Known.countMinLeadingZeros(), Known2.countMinLeadingZeros());
Known.resetAll();
Known.Zero.setHighBits(Leaders);
break;
@@ -2711,8 +2707,8 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
// UMIN - we know that the result will have the maximum of the
// known zero leading bits of the inputs.
- unsigned LeadZero = Known.Zero.countLeadingOnes();
- LeadZero = std::max(LeadZero, Known2.Zero.countLeadingOnes());
+ unsigned LeadZero = Known.countMinLeadingZeros();
+ LeadZero = std::max(LeadZero, Known2.countMinLeadingZeros());
Known.Zero &= Known2.Zero;
Known.One &= Known2.One;
@@ -2726,8 +2722,8 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
// UMAX - we know that the result will have the maximum of the
// known one leading bits of the inputs.
- unsigned LeadOne = Known.One.countLeadingOnes();
- LeadOne = std::max(LeadOne, Known2.One.countLeadingOnes());
+ unsigned LeadOne = Known.countMinLeadingOnes();
+ LeadOne = std::max(LeadOne, Known2.countMinLeadingOnes());
Known.Zero &= Known2.Zero;
Known.One &= Known2.One;
@@ -2843,8 +2839,7 @@ bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const {
// Fall back to computeKnownBits to catch other known cases.
KnownBits Known;
computeKnownBits(Val, Known);
- return (Known.Zero.countPopulation() == BitWidth - 1) &&
- (Known.One.countPopulation() == 1);
+ return (Known.countMaxPopulation() == 1) && (Known.countMinPopulation() == 1);
}
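
The rewritten power-of-two test reads directly off the same helpers: countMaxPopulation() == 1 says at most one bit can be set, countMinPopulation() == 1 says at least one bit is set, so together they prove exactly one set bit. A small sketch showing the old and new forms agree, in plain C++ rather than the LLVM types:

#include <cassert>
#include <cstdint>

static unsigned popcount(uint32_t X) {
  unsigned N = 0;
  for (; X; X &= X - 1) // clear the lowest set bit per iteration
    ++N;
  return N;
}

int main() {
  const unsigned BitWidth = 32;
  uint32_t Zero = ~0x8u; // every bit except bit 3 is known zero
  uint32_t One = 0x8u;   // bit 3 is known one
  unsigned MaxPop = BitWidth - popcount(Zero); // countMaxPopulation()
  unsigned MinPop = popcount(One);             // countMinPopulation()
  bool OldTest = popcount(Zero) == BitWidth - 1 && popcount(One) == 1;
  bool NewTest = MaxPop == 1 && MinPop == 1;
  assert(OldTest && NewTest); // both forms prove the value is exactly 8
}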
unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
@@ -2860,6 +2855,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
EVT VT = Op.getValueType();
assert(VT.isInteger() && "Invalid VT!");
unsigned VTBits = VT.getScalarSizeInBits();
+ unsigned NumElts = DemandedElts.getBitWidth();
unsigned Tmp, Tmp2;
unsigned FirstAnswer = 1;
@@ -2903,6 +2899,39 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
}
return Tmp;
+ case ISD::VECTOR_SHUFFLE: {
+ // Collect the minimum number of sign bits that are shared by every vector
+ // element referenced by the shuffle.
+ APInt DemandedLHS(NumElts, 0), DemandedRHS(NumElts, 0);
+ const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
+ assert(NumElts == SVN->getMask().size() && "Unexpected vector size");
+ for (unsigned i = 0; i != NumElts; ++i) {
+ int M = SVN->getMaskElt(i);
+ if (!DemandedElts[i])
+ continue;
+ // For UNDEF elements, we don't know anything about the common state of
+ // the shuffle result.
+ if (M < 0)
+ return 1;
+ if ((unsigned)M < NumElts)
+ DemandedLHS.setBit((unsigned)M % NumElts);
+ else
+ DemandedRHS.setBit((unsigned)M % NumElts);
+ }
+ Tmp = UINT_MAX;
+ if (!!DemandedLHS)
+ Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
+ if (!!DemandedRHS) {
+ Tmp2 = ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
+ Tmp = std::min(Tmp, Tmp2);
+ }
+ // If we don't know anything, early out and try computeKnownBits fall-back.
+ if (Tmp == 1)
+ break;
+ assert(Tmp <= VTBits && "Failed to determine minimum sign bits");
+ return Tmp;
+ }
+
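
In the new VECTOR_SHUFFLE case, mask entries below NumElts select lanes of operand 0 and the rest select lanes of operand 1, so the demanded-elements mask for the result is translated into one demanded mask per input. A small sketch of that split with plain bitmasks standing in for APInt; the mask values are made up for illustration:

#include <cassert>
#include <cstdint>

int main() {
  const unsigned NumElts = 4;
  // shufflevector mask: result[i] = M[i] < 4 ? LHS[M[i]] : RHS[M[i] - 4]
  int Mask[NumElts] = {0, 5, 2, 7};
  uint32_t DemandedElts = 0b1011; // the caller wants result lanes 0, 1, 3
  uint32_t DemandedLHS = 0, DemandedRHS = 0;
  for (unsigned i = 0; i != NumElts; ++i) {
    if (!(DemandedElts & (1u << i)))
      continue; // lane 2 is never consulted
    unsigned M = unsigned(Mask[i]);
    if (M < NumElts)
      DemandedLHS |= 1u << (M % NumElts);
    else
      DemandedRHS |= 1u << (M % NumElts);
  }
  assert(DemandedLHS == 0b0001); // only LHS lane 0 feeds a demanded lane
  assert(DemandedRHS == 0b1010); // RHS lanes 1 and 3 feed lanes 1 and 3
}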
case ISD::SIGN_EXTEND:
case ISD::SIGN_EXTEND_VECTOR_INREG:
Tmp = VTBits - Op.getOperand(0).getScalarValueSizeInBits();
@@ -3142,14 +3171,36 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
return ComputeNumSignBits(InVec, DemandedSrcElts, Depth + 1);
}
- case ISD::EXTRACT_SUBVECTOR:
- return ComputeNumSignBits(Op.getOperand(0), Depth + 1);
+ case ISD::EXTRACT_SUBVECTOR: {
+ // If we know the element index, just demand those subvector elements;
+ // otherwise demand them all.
+ SDValue Src = Op.getOperand(0);
+ ConstantSDNode *SubIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
+ if (SubIdx && SubIdx->getAPIntValue().ule(NumSrcElts - NumElts)) {
+ // Offset the demanded elts by the subvector index.
+ uint64_t Idx = SubIdx->getZExtValue();
+ APInt DemandedSrc = DemandedElts.zext(NumSrcElts).shl(Idx);
+ return ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
+ }
+ return ComputeNumSignBits(Src, Depth + 1);
+ }
case ISD::CONCAT_VECTORS:
- // Determine the minimum number of sign bits across all input vectors.
- // Early out if the result is already 1.
- Tmp = ComputeNumSignBits(Op.getOperand(0), Depth + 1);
- for (unsigned i = 1, e = Op.getNumOperands(); (i < e) && (Tmp > 1); ++i)
- Tmp = std::min(Tmp, ComputeNumSignBits(Op.getOperand(i), Depth + 1));
+ // Determine the minimum number of sign bits across all demanded
+ // elts of the input vectors. Early out if the result is already 1.
+ Tmp = UINT_MAX;
+ EVT SubVectorVT = Op.getOperand(0).getValueType();
+ unsigned NumSubVectorElts = SubVectorVT.getVectorNumElements();
+ unsigned NumSubVectors = Op.getNumOperands();
+ for (unsigned i = 0; (i < NumSubVectors) && (Tmp > 1); ++i) {
+ APInt DemandedSub = DemandedElts.lshr(i * NumSubVectorElts);
+ DemandedSub = DemandedSub.trunc(NumSubVectorElts);
+ if (!DemandedSub)
+ continue;
+ Tmp2 = ComputeNumSignBits(Op.getOperand(i), DemandedSub, Depth + 1);
+ Tmp = std::min(Tmp, Tmp2);
+ }
+ assert(Tmp <= VTBits && "Failed to determine minimum sign bits");
return Tmp;
}
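
Both vector cases above do the same demanded-elements bookkeeping in opposite directions: EXTRACT_SUBVECTOR widens the mask and shifts it up by the start index, while CONCAT_VECTORS shifts the mask down and truncates it once per operand. A plain-bitmask sketch of both mappings, with lane counts chosen for illustration:

#include <cassert>
#include <cstdint>

int main() {
  // extract_subvector: a 4-lane read starting at index 4 of an 8-lane source.
  uint8_t Demanded4 = 0b0110; // result lanes 1 and 2 are demanded
  unsigned Idx = 4;
  uint8_t DemandedSrc = uint8_t(Demanded4 << Idx); // zext(8) then shl(Idx)
  assert(DemandedSrc == 0b01100000); // source lanes 5 and 6

  // concat_vectors: an 8-lane result built from two 4-lane operands.
  uint8_t Demanded8 = 0b00110001;
  uint8_t DemandedSub0 = Demanded8 & 0xF;        // lshr(0) then trunc(4)
  uint8_t DemandedSub1 = (Demanded8 >> 4) & 0xF; // lshr(4) then trunc(4)
  assert(DemandedSub0 == 0b0001); // operand 0: only lane 0 matters
  assert(DemandedSub1 == 0b0011); // operand 1: lanes 0 and 1 matter
}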
@@ -3543,7 +3594,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
assert(Operand.getValueType().bitsLT(VT) &&
"Invalid sext node, dst < src!");
if (OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ZERO_EXTEND)
- return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0));
+ return getNode(OpOpcode, DL, VT, Operand.getOperand(0));
else if (OpOpcode == ISD::UNDEF)
// sext(undef) = 0, because the top bits will all be the same.
return getConstant(0, DL, VT);
@@ -3559,8 +3610,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
assert(Operand.getValueType().bitsLT(VT) &&
"Invalid zext node, dst < src!");
if (OpOpcode == ISD::ZERO_EXTEND) // (zext (zext x)) -> (zext x)
- return getNode(ISD::ZERO_EXTEND, DL, VT,
- Operand.getNode()->getOperand(0));
+ return getNode(ISD::ZERO_EXTEND, DL, VT, Operand.getOperand(0));
else if (OpOpcode == ISD::UNDEF)
// zext(undef) = 0, because the top bits will be zero.
return getConstant(0, DL, VT);
@@ -3579,13 +3629,13 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND ||
OpOpcode == ISD::ANY_EXTEND)
// (ext (zext x)) -> (zext x) and (ext (sext x)) -> (sext x)
- return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0));
+ return getNode(OpOpcode, DL, VT, Operand.getOperand(0));
else if (OpOpcode == ISD::UNDEF)
return getUNDEF(VT);
// (ext (trunc x)) -> x
if (OpOpcode == ISD::TRUNCATE) {
- SDValue OpOp = Operand.getNode()->getOperand(0);
+ SDValue OpOp = Operand.getOperand(0);
if (OpOp.getValueType() == VT)
return OpOp;
}
@@ -3601,16 +3651,16 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
assert(Operand.getValueType().bitsGT(VT) &&
"Invalid truncate node, src < dst!");
if (OpOpcode == ISD::TRUNCATE)
- return getNode(ISD::TRUNCATE, DL, VT, Operand.getNode()->getOperand(0));
+ return getNode(ISD::TRUNCATE, DL, VT, Operand.getOperand(0));
if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND ||
OpOpcode == ISD::ANY_EXTEND) {
// If the source is smaller than the dest, we still need an extend.
- if (Operand.getNode()->getOperand(0).getValueType().getScalarType()
+ if (Operand.getOperand(0).getValueType().getScalarType()
.bitsLT(VT.getScalarType()))
- return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0));
- if (Operand.getNode()->getOperand(0).getValueType().bitsGT(VT))
- return getNode(ISD::TRUNCATE, DL, VT, Operand.getNode()->getOperand(0));
- return Operand.getNode()->getOperand(0);
+ return getNode(OpOpcode, DL, VT, Operand.getOperand(0));
+ if (Operand.getOperand(0).getValueType().bitsGT(VT))
+ return getNode(ISD::TRUNCATE, DL, VT, Operand.getOperand(0));
+ return Operand.getOperand(0);
}
if (OpOpcode == ISD::UNDEF)
return getUNDEF(VT);
@@ -3665,15 +3715,14 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
// -(X-Y) -> (Y-X) is unsafe because when X==Y, -0.0 != +0.0
if (getTarget().Options.UnsafeFPMath && OpOpcode == ISD::FSUB)
// FIXME: FNEG has no fast-math-flags to propagate; use the FSUB's flags?
- return getNode(ISD::FSUB, DL, VT, Operand.getNode()->getOperand(1),
- Operand.getNode()->getOperand(0),
- Operand.getNode()->getFlags());
+ return getNode(ISD::FSUB, DL, VT, Operand.getOperand(1),
+ Operand.getOperand(0), Operand.getNode()->getFlags());
if (OpOpcode == ISD::FNEG) // --X -> X
- return Operand.getNode()->getOperand(0);
+ return Operand.getOperand(0);
break;
case ISD::FABS:
if (OpOpcode == ISD::FNEG) // abs(-X) -> abs(X)
- return getNode(ISD::FABS, DL, VT, Operand.getNode()->getOperand(0));
+ return getNode(ISD::FABS, DL, VT, Operand.getOperand(0));
break;
}
@@ -5970,7 +6019,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
unsigned NumOps = Ops.size();
switch (NumOps) {
case 0: return getNode(Opcode, DL, VT);
- case 1: return getNode(Opcode, DL, VT, Ops[0]);
+ case 1: return getNode(Opcode, DL, VT, Ops[0], Flags);
case 2: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Flags);
case 3: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2]);
default: break;
@@ -7520,9 +7569,8 @@ unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const {
if (TLI->isGAPlusOffset(Ptr.getNode(), GV, GVOffset)) {
unsigned PtrWidth = getDataLayout().getPointerTypeSizeInBits(GV->getType());
KnownBits Known(PtrWidth);
- llvm::computeKnownBits(const_cast<GlobalValue *>(GV), Known,
- getDataLayout());
- unsigned AlignBits = Known.Zero.countTrailingOnes();
+ llvm::computeKnownBits(GV, Known, getDataLayout());
+ unsigned AlignBits = Known.countMinTrailingZeros();
unsigned Align = AlignBits ? 1 << std::min(31U, AlignBits) : 0;
if (Align)
return MinAlign(Align, GVOffset);
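
The surrounding code turns known-zero low bits into a byte alignment: AlignBits trailing zeros means the pointer is a multiple of 1 << AlignBits, clamped so the shift stays defined, and the result is then weakened by the global's offset via MinAlign. A sketch of the arithmetic, using the usual lowest-set-bit formulation as an assumption about what MinAlign computes:

#include <algorithm>
#include <cassert>

int main() {
  unsigned AlignBits = 4; // low four bits of the pointer are known zero
  unsigned Align = AlignBits ? 1u << std::min(31u, AlignBits) : 0;
  assert(Align == 16); // the pointer is at least 16-byte aligned

  // MinAlign(Align, GVOffset): adding an offset keeps only the common
  // low-zero bits, so a 16-byte aligned base plus offset 8 is 8-aligned.
  unsigned GVOffset = 8;
  unsigned Combined = (Align | GVOffset) & (~(Align | GVOffset) + 1);
  assert(Combined == 8);
}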
@@ -7621,7 +7669,7 @@ bool BuildVectorSDNode::isConstantSplat(APInt &SplatValue, APInt &SplatUndef,
return false;
// FIXME: The widths are based on this node's type, but build vectors can
- // truncate their operands.
+ // truncate their operands.
SplatValue = APInt(VecWidth, 0);
SplatUndef = APInt(VecWidth, 0);
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 50313e2da884..57d340c41c39 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -661,7 +661,7 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG,
unsigned RegSize = RegisterVT.getSizeInBits();
unsigned NumSignBits = LOI->NumSignBits;
- unsigned NumZeroBits = LOI->Known.Zero.countLeadingOnes();
+ unsigned NumZeroBits = LOI->Known.countMinLeadingZeros();
if (NumZeroBits == RegSize) {
// The current value is a zero.
@@ -811,9 +811,9 @@ void RegsForValue::AddInlineAsmOperands(unsigned Code, bool HasMatching,
}
}
-void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis &aa,
+void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis *aa,
const TargetLibraryInfo *li) {
- AA = &aa;
+ AA = aa;
GFI = gfi;
LibInfo = li;
DL = &DAG.getDataLayout();
@@ -3423,7 +3423,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
if (isVolatile || NumValues > MaxParallelChains)
// Serialize volatile loads with other side effects.
Root = getRoot();
- else if (AA->pointsToConstantMemory(MemoryLocation(
+ else if (AA && AA->pointsToConstantMemory(MemoryLocation(
SV, DAG.getDataLayout().getTypeStoreSize(Ty), AAInfo))) {
// Do not serialize (non-volatile) loads of constant memory with anything.
Root = DAG.getEntryNode();
@@ -3535,8 +3535,8 @@ void SelectionDAGBuilder::visitLoadFromSwiftError(const LoadInst &I) {
Type *Ty = I.getType();
AAMDNodes AAInfo;
I.getAAMetadata(AAInfo);
- assert(!AA->pointsToConstantMemory(MemoryLocation(
- SV, DAG.getDataLayout().getTypeStoreSize(Ty), AAInfo)) &&
+ assert((!AA || !AA->pointsToConstantMemory(MemoryLocation(
+ SV, DAG.getDataLayout().getTypeStoreSize(Ty), AAInfo))) &&
"load_from_swift_error should not be constant memory");
SmallVector<EVT, 4> ValueVTs;
@@ -3817,7 +3817,7 @@ void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I, bool IsExpanding) {
const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range);
// Do not serialize masked loads of constant memory with anything.
- bool AddToChain = !AA->pointsToConstantMemory(MemoryLocation(
+ bool AddToChain = !AA || !AA->pointsToConstantMemory(MemoryLocation(
PtrOperand, DAG.getDataLayout().getTypeStoreSize(I.getType()), AAInfo));
SDValue InChain = AddToChain ? DAG.getRoot() : DAG.getEntryNode();
@@ -3861,7 +3861,7 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) {
bool UniformBase = getUniformBase(BasePtr, Base, Index, this);
bool ConstantMemory = false;
if (UniformBase &&
- AA->pointsToConstantMemory(MemoryLocation(
+ AA && AA->pointsToConstantMemory(MemoryLocation(
BasePtr, DAG.getDataLayout().getTypeStoreSize(I.getType()),
AAInfo))) {
// Do not serialize (non-volatile) loads of constant memory with anything.
@@ -4676,7 +4676,8 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue(
bool IsIndirect = false;
Optional<MachineOperand> Op;
// Some arguments' frame index is recorded during argument lowering.
- if (int FI = FuncInfo.getArgumentFrameIndex(Arg))
+ int FI = FuncInfo.getArgumentFrameIndex(Arg);
+ if (FI != INT_MAX)
Op = MachineOperand::CreateFI(FI);
if (!Op && N.getNode()) {
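
The EmitFuncArgumentDbgValue change fixes a truthiness bug: `if (int FI = ...)` treats frame index 0, which is perfectly valid, the same as "no frame index", so the code now reserves INT_MAX as an explicit not-found sentinel. A self-contained sketch of the bug; the lookup map and helper here are hypothetical:

#include <cassert>
#include <climits>
#include <map>

// Hypothetical lookup: argument 7 is stored at frame index 0.
static std::map<int, int> ArgFrameIndex = {{7, 0}};

static int getArgumentFrameIndexLike(int Arg) {
  auto It = ArgFrameIndex.find(Arg);
  return It == ArgFrameIndex.end() ? INT_MAX : It->second;
}

int main() {
  int FI = getArgumentFrameIndexLike(7);
  bool OldPatternFound = FI != 0;       // `if (int FI = ...)` truthiness
  bool NewPatternFound = FI != INT_MAX; // explicit sentinel comparison
  assert(!OldPatternFound && NewPatternFound); // old form loses FI #0
}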
@@ -4927,6 +4928,13 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
return nullptr;
}
+ // Byval arguments with frame indices were already handled after argument
+ // lowering and before isel.
+ const auto *Arg =
+ dyn_cast<Argument>(Address->stripInBoundsConstantOffsets());
+ if (Arg && FuncInfo.getArgumentFrameIndex(Arg) != INT_MAX)
+ return nullptr;
+
SDValue &N = NodeMap[Address];
if (!N.getNode() && isa<Argument>(Address))
// Check unused arguments map.
@@ -4957,20 +4965,6 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
// virtual register info from the FuncInfo.ValueMap.
if (!EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, 0, true,
N)) {
- // If variable is pinned by a alloca in dominating bb then
- // use StaticAllocaMap.
- if (const AllocaInst *AI = dyn_cast<AllocaInst>(Address)) {
- if (AI->getParent() != DI.getParent()) {
- DenseMap<const AllocaInst*, int>::iterator SI =
- FuncInfo.StaticAllocaMap.find(AI);
- if (SI != FuncInfo.StaticAllocaMap.end()) {
- SDV = DAG.getFrameIndexDbgValue(Variable, Expression, SI->second,
- 0, dl, SDNodeOrder);
- DAG.AddDbgValue(SDV, nullptr, false);
- return nullptr;
- }
- }
- }
DEBUG(dbgs() << "Dropping debug info for " << DI << "\n");
}
}
@@ -5651,7 +5645,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
int FI = FuncInfo.StaticAllocaMap[Slot];
MCSymbol *FrameAllocSym =
MF.getMMI().getContext().getOrCreateFrameAllocSymbol(
- GlobalValue::getRealLinkageName(MF.getName()), Idx);
+ GlobalValue::dropLLVMManglingEscape(MF.getName()), Idx);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, dl,
TII->get(TargetOpcode::LOCAL_ESCAPE))
.addSym(FrameAllocSym)
@@ -5672,7 +5666,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
unsigned IdxVal = unsigned(Idx->getLimitedValue(INT_MAX));
MCSymbol *FrameAllocSym =
MF.getMMI().getContext().getOrCreateFrameAllocSymbol(
- GlobalValue::getRealLinkageName(Fn->getName()), IdxVal);
+ GlobalValue::dropLLVMManglingEscape(Fn->getName()), IdxVal);
// Create a MCSymbol for the label to avoid any target lowering
// that would make this PC relative.
@@ -5737,6 +5731,24 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
case Intrinsic::experimental_deoptimize:
LowerDeoptimizeCall(&I);
return nullptr;
+
+ case Intrinsic::experimental_vector_reduce_fadd:
+ case Intrinsic::experimental_vector_reduce_fmul:
+ case Intrinsic::experimental_vector_reduce_add:
+ case Intrinsic::experimental_vector_reduce_mul:
+ case Intrinsic::experimental_vector_reduce_and:
+ case Intrinsic::experimental_vector_reduce_or:
+ case Intrinsic::experimental_vector_reduce_xor:
+ case Intrinsic::experimental_vector_reduce_smax:
+ case Intrinsic::experimental_vector_reduce_smin:
+ case Intrinsic::experimental_vector_reduce_umax:
+ case Intrinsic::experimental_vector_reduce_umin:
+ case Intrinsic::experimental_vector_reduce_fmax:
+ case Intrinsic::experimental_vector_reduce_fmin: {
+ visitVectorReduce(I, Intrinsic);
+ return nullptr;
+ }
+
}
}
@@ -5982,7 +5994,7 @@ static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT,
bool ConstantMemory = false;
// Do not serialize (non-volatile) loads of constant memory with anything.
- if (Builder.AA->pointsToConstantMemory(PtrVal)) {
+ if (Builder.AA && Builder.AA->pointsToConstantMemory(PtrVal)) {
Root = Builder.DAG.getEntryNode();
ConstantMemory = true;
} else {
@@ -7422,11 +7434,11 @@ void SelectionDAGBuilder::visitStackmap(const CallInst &CI) {
// have to worry about calling conventions and target specific lowering code.
// Instead we perform the call lowering right here.
//
- // chain, flag = CALLSEQ_START(chain, 0)
+ // chain, flag = CALLSEQ_START(chain, 0, 0)
// chain, flag = STACKMAP(id, nbytes, ..., chain, flag)
// chain, flag = CALLSEQ_END(chain, 0, 0, flag)
//
- Chain = DAG.getCALLSEQ_START(getRoot(), NullPtr, DL);
+ Chain = DAG.getCALLSEQ_START(getRoot(), 0, 0, DL);
InFlag = Chain.getValue(1);
// Add the <id> and <numBytes> constants.
@@ -7616,6 +7628,76 @@ void SelectionDAGBuilder::visitPatchpoint(ImmutableCallSite CS,
FuncInfo.MF->getFrameInfo().setHasPatchPoint();
}
+void SelectionDAGBuilder::visitVectorReduce(const CallInst &I,
+ unsigned Intrinsic) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Op1 = getValue(I.getArgOperand(0));
+ SDValue Op2;
+ if (I.getNumArgOperands() > 1)
+ Op2 = getValue(I.getArgOperand(1));
+ SDLoc dl = getCurSDLoc();
+ EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+ SDValue Res;
+ FastMathFlags FMF;
+ if (isa<FPMathOperator>(I))
+ FMF = I.getFastMathFlags();
+ SDNodeFlags SDFlags;
+ SDFlags.setNoNaNs(FMF.noNaNs());
+
+ switch (Intrinsic) {
+ case Intrinsic::experimental_vector_reduce_fadd:
+ if (FMF.unsafeAlgebra())
+ Res = DAG.getNode(ISD::VECREDUCE_FADD, dl, VT, Op2);
+ else
+ Res = DAG.getNode(ISD::VECREDUCE_STRICT_FADD, dl, VT, Op1, Op2);
+ break;
+ case Intrinsic::experimental_vector_reduce_fmul:
+ if (FMF.unsafeAlgebra())
+ Res = DAG.getNode(ISD::VECREDUCE_FMUL, dl, VT, Op2);
+ else
+ Res = DAG.getNode(ISD::VECREDUCE_STRICT_FMUL, dl, VT, Op1, Op2);
+ break;
+ case Intrinsic::experimental_vector_reduce_add:
+ Res = DAG.getNode(ISD::VECREDUCE_ADD, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_mul:
+ Res = DAG.getNode(ISD::VECREDUCE_MUL, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_and:
+ Res = DAG.getNode(ISD::VECREDUCE_AND, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_or:
+ Res = DAG.getNode(ISD::VECREDUCE_OR, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_xor:
+ Res = DAG.getNode(ISD::VECREDUCE_XOR, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_smax:
+ Res = DAG.getNode(ISD::VECREDUCE_SMAX, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_smin:
+ Res = DAG.getNode(ISD::VECREDUCE_SMIN, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_umax:
+ Res = DAG.getNode(ISD::VECREDUCE_UMAX, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_umin:
+ Res = DAG.getNode(ISD::VECREDUCE_UMIN, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_fmax: {
+ Res = DAG.getNode(ISD::VECREDUCE_FMAX, dl, VT, Op1, SDFlags);
+ break;
+ }
+ case Intrinsic::experimental_vector_reduce_fmin: {
+ Res = DAG.getNode(ISD::VECREDUCE_FMIN, dl, VT, Op1, SDFlags);
+ break;
+ }
+ default:
+ llvm_unreachable("Unhandled vector reduce intrinsic");
+ }
+ setValue(&I, Res);
+}
+
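
visitVectorReduce maps each experimental reduction intrinsic onto a VECREDUCE_* node; for fadd/fmul the relaxed (unsafe-algebra) form takes only the vector operand, while the strict form threads the scalar accumulator through in order. Per the diffstat, targets without native reductions can have these expanded by the new ExpandReductions pass; a scalar sketch of the shuffle-halving idea behind such an expansion, for a power-of-two lane count:

#include <cassert>
#include <vector>

// Fold the upper half into the lower half log2(N) times, then read lane 0.
static int reduceAdd(std::vector<int> V) {
  for (size_t Half = V.size() / 2; Half != 0; Half /= 2)
    for (size_t i = 0; i != Half; ++i)
      V[i] += V[i + Half]; // one vector add per halving step
  return V[0];
}

int main() {
  assert(reduceAdd({1, 2, 3, 4, 5, 6, 7, 8}) == 36);
}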
/// Returns an AttributeList representing the attributes applied to the return
/// value of the given call.
static AttributeList getReturnAttrs(TargetLowering::CallLoweringInfo &CLI) {
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index 9e9989058ae5..bdaee858da61 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -604,11 +604,11 @@ public:
SelectionDAGBuilder(SelectionDAG &dag, FunctionLoweringInfo &funcinfo,
CodeGenOpt::Level ol)
: CurInst(nullptr), SDNodeOrder(LowestSDNodeOrder), TM(dag.getTarget()),
- DAG(dag), FuncInfo(funcinfo),
+ DAG(dag), DL(nullptr), AA(nullptr), FuncInfo(funcinfo),
HasTailCall(false) {
}
- void init(GCFunctionInfo *gfi, AliasAnalysis &aa,
+ void init(GCFunctionInfo *gfi, AliasAnalysis *AA,
const TargetLibraryInfo *li);
/// Clear out the current SelectionDAG and the associated state and prepare
@@ -909,6 +909,8 @@ private:
void visitGCRelocate(const GCRelocateInst &I);
void visitGCResult(const GCResultInst &I);
+ void visitVectorReduce(const CallInst &I, unsigned Intrinsic);
+
void visitUserOp1(const Instruction &I) {
llvm_unreachable("UserOp1 should not exist at instruction selection time!");
}
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 26dd45ef933f..c37d7080f2c5 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -346,6 +346,19 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::SETFALSE: return "setfalse";
case ISD::SETFALSE2: return "setfalse2";
}
+ case ISD::VECREDUCE_FADD: return "vecreduce_fadd";
+ case ISD::VECREDUCE_FMUL: return "vecreduce_fmul";
+ case ISD::VECREDUCE_ADD: return "vecreduce_add";
+ case ISD::VECREDUCE_MUL: return "vecreduce_mul";
+ case ISD::VECREDUCE_AND: return "vecreduce_and";
+ case ISD::VECREDUCE_OR: return "vecreduce_or";
+ case ISD::VECREDUCE_XOR: return "vecreduce_xor";
+ case ISD::VECREDUCE_SMAX: return "vecreduce_smax";
+ case ISD::VECREDUCE_SMIN: return "vecreduce_smin";
+ case ISD::VECREDUCE_UMAX: return "vecreduce_umax";
+ case ISD::VECREDUCE_UMIN: return "vecreduce_umin";
+ case ISD::VECREDUCE_FMAX: return "vecreduce_fmax";
+ case ISD::VECREDUCE_FMIN: return "vecreduce_fmin";
}
}
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 3aabdaeaa094..5e0feccb6b4c 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -38,6 +38,7 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachinePassRegistry.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -299,7 +300,7 @@ SelectionDAGISel::SelectionDAGISel(TargetMachine &tm,
FuncInfo(new FunctionLoweringInfo()),
CurDAG(new SelectionDAG(tm, OL)),
SDB(new SelectionDAGBuilder(*CurDAG, *FuncInfo, OL)),
- GFI(),
+ AA(), GFI(),
OptLevel(OL),
DAGSize(0) {
initializeGCModuleInfoPass(*PassRegistry::getPassRegistry());
@@ -317,7 +318,8 @@ SelectionDAGISel::~SelectionDAGISel() {
}
void SelectionDAGISel::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<AAResultsWrapperPass>();
+ if (OptLevel != CodeGenOpt::None)
+ AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<GCModuleInfo>();
AU.addRequired<StackProtector>();
AU.addPreserved<StackProtector>();
@@ -394,7 +396,6 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
TII = MF->getSubtarget().getInstrInfo();
TLI = MF->getSubtarget().getTargetLowering();
RegInfo = &MF->getRegInfo();
- AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
LibInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
GFI = Fn.hasGC() ? &getAnalysis<GCModuleInfo>().getFunctionInfo(Fn) : nullptr;
ORE = make_unique<OptimizationRemarkEmitter>(&Fn);
@@ -406,12 +407,22 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
CurDAG->init(*MF, *ORE);
FuncInfo->set(Fn, *MF, CurDAG);
+ // Now get the optional analyses if we want them.
+ // This is based on the possibly changed OptLevel (after optnone is taken
+ // into account). That's unfortunate but OK because it just means we won't
+ // ask for passes that have been required anyway.
+
if (UseMBPI && OptLevel != CodeGenOpt::None)
FuncInfo->BPI = &getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
else
FuncInfo->BPI = nullptr;
- SDB->init(GFI, *AA, LibInfo);
+ if (OptLevel != CodeGenOpt::None)
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ else
+ AA = nullptr;
+
+ SDB->init(GFI, AA, LibInfo);
MF->setHasInlineAsm(false);
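
From here on, AA is a possibly-null pointer rather than a reference: at -O0 the AAResultsWrapperPass is never requested, and every alias query in the builder degrades to the conservative answer through the `AA && AA->...` pattern seen above. A tiny sketch of that null-means-conservative convention; MiniAA is a made-up stand-in:

#include <cassert>

struct MiniAA {
  bool pointsToConstantMemory(const void *) const { return true; }
};

// Without analysis, assume the worst: the load must be chained.
static bool mustSerialize(const MiniAA *AA, const void *Ptr) {
  return !(AA && AA->pointsToConstantMemory(Ptr));
}

int main() {
  MiniAA AA;
  int X = 0;
  assert(!mustSerialize(&AA, &X));    // analysis proves constant memory
  assert(mustSerialize(nullptr, &X)); // no analysis: conservative answer
}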
@@ -715,7 +726,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
{
NamedRegionTimer T("combine1", "DAG Combining 1", GroupName,
GroupDescription, TimePassesIsEnabled);
- CurDAG->Combine(BeforeLegalizeTypes, *AA, OptLevel);
+ CurDAG->Combine(BeforeLegalizeTypes, AA, OptLevel);
}
DEBUG(dbgs() << "Optimized lowered selection DAG: BB#" << BlockNumber
@@ -747,7 +758,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
{
NamedRegionTimer T("combine_lt", "DAG Combining after legalize types",
GroupName, GroupDescription, TimePassesIsEnabled);
- CurDAG->Combine(AfterLegalizeTypes, *AA, OptLevel);
+ CurDAG->Combine(AfterLegalizeTypes, AA, OptLevel);
}
DEBUG(dbgs() << "Optimized type-legalized selection DAG: BB#" << BlockNumber
@@ -781,7 +792,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
{
NamedRegionTimer T("combine_lv", "DAG Combining after legalize vectors",
GroupName, GroupDescription, TimePassesIsEnabled);
- CurDAG->Combine(AfterLegalizeVectorOps, *AA, OptLevel);
+ CurDAG->Combine(AfterLegalizeVectorOps, AA, OptLevel);
}
DEBUG(dbgs() << "Optimized vector-legalized selection DAG: BB#"
@@ -807,7 +818,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
{
NamedRegionTimer T("combine2", "DAG Combining 2", GroupName,
GroupDescription, TimePassesIsEnabled);
- CurDAG->Combine(AfterLegalizeDAG, *AA, OptLevel);
+ CurDAG->Combine(AfterLegalizeDAG, AA, OptLevel);
}
DEBUG(dbgs() << "Optimized legalized selection DAG: BB#" << BlockNumber
@@ -1145,6 +1156,51 @@ static void createSwiftErrorEntriesInEntryBlock(FunctionLoweringInfo *FuncInfo,
}
}
+/// Collect llvm.dbg.declare information. This is done after argument lowering
+/// in case the declarations refer to arguments.
+static void processDbgDeclares(FunctionLoweringInfo *FuncInfo) {
+ MachineFunction *MF = FuncInfo->MF;
+ const DataLayout &DL = MF->getDataLayout();
+ for (const BasicBlock &BB : *FuncInfo->Fn) {
+ for (const Instruction &I : BB) {
+ const DbgDeclareInst *DI = dyn_cast<DbgDeclareInst>(&I);
+ if (!DI)
+ continue;
+
+ assert(DI->getVariable() && "Missing variable");
+ assert(DI->getDebugLoc() && "Missing location");
+ const Value *Address = DI->getAddress();
+ if (!Address)
+ continue;
+
+ // Look through casts and constant offset GEPs. These mostly come from
+ // inalloca.
+ APInt Offset(DL.getPointerSizeInBits(0), 0);
+ Address = Address->stripAndAccumulateInBoundsConstantOffsets(DL, Offset);
+
+ // Check if the variable is a static alloca or a byval or inalloca
+ // argument passed in memory. If it is not, then we will ignore this
+ // intrinsic and handle this during isel like dbg.value.
+ int FI = INT_MAX;
+ if (const auto *AI = dyn_cast<AllocaInst>(Address)) {
+ auto SI = FuncInfo->StaticAllocaMap.find(AI);
+ if (SI != FuncInfo->StaticAllocaMap.end())
+ FI = SI->second;
+ } else if (const auto *Arg = dyn_cast<Argument>(Address))
+ FI = FuncInfo->getArgumentFrameIndex(Arg);
+
+ if (FI == INT_MAX)
+ continue;
+
+ DIExpression *Expr = DI->getExpression();
+ if (Offset.getBoolValue())
+ Expr = DIExpression::prepend(Expr, DIExpression::NoDeref,
+ Offset.getZExtValue());
+ MF->setVariableDbgInfo(DI->getVariable(), Expr, FI, DI->getDebugLoc());
+ }
+ }
+}
+
/// Propagate swifterror values through the machine function CFG.
static void propagateSwiftErrorVRegs(FunctionLoweringInfo *FuncInfo) {
auto *TLI = FuncInfo->TLI;
@@ -1317,6 +1373,8 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
}
createSwiftErrorEntriesInEntryBlock(FuncInfo, FastIS, TLI, TII, SDB);
+ processDbgDeclares(FuncInfo);
+
// Iterate over all basic blocks in the function.
for (const BasicBlock *LLVMBB : RPOT) {
if (OptLevel != CodeGenOpt::None) {
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 23f597db140c..befbd80d7965 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -417,11 +417,10 @@ bool TargetLowering::ShrinkDemandedOp(SDValue Op, unsigned BitWidth,
if (TLI.isTruncateFree(Op.getValueType(), SmallVT) &&
TLI.isZExtFree(SmallVT, Op.getValueType())) {
// We found a type with free casts.
- SDValue X = DAG.getNode(Op.getOpcode(), dl, SmallVT,
- DAG.getNode(ISD::TRUNCATE, dl, SmallVT,
- Op.getNode()->getOperand(0)),
- DAG.getNode(ISD::TRUNCATE, dl, SmallVT,
- Op.getNode()->getOperand(1)));
+ SDValue X = DAG.getNode(
+ Op.getOpcode(), dl, SmallVT,
+ DAG.getNode(ISD::TRUNCATE, dl, SmallVT, Op.getOperand(0)),
+ DAG.getNode(ISD::TRUNCATE, dl, SmallVT, Op.getOperand(1)));
bool NeedZext = DemandedSize > SmallVTBits;
SDValue Z = DAG.getNode(NeedZext ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND,
dl, Op.getValueType(), X);
@@ -817,7 +816,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
// Convert (shl (anyext x, c)) to (anyext (shl x, c)) if the high bits
// are not demanded. This will likely allow the anyext to be folded away.
if (InOp.getNode()->getOpcode() == ISD::ANY_EXTEND) {
- SDValue InnerOp = InOp.getNode()->getOperand(0);
+ SDValue InnerOp = InOp.getOperand(0);
EVT InnerVT = InnerOp.getValueType();
unsigned InnerBits = InnerVT.getSizeInBits();
if (ShAmt < InnerBits && NewMask.getActiveBits() <= InnerBits &&
diff --git a/lib/CodeGen/ShrinkWrap.cpp b/lib/CodeGen/ShrinkWrap.cpp
index 4837495777da..2638702da152 100644
--- a/lib/CodeGen/ShrinkWrap.cpp
+++ b/lib/CodeGen/ShrinkWrap.cpp
@@ -282,8 +282,14 @@ void ShrinkWrap::updateSaveRestorePoints(MachineBasicBlock &MBB,
if (!Restore)
Restore = &MBB;
- else
+ else if (MPDT->getNode(&MBB)) // If the block is not in the post dom tree, it
+ // means the block never returns. If that's the
+ // case, we don't want to call
+ // `findNearestCommonDominator`, which will
+ // return `Restore`.
Restore = MPDT->findNearestCommonDominator(Restore, &MBB);
+ else
+ Restore = nullptr; // Abort, we can't find a restore point in this case.
// Make sure we would be able to insert the restore code before the
// terminator.
@@ -293,7 +299,7 @@ void ShrinkWrap::updateSaveRestorePoints(MachineBasicBlock &MBB,
continue;
// One of the terminators needs to happen before the restore point.
if (MBB.succ_empty()) {
- Restore = nullptr;
+ Restore = nullptr; // Abort, we can't find a restore point in this case.
break;
}
// Look for a restore point that post-dominates all the successors.
@@ -419,7 +425,7 @@ static bool isIrreducibleCFG(const MachineFunction &MF,
}
bool ShrinkWrap::runOnMachineFunction(MachineFunction &MF) {
- if (MF.empty() || !isShrinkWrapEnabled(MF))
+ if (skipFunction(*MF.getFunction()) || MF.empty() || !isShrinkWrapEnabled(MF))
return false;
DEBUG(dbgs() << "**** Analysing " << MF.getName() << '\n');
diff --git a/lib/CodeGen/SjLjEHPrepare.cpp b/lib/CodeGen/SjLjEHPrepare.cpp
index ab578df4069d..e9eff4d0acb2 100644
--- a/lib/CodeGen/SjLjEHPrepare.cpp
+++ b/lib/CodeGen/SjLjEHPrepare.cpp
@@ -93,8 +93,8 @@ bool SjLjEHPrepare::doInitialization(Module &M) {
doubleUnderDataTy, // __data
VoidPtrTy, // __personality
VoidPtrTy, // __lsda
- doubleUnderJBufTy, // __jbuf
- nullptr);
+ doubleUnderJBufTy // __jbuf
+ );
return true;
}
diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index 34892680aceb..1d232c71d824 100644
--- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -232,7 +232,11 @@ static const MCSymbolELF *getAssociatedSymbol(const GlobalObject *GO,
if (!MD)
return nullptr;
- auto *VM = dyn_cast<ValueAsMetadata>(MD->getOperand(0));
+ const MDOperand &Op = MD->getOperand(0);
+ if (!Op.get())
+ return nullptr;
+
+ auto *VM = dyn_cast<ValueAsMetadata>(Op);
if (!VM)
report_fatal_error("MD_associated operand is not ValueAsMetadata");
diff --git a/lib/CodeGen/TargetPassConfig.cpp b/lib/CodeGen/TargetPassConfig.cpp
index 150195f5f85b..e6c5d8753b83 100644
--- a/lib/CodeGen/TargetPassConfig.cpp
+++ b/lib/CodeGen/TargetPassConfig.cpp
@@ -487,6 +487,14 @@ void TargetPassConfig::addIRPasses() {
// Insert calls to mcount-like functions.
addPass(createCountingFunctionInserterPass());
+
+ // Add a pass to scalarize masked memory intrinsics that the target does
+ // not support natively. Each unsupported intrinsic is replaced with a
+ // chain of basic blocks that load/store one element at a time when the
+ // corresponding mask bit is set.
+ addPass(createScalarizeMaskedMemIntrinPass());
+
+ // Expand reduction intrinsics into shuffle sequences if the target wants to.
+ addPass(createExpandReductionsPass());
}
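
The new ScalarizeMaskedMemIntrin pass lowers a masked load or store the target cannot handle into a chain of basic blocks, one guarded scalar access per lane. A scalar C++ sketch of the control flow it emits for a masked load; the lane count and types are illustrative:

#include <array>
#include <cassert>
#include <cstddef>

template <std::size_t N>
std::array<int, N> maskedLoad(const int *Ptr, const std::array<bool, N> &Mask,
                              const std::array<int, N> &Passthru) {
  std::array<int, N> Result = Passthru;
  for (std::size_t i = 0; i != N; ++i)
    if (Mask[i])          // becomes a conditional branch per lane
      Result[i] = Ptr[i]; // the guarded scalar load
  return Result;
}

int main() {
  int Mem[4] = {10, 20, 30, 40};
  auto R = maskedLoad<4>(Mem, {true, false, true, false}, {0, -1, 0, -1});
  assert(R[0] == 10 && R[1] == -1 && R[2] == 30 && R[3] == -1);
}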
/// Turn exception handling constructs into something the code generators can
@@ -607,6 +615,9 @@ void TargetPassConfig::addMachinePasses() {
addPass(&LocalStackSlotAllocationID, false);
}
+ if (getOptLevel() != CodeGenOpt::None)
+ addPass(&LiveRangeShrinkID);
+
// Run pre-ra passes.
addPreRegAlloc();
diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp
index 75359fe3c0ea..7392c8327148 100644
--- a/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -155,7 +155,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
- AU.addRequired<AAResultsWrapperPass>();
+ AU.addUsedIfAvailable<AAResultsWrapperPass>();
AU.addUsedIfAvailable<LiveVariables>();
AU.addPreserved<LiveVariables>();
AU.addPreserved<SlotIndexes>();
@@ -1627,7 +1627,10 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) {
InstrItins = MF->getSubtarget().getInstrItineraryData();
LV = getAnalysisIfAvailable<LiveVariables>();
LIS = getAnalysisIfAvailable<LiveIntervals>();
- AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ if (auto *AAPass = getAnalysisIfAvailable<AAResultsWrapperPass>())
+ AA = &AAPass->getAAResults();
+ else
+ AA = nullptr;
OptLevel = TM.getOptLevel();
bool MadeChange = false;
diff --git a/lib/CodeGen/UnreachableBlockElim.cpp b/lib/CodeGen/UnreachableBlockElim.cpp
index f085132b6a94..407fd9b162e9 100644
--- a/lib/CodeGen/UnreachableBlockElim.cpp
+++ b/lib/CodeGen/UnreachableBlockElim.cpp
@@ -206,11 +206,12 @@ bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) {
if (InputReg != OutputReg) {
MachineRegisterInfo &MRI = F.getRegInfo();
unsigned InputSub = Input.getSubReg();
- if (InputSub == 0) {
- MRI.constrainRegClass(InputReg, MRI.getRegClass(OutputReg));
+ if (InputSub == 0 &&
+ MRI.constrainRegClass(InputReg, MRI.getRegClass(OutputReg))) {
MRI.replaceRegWith(OutputReg, InputReg);
} else {
- // The input register to the PHI has a subregister:
+ // The input register to the PHI has a subregister or it can't be
+ // constrained to the proper register class:
// insert a COPY instead of simply replacing the output
// with the input.
const TargetInstrInfo *TII = F.getSubtarget().getInstrInfo();
diff --git a/lib/DebugInfo/CodeView/CMakeLists.txt b/lib/DebugInfo/CodeView/CMakeLists.txt
index 410d5a3777d4..8d9353ae5f5e 100644
--- a/lib/DebugInfo/CodeView/CMakeLists.txt
+++ b/lib/DebugInfo/CodeView/CMakeLists.txt
@@ -13,7 +13,7 @@ add_llvm_library(LLVMDebugInfoCodeView
ModuleDebugFragmentVisitor.cpp
ModuleDebugInlineeLinesFragment.cpp
ModuleDebugLineFragment.cpp
- ModuleDebugUnknownFragment.cpp
+ RandomAccessTypeVisitor.cpp
RecordSerialization.cpp
StringTable.cpp
SymbolRecordMapping.cpp
diff --git a/lib/DebugInfo/CodeView/CVTypeVisitor.cpp b/lib/DebugInfo/CodeView/CVTypeVisitor.cpp
index 0069ee3cc904..b6ed0453d9c4 100644
--- a/lib/DebugInfo/CodeView/CVTypeVisitor.cpp
+++ b/lib/DebugInfo/CodeView/CVTypeVisitor.cpp
@@ -26,8 +26,7 @@ CVTypeVisitor::CVTypeVisitor(TypeVisitorCallbacks &Callbacks)
: Callbacks(Callbacks) {}
template <typename T>
-static Error visitKnownRecord(CVTypeVisitor &Visitor, CVType &Record,
- TypeVisitorCallbacks &Callbacks) {
+static Error visitKnownRecord(CVType &Record, TypeVisitorCallbacks &Callbacks) {
TypeRecordKind RK = static_cast<TypeRecordKind>(Record.Type);
T KnownRecord(RK);
if (auto EC = Callbacks.visitKnownRecord(Record, KnownRecord))
@@ -76,7 +75,7 @@ void CVTypeVisitor::addTypeServerHandler(TypeServerHandler &Handler) {
Handlers.push_back(&Handler);
}
-Error CVTypeVisitor::visitTypeRecord(CVType &Record) {
+Expected<bool> CVTypeVisitor::handleTypeServer(CVType &Record) {
if (Record.Type == TypeLeafKind::LF_TYPESERVER2 && !Handlers.empty()) {
auto TS = deserializeTypeServerRecord(Record);
if (!TS)
@@ -90,16 +89,16 @@ Error CVTypeVisitor::visitTypeRecord(CVType &Record) {
// If the handler processed the record, return success.
if (*ExpectedResult)
- return Error::success();
+ return true;
// Otherwise keep searching for a handler, eventually falling out and
// using the default record handler.
}
}
+ return false;
+}
- if (auto EC = Callbacks.visitTypeBegin(Record))
- return EC;
-
+Error CVTypeVisitor::finishVisitation(CVType &Record) {
switch (Record.Type) {
default:
if (auto EC = Callbacks.visitUnknownType(Record))
@@ -107,7 +106,7 @@ Error CVTypeVisitor::visitTypeRecord(CVType &Record) {
break;
#define TYPE_RECORD(EnumName, EnumVal, Name) \
case EnumName: { \
- if (auto EC = visitKnownRecord<Name##Record>(*this, Record, Callbacks)) \
+ if (auto EC = visitKnownRecord<Name##Record>(Record, Callbacks)) \
return EC; \
break; \
}
@@ -124,6 +123,32 @@ Error CVTypeVisitor::visitTypeRecord(CVType &Record) {
return Error::success();
}
+Error CVTypeVisitor::visitTypeRecord(CVType &Record, TypeIndex Index) {
+ auto ExpectedResult = handleTypeServer(Record);
+ if (!ExpectedResult)
+ return ExpectedResult.takeError();
+ if (*ExpectedResult)
+ return Error::success();
+
+ if (auto EC = Callbacks.visitTypeBegin(Record, Index))
+ return EC;
+
+ return finishVisitation(Record);
+}
+
+Error CVTypeVisitor::visitTypeRecord(CVType &Record) {
+ auto ExpectedResult = handleTypeServer(Record);
+ if (!ExpectedResult)
+ return ExpectedResult.takeError();
+ if (*ExpectedResult)
+ return Error::success();
+
+ if (auto EC = Callbacks.visitTypeBegin(Record))
+ return EC;
+
+ return finishVisitation(Record);
+}
+
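
This refactor splits the old monolithic visitTypeRecord into handleTypeServer, which returns Expected<bool> ("handled?" or an error), and finishVisitation, so the two entry points differ only in which visitTypeBegin overload they call. A minimal stand-in for that Expected<bool> calling convention; ExpectedBool is a toy, not llvm::Expected:

#include <optional>
#include <string>

// Success holds a bool, failure an error description; operator bool is
// true on success, matching llvm::Expected's convention.
struct ExpectedBool {
  std::optional<std::string> Err;
  bool Value = false;
  explicit operator bool() const { return !Err; }
};

static ExpectedBool handleTypeServerLike(bool RecordWasConsumed) {
  return ExpectedBool{std::nullopt, RecordWasConsumed};
}

int main() {
  ExpectedBool R = handleTypeServerLike(true);
  if (!R)
    return 1; // mirrors `return ExpectedResult.takeError();`
  if (R.Value)
    return 0; // mirrors `return Error::success();` -- already handled
  // Otherwise: visitTypeBegin(...) then finishVisitation(Record).
  return 0;
}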
static Error visitMemberRecord(CVMemberRecord &Record,
TypeVisitorCallbacks &Callbacks) {
if (auto EC = Callbacks.visitMemberBegin(Record))
diff --git a/lib/DebugInfo/CodeView/ModuleDebugUnknownFragment.cpp b/lib/DebugInfo/CodeView/ModuleDebugUnknownFragment.cpp
deleted file mode 100644
index 9fd2cb8ed3e8..000000000000
--- a/lib/DebugInfo/CodeView/ModuleDebugUnknownFragment.cpp
+++ /dev/null
@@ -1,10 +0,0 @@
-//===- ModuleDebugUnknownFragment.cpp ---------------------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/DebugInfo/CodeView/ModuleDebugUnknownFragment.h" \ No newline at end of file
diff --git a/lib/DebugInfo/CodeView/RandomAccessTypeVisitor.cpp b/lib/DebugInfo/CodeView/RandomAccessTypeVisitor.cpp
new file mode 100644
index 000000000000..4cb9acbe07d9
--- /dev/null
+++ b/lib/DebugInfo/CodeView/RandomAccessTypeVisitor.cpp
@@ -0,0 +1,91 @@
+//===- RandomAccessTypeVisitor.cpp ---------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/RandomAccessTypeVisitor.h"
+
+#include "llvm/DebugInfo/CodeView/TypeDatabase.h"
+#include "llvm/DebugInfo/CodeView/TypeServerHandler.h"
+#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+RandomAccessTypeVisitor::RandomAccessTypeVisitor(
+ const CVTypeArray &Types, uint32_t NumRecords,
+ PartialOffsetArray PartialOffsets)
+ : Database(NumRecords), Types(Types), DatabaseVisitor(Database),
+ InternalVisitor(Pipeline), PartialOffsets(PartialOffsets) {
+ Pipeline.addCallbackToPipeline(Deserializer);
+ Pipeline.addCallbackToPipeline(DatabaseVisitor);
+
+ KnownOffsets.resize(Database.capacity());
+}
+
+Error RandomAccessTypeVisitor::visitTypeIndex(TypeIndex TI,
+ TypeVisitorCallbacks &Callbacks) {
+ assert(TI.toArrayIndex() < Database.capacity());
+
+ if (!Database.contains(TI)) {
+ if (auto EC = visitRangeForType(TI))
+ return EC;
+ }
+
+ assert(Database.contains(TI));
+ auto &Record = Database.getTypeRecord(TI);
+ CVTypeVisitor V(Callbacks);
+ return V.visitTypeRecord(Record, TI);
+}
+
+Error RandomAccessTypeVisitor::visitRangeForType(TypeIndex TI) {
+ if (PartialOffsets.empty()) {
+ TypeIndex TIB(TypeIndex::FirstNonSimpleIndex);
+ TypeIndex TIE = TIB + Database.capacity();
+ return visitRange(TIB, 0, TIE);
+ }
+
+ auto Next = std::upper_bound(PartialOffsets.begin(), PartialOffsets.end(), TI,
+ [](TypeIndex Value, const TypeIndexOffset &IO) {
+ return Value < IO.Type;
+ });
+
+ assert(Next != PartialOffsets.begin());
+ auto Prev = std::prev(Next);
+
+ TypeIndex TIB = Prev->Type;
+ TypeIndex TIE;
+ if (Next == PartialOffsets.end()) {
+ TIE = TypeIndex::fromArrayIndex(Database.capacity());
+ } else {
+ TIE = Next->Type;
+ }
+
+ if (auto EC = visitRange(TIB, Prev->Offset, TIE))
+ return EC;
+ return Error::success();
+}
+
+Error RandomAccessTypeVisitor::visitRange(TypeIndex Begin, uint32_t BeginOffset,
+ TypeIndex End) {
+
+ auto RI = Types.at(BeginOffset);
+ assert(RI != Types.end());
+
+ while (Begin != End) {
+ assert(!Database.contains(Begin));
+ if (auto EC = InternalVisitor.visitTypeRecord(*RI, Begin))
+ return EC;
+ KnownOffsets[Begin.toArrayIndex()] = BeginOffset;
+
+ BeginOffset += RI.getRecordLength();
+ ++Begin;
+ ++RI;
+ }
+
+ return Error::success();
+}
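
visitRangeForType locates the smallest already-indexed range containing a type index by binary-searching the sparse (type index, stream offset) anchors: std::upper_bound finds the first anchor past the query, and its predecessor gives the offset to start deserializing from. A self-contained sketch with made-up anchor values:

#include <algorithm>
#include <cassert>
#include <vector>

struct TypeIndexOffset {
  unsigned Type;   // first type index covered by this anchor
  unsigned Offset; // byte offset of that record in the type stream
};

int main() {
  std::vector<TypeIndexOffset> PartialOffsets = {
      {0x1000, 0}, {0x1040, 512}, {0x1080, 1100}};
  unsigned TI = 0x1052; // the record we want to materialize
  auto Next = std::upper_bound(
      PartialOffsets.begin(), PartialOffsets.end(), TI,
      [](unsigned Value, const TypeIndexOffset &IO) { return Value < IO.Type; });
  auto Prev = std::prev(Next); // last anchor at or before TI
  assert(Prev->Type == 0x1040 && Prev->Offset == 512);
  // Visit records linearly from Prev->Offset until TI is materialized.
}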
diff --git a/lib/DebugInfo/CodeView/TypeDatabase.cpp b/lib/DebugInfo/CodeView/TypeDatabase.cpp
index 5b8841041f88..7924440e5e29 100644
--- a/lib/DebugInfo/CodeView/TypeDatabase.cpp
+++ b/lib/DebugInfo/CodeView/TypeDatabase.cpp
@@ -65,20 +65,32 @@ static const SimpleTypeEntry SimpleTypeNames[] = {
{"__bool64*", SimpleTypeKind::Boolean64},
};
-TypeDatabase::TypeDatabase(uint32_t ExpectedSize) : TypeNameStorage(Allocator) {
- CVUDTNames.reserve(ExpectedSize);
- TypeRecords.reserve(ExpectedSize);
+TypeDatabase::TypeDatabase(uint32_t Capacity) : TypeNameStorage(Allocator) {
+ CVUDTNames.resize(Capacity);
+ TypeRecords.resize(Capacity);
+ ValidRecords.resize(Capacity);
}
-/// Gets the type index for the next type record.
-TypeIndex TypeDatabase::getNextTypeIndex() const {
- return TypeIndex(TypeIndex::FirstNonSimpleIndex + CVUDTNames.size());
+TypeIndex TypeDatabase::appendType(StringRef Name, const CVType &Data) {
+ TypeIndex TI;
+ TI = getAppendIndex();
+ if (TI.toArrayIndex() >= capacity())
+ grow();
+ recordType(Name, TI, Data);
+ return TI;
}
-/// Records the name of a type, and reserves its type index.
-void TypeDatabase::recordType(StringRef Name, const CVType &Data) {
- CVUDTNames.push_back(Name);
- TypeRecords.push_back(Data);
+void TypeDatabase::recordType(StringRef Name, TypeIndex Index,
+ const CVType &Data) {
+ uint32_t AI = Index.toArrayIndex();
+
+ assert(!contains(Index));
+ assert(AI < capacity());
+
+ CVUDTNames[AI] = Name;
+ TypeRecords[AI] = Data;
+ ValidRecords.set(AI);
+ ++Count;
}
/// Saves the name in a StringSet and creates a stable StringRef.
@@ -104,24 +116,47 @@ StringRef TypeDatabase::getTypeName(TypeIndex Index) const {
return "<unknown simple type>";
}
- uint32_t I = Index.getIndex() - TypeIndex::FirstNonSimpleIndex;
- if (I < CVUDTNames.size())
- return CVUDTNames[I];
+ if (contains(Index))
+ return CVUDTNames[Index.toArrayIndex()];
return "<unknown UDT>";
}
const CVType &TypeDatabase::getTypeRecord(TypeIndex Index) const {
- return TypeRecords[Index.getIndex() - TypeIndex::FirstNonSimpleIndex];
+ assert(contains(Index));
+ return TypeRecords[Index.toArrayIndex()];
}
CVType &TypeDatabase::getTypeRecord(TypeIndex Index) {
- return TypeRecords[Index.getIndex() - TypeIndex::FirstNonSimpleIndex];
+ assert(contains(Index));
+ return TypeRecords[Index.toArrayIndex()];
+}
+
+bool TypeDatabase::contains(TypeIndex Index) const {
+ uint32_t AI = Index.toArrayIndex();
+ if (AI >= capacity())
+ return false;
+
+ return ValidRecords.test(AI);
}
-bool TypeDatabase::containsTypeIndex(TypeIndex Index) const {
- uint32_t I = Index.getIndex() - TypeIndex::FirstNonSimpleIndex;
- return I < CVUDTNames.size();
+uint32_t TypeDatabase::size() const { return Count; }
+
+uint32_t TypeDatabase::capacity() const { return TypeRecords.size(); }
+
+void TypeDatabase::grow() {
+ TypeRecords.emplace_back();
+ CVUDTNames.emplace_back();
+ ValidRecords.resize(ValidRecords.size() + 1);
}
-uint32_t TypeDatabase::size() const { return CVUDTNames.size(); }
+bool TypeDatabase::empty() const { return size() == 0; }
+
+TypeIndex TypeDatabase::getAppendIndex() const {
+ if (empty())
+ return TypeIndex::fromArrayIndex(0);
+
+ int Index = ValidRecords.find_last();
+ assert(Index != -1);
+ return TypeIndex::fromArrayIndex(Index) + 1;
+}
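
TypeDatabase is now sparse: records can land at arbitrary indices via the random-access visitor, ValidRecords tracks which slots are filled, and getAppendIndex places sequential appends one past the highest valid slot. A sketch of that find_last() + 1 logic over a plain bit vector:

#include <cassert>
#include <vector>

// find_last() over a plain bit vector: highest set index, or -1 if none.
static int findLast(const std::vector<bool> &Valid) {
  for (int i = int(Valid.size()) - 1; i >= 0; --i)
    if (Valid[i])
      return i;
  return -1;
}

int main() {
  std::vector<bool> ValidRecords(8, false);
  ValidRecords[0] = true; // an appended record
  ValidRecords[5] = true; // a record written out of order at index 5
  assert(findLast(ValidRecords) == 5);
  unsigned AppendIndex = unsigned(findLast(ValidRecords)) + 1;
  assert(AppendIndex == 6); // the next append goes one past the last slot
}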
diff --git a/lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp b/lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp
index c234afd2288b..8d97f8b1cb40 100644
--- a/lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp
+++ b/lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp
@@ -15,7 +15,7 @@ using namespace llvm;
using namespace llvm::codeview;
-Error TypeDatabaseVisitor::visitTypeBegin(CVRecord<TypeLeafKind> &Record) {
+Error TypeDatabaseVisitor::visitTypeBegin(CVType &Record) {
assert(!IsInFieldList);
// Reset Name to the empty string. If the visitor sets it, we know it.
Name = "";
@@ -28,6 +28,22 @@ Error TypeDatabaseVisitor::visitTypeBegin(CVRecord<TypeLeafKind> &Record) {
return Error::success();
}
+Error TypeDatabaseVisitor::visitTypeBegin(CVType &Record, TypeIndex Index) {
+ if (auto EC = visitTypeBegin(Record))
+ return EC;
+
+ CurrentTypeIndex = Index;
+ return Error::success();
+}
+
+StringRef TypeDatabaseVisitor::getTypeName(TypeIndex Index) const {
+ return TypeDB->getTypeName(Index);
+}
+
+StringRef TypeDatabaseVisitor::saveTypeName(StringRef Name) {
+ return TypeDB->saveTypeName(Name);
+}
+
Error TypeDatabaseVisitor::visitTypeEnd(CVType &CVR) {
if (CVR.Type == LF_FIELDLIST) {
assert(IsInFieldList);
@@ -39,7 +55,12 @@ Error TypeDatabaseVisitor::visitTypeEnd(CVType &CVR) {
// CVUDTNames is indexed by type index, and must have one entry for every
// type. Field list members are not recorded, and are only referenced by
// their containing field list record.
- TypeDB.recordType(Name, CVR);
+ if (CurrentTypeIndex)
+ TypeDB->recordType(Name, *CurrentTypeIndex, CVR);
+ else
+ TypeDB->appendType(Name, CVR);
+
+ CurrentTypeIndex.reset();
return Error::success();
}
@@ -73,13 +94,13 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, ArgListRecord &Args) {
uint32_t Size = Indices.size();
SmallString<256> TypeName("(");
for (uint32_t I = 0; I < Size; ++I) {
- StringRef ArgTypeName = TypeDB.getTypeName(Indices[I]);
+ StringRef ArgTypeName = getTypeName(Indices[I]);
TypeName.append(ArgTypeName);
if (I + 1 != Size)
TypeName.append(", ");
}
TypeName.push_back(')');
- Name = TypeDB.saveTypeName(TypeName);
+ Name = saveTypeName(TypeName);
return Error::success();
}
@@ -89,13 +110,13 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR,
uint32_t Size = Indices.size();
SmallString<256> TypeName("\"");
for (uint32_t I = 0; I < Size; ++I) {
- StringRef ArgTypeName = TypeDB.getTypeName(Indices[I]);
+ StringRef ArgTypeName = getTypeName(Indices[I]);
TypeName.append(ArgTypeName);
if (I + 1 != Size)
TypeName.append("\" \"");
}
TypeName.push_back('\"');
- Name = TypeDB.saveTypeName(TypeName);
+ Name = saveTypeName(TypeName);
return Error::success();
}
@@ -132,26 +153,26 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR,
Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR,
ProcedureRecord &Proc) {
- StringRef ReturnTypeName = TypeDB.getTypeName(Proc.getReturnType());
- StringRef ArgListTypeName = TypeDB.getTypeName(Proc.getArgumentList());
+ StringRef ReturnTypeName = getTypeName(Proc.getReturnType());
+ StringRef ArgListTypeName = getTypeName(Proc.getArgumentList());
SmallString<256> TypeName(ReturnTypeName);
TypeName.push_back(' ');
TypeName.append(ArgListTypeName);
- Name = TypeDB.saveTypeName(TypeName);
+ Name = saveTypeName(TypeName);
return Error::success();
}
Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR,
MemberFunctionRecord &MF) {
- StringRef ReturnTypeName = TypeDB.getTypeName(MF.getReturnType());
- StringRef ClassTypeName = TypeDB.getTypeName(MF.getClassType());
- StringRef ArgListTypeName = TypeDB.getTypeName(MF.getArgumentList());
+ StringRef ReturnTypeName = getTypeName(MF.getReturnType());
+ StringRef ClassTypeName = getTypeName(MF.getClassType());
+ StringRef ArgListTypeName = getTypeName(MF.getArgumentList());
SmallString<256> TypeName(ReturnTypeName);
TypeName.push_back(' ');
TypeName.append(ClassTypeName);
TypeName.append("::");
TypeName.append(ArgListTypeName);
- Name = TypeDB.saveTypeName(TypeName);
+ Name = saveTypeName(TypeName);
return Error::success();
}
@@ -171,13 +192,13 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, PointerRecord &Ptr) {
if (Ptr.isPointerToMember()) {
const MemberPointerInfo &MI = Ptr.getMemberInfo();
- StringRef PointeeName = TypeDB.getTypeName(Ptr.getReferentType());
- StringRef ClassName = TypeDB.getTypeName(MI.getContainingType());
+ StringRef PointeeName = getTypeName(Ptr.getReferentType());
+ StringRef ClassName = getTypeName(MI.getContainingType());
SmallString<256> TypeName(PointeeName);
TypeName.push_back(' ');
TypeName.append(ClassName);
TypeName.append("::*");
- Name = TypeDB.saveTypeName(TypeName);
+ Name = saveTypeName(TypeName);
} else {
SmallString<256> TypeName;
if (Ptr.isConst())
@@ -187,7 +208,7 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, PointerRecord &Ptr) {
if (Ptr.isUnaligned())
TypeName.append("__unaligned ");
- TypeName.append(TypeDB.getTypeName(Ptr.getReferentType()));
+ TypeName.append(getTypeName(Ptr.getReferentType()));
if (Ptr.getMode() == PointerMode::LValueReference)
TypeName.append("&");
@@ -197,7 +218,7 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, PointerRecord &Ptr) {
TypeName.append("*");
if (!TypeName.empty())
- Name = TypeDB.saveTypeName(TypeName);
+ Name = saveTypeName(TypeName);
}
return Error::success();
}
@@ -205,7 +226,7 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, PointerRecord &Ptr) {
Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, ModifierRecord &Mod) {
uint16_t Mods = static_cast<uint16_t>(Mod.getModifiers());
- StringRef ModifiedName = TypeDB.getTypeName(Mod.getModifiedType());
+ StringRef ModifiedName = getTypeName(Mod.getModifiedType());
SmallString<256> TypeName;
if (Mods & uint16_t(ModifierOptions::Const))
TypeName.append("const ");
@@ -214,14 +235,14 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, ModifierRecord &Mod) {
if (Mods & uint16_t(ModifierOptions::Unaligned))
TypeName.append("__unaligned ");
TypeName.append(ModifiedName);
- Name = TypeDB.saveTypeName(TypeName);
+ Name = saveTypeName(TypeName);
return Error::success();
}
Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR,
VFTableShapeRecord &Shape) {
- Name = TypeDB.saveTypeName("<vftable " + utostr(Shape.getEntryCount()) +
- " methods>");
+ Name =
+ saveTypeName("<vftable " + utostr(Shape.getEntryCount()) + " methods>");
return Error::success();
}
diff --git a/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp b/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp
index 870d95221e7d..27a6e0987886 100644
--- a/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp
+++ b/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp
@@ -173,10 +173,13 @@ void TypeDumpVisitor::printItemIndex(StringRef FieldName, TypeIndex TI) const {
}
Error TypeDumpVisitor::visitTypeBegin(CVType &Record) {
+ TypeIndex TI = getSourceDB().getAppendIndex();
+ return visitTypeBegin(Record, TI);
+}
+
+Error TypeDumpVisitor::visitTypeBegin(CVType &Record, TypeIndex Index) {
W->startLine() << getLeafTypeName(Record.Type);
- W->getOStream() << " ("
- << HexNumber(getSourceDB().getNextTypeIndex().getIndex())
- << ")";
+ W->getOStream() << " (" << HexNumber(Index.getIndex()) << ")";
W->getOStream() << " {\n";
W->indent();
W->printEnum("TypeLeafKind", unsigned(Record.Type),
diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp
index 246899ac12b9..59a060d143ff 100644
--- a/lib/DebugInfo/DWARF/DWARFContext.cpp
+++ b/lib/DebugInfo/DWARF/DWARFContext.cpp
@@ -66,7 +66,7 @@ uint64_t llvm::getRelocatedValue(const DataExtractor &Data, uint32_t Size,
RelocAddrMap::const_iterator AI = Relocs->find(*Off);
if (AI == Relocs->end())
return Data.getUnsigned(Off, Size);
- return Data.getUnsigned(Off, Size) + AI->second.second;
+ return Data.getUnsigned(Off, Size) + AI->second.Value;
}
static void dumpAccelSection(raw_ostream &OS, StringRef Name,
@@ -905,16 +905,23 @@ static Error createError(const Twine &Reason, llvm::Error E) {
/// Returns the address of the symbol that a relocation is applied against.
/// Used for further relocation computation. The symbol's section load address
/// is taken into account if a LoadedObjectInfo interface is provided.
-static Expected<uint64_t> getSymbolAddress(const object::ObjectFile &Obj,
- const RelocationRef &Reloc,
- const LoadedObjectInfo *L) {
+static Expected<uint64_t>
+getSymbolAddress(const object::ObjectFile &Obj, const RelocationRef &Reloc,
+ const LoadedObjectInfo *L,
+ std::map<SymbolRef, uint64_t> &Cache) {
uint64_t Ret = 0;
object::section_iterator RSec = Obj.section_end();
object::symbol_iterator Sym = Reloc.getSymbol();
+ std::map<SymbolRef, uint64_t>::iterator CacheIt = Cache.end();
// First calculate the address of the symbol or section as it appears
// in the object file
if (Sym != Obj.symbol_end()) {
+ bool New;
+ std::tie(CacheIt, New) = Cache.insert({*Sym, 0});
+ if (!New)
+ return CacheIt->second;
+
Expected<uint64_t> SymAddrOrErr = Sym->getAddress();
if (!SymAddrOrErr)
return createError("error: failed to compute symbol address: ",
@@ -943,6 +950,10 @@ static Expected<uint64_t> getSymbolAddress(const object::ObjectFile &Obj,
if (L && RSec != Obj.section_end())
if (uint64_t SectionLoadAddress = L->getSectionLoadAddress(*RSec))
Ret += SectionLoadAddress - RSec->getAddress();
+
+ if (CacheIt != Cache.end())
+ CacheIt->second = Ret;
+
return Ret;
}
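
getSymbolAddress now memoizes per-symbol results across a section's relocations: std::map::insert both detects a previously computed address and reserves the cache slot in a single lookup, and the slot is filled just before returning. A sketch of the pattern; the address computation is a stand-in:

#include <cassert>
#include <cstdint>
#include <map>

static int ComputeCalls = 0;

static uint64_t getAddrCached(int Sym, std::map<int, uint64_t> &Cache) {
  auto [It, New] = Cache.insert({Sym, 0}); // one lookup: probe and reserve
  if (!New)
    return It->second; // reuse the answer from an earlier relocation
  ++ComputeCalls;      // stand-in for the expensive symbol-address walk
  It->second = 0x400000u + uint64_t(Sym) * 16;
  return It->second;
}

int main() {
  std::map<int, uint64_t> Cache;
  uint64_t A = getAddrCached(3, Cache);
  uint64_t B = getAddrCached(3, Cache);
  assert(A == B && ComputeCalls == 1);
}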
@@ -1075,6 +1086,7 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj,
continue;
}
+ std::map<SymbolRef, uint64_t> AddrCache;
if (Section.relocation_begin() != Section.relocation_end()) {
uint64_t SectionSize = RelocatedSection->getSize();
for (const RelocationRef &Reloc : Section.relocations()) {
@@ -1083,7 +1095,8 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj,
if (isRelocScattered(Obj, Reloc))
continue;
- Expected<uint64_t> SymAddrOrErr = getSymbolAddress(Obj, Reloc, L);
+ Expected<uint64_t> SymAddrOrErr =
+ getSymbolAddress(Obj, Reloc, L, AddrCache);
if (!SymAddrOrErr) {
errs() << toString(SymAddrOrErr.takeError()) << '\n';
continue;
@@ -1114,7 +1127,7 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj,
<< " at " << format("%p", Address)
<< " with width " << format("%d", R.Width)
<< "\n");
- Map->insert(std::make_pair(Address, std::make_pair(R.Width, R.Value)));
+ Map->insert({Address, {(uint8_t)R.Width, R.Value}});
}
}
}
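A note on the caching added to getSymbolAddress above: a single map insertion
both detects a prior result and reserves the slot for a new one. A minimal
self-contained sketch of that insert-then-fill pattern (names hypothetical,
not LLVM API):

#include <cstdint>
#include <map>
#include <tuple>

// Returns the cached value for Key, invoking Compute at most once per key.
template <typename KeyT, typename ComputeFn>
uint64_t lookupOrCompute(std::map<KeyT, uint64_t> &Cache, const KeyT &Key,
                         ComputeFn Compute) {
  typename std::map<KeyT, uint64_t>::iterator It;
  bool Inserted;
  std::tie(It, Inserted) = Cache.insert({Key, 0});
  if (!Inserted)
    return It->second;       // Hit: a prior relocation already computed this.
  It->second = Compute(Key); // Miss: compute once, publish through the slot.
  return It->second;
}

As in the diff, the slot briefly holds 0 between insertion and the final
store, so this sketch assumes Compute never re-enters the cache for the same
key.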
diff --git a/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp b/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp
index 0cf71f530446..6601393d7459 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp
@@ -54,9 +54,8 @@ void DWARFDebugAranges::generate(DWARFContext *CTX) {
if (ParsedCUOffsets.insert(CUOffset).second) {
DWARFAddressRangesVector CURanges;
CU->collectAddressRanges(CURanges);
- for (const auto &R : CURanges) {
- appendRange(CUOffset, R.first, R.second);
- }
+ for (const auto &R : CURanges)
+ appendRange(CUOffset, R.LowPC, R.HighPC);
}
}
diff --git a/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp b/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp
index 9380fe8fe85d..8da797750abd 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp
@@ -69,8 +69,8 @@ DWARFDebugRangeList::getAbsoluteRanges(uint64_t BaseAddress) const {
if (RLE.isBaseAddressSelectionEntry(AddressSize)) {
BaseAddress = RLE.EndAddress;
} else {
- Res.push_back(std::make_pair(BaseAddress + RLE.StartAddress,
- BaseAddress + RLE.EndAddress));
+ Res.push_back(
+ {BaseAddress + RLE.StartAddress, BaseAddress + RLE.EndAddress});
}
}
return Res;
diff --git a/lib/DebugInfo/DWARF/DWARFDie.cpp b/lib/DebugInfo/DWARF/DWARFDie.cpp
index 24039eb35209..e3bd759ba94b 100644
--- a/lib/DebugInfo/DWARF/DWARFDie.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDie.cpp
@@ -60,8 +60,8 @@ static void dumpRanges(raw_ostream &OS, const DWARFAddressRangesVector& Ranges,
OS << '\n';
OS.indent(Indent);
OS << format("[0x%0*" PRIx64 " - 0x%0*" PRIx64 ")",
- AddressSize*2, Range.first,
- AddressSize*2, Range.second);
+ AddressSize*2, Range.LowPC,
+ AddressSize*2, Range.HighPC);
}
}
@@ -229,9 +229,9 @@ DWARFDie::getAddressRanges() const {
return DWARFAddressRangesVector();
// Single range specified by low/high PC.
uint64_t LowPC, HighPC;
- if (getLowAndHighPC(LowPC, HighPC)) {
- return DWARFAddressRangesVector(1, std::make_pair(LowPC, HighPC));
- }
+ if (getLowAndHighPC(LowPC, HighPC))
+ return {{LowPC, HighPC}};
+
// Multiple ranges from .debug_ranges section.
auto RangesOffset = toSectionOffset(find(DW_AT_ranges));
if (RangesOffset) {
@@ -257,7 +257,7 @@ DWARFDie::collectChildrenAddressRanges(DWARFAddressRangesVector& Ranges) const {
bool DWARFDie::addressRangeContainsAddress(const uint64_t Address) const {
for (const auto& R : getAddressRanges()) {
- if (R.first <= Address && Address < R.second)
+ if (R.LowPC <= Address && Address < R.HighPC)
return true;
}
return false;
diff --git a/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp b/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp
index e0f819383289..25824f6eb83b 100644
--- a/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp
+++ b/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp
@@ -24,7 +24,11 @@ bool DWARFTypeUnit::extractImpl(DataExtractor debug_info,
return false;
TypeHash = debug_info.getU64(offset_ptr);
TypeOffset = debug_info.getU32(offset_ptr);
- return TypeOffset < getLength();
+ // TypeOffset is relative to the beginning of the header,
+ // so we have to account for the leading length field.
+ // FIXME: The size of the length field is 12 in DWARF64.
+ unsigned SizeOfLength = 4;
+ return TypeOffset < getLength() + SizeOfLength;
}
void DWARFTypeUnit::dump(raw_ostream &OS, bool SummarizeTypes) {
diff --git a/lib/DebugInfo/DWARF/DWARFUnit.cpp b/lib/DebugInfo/DWARF/DWARFUnit.cpp
index f50487fc3ba3..3835d4da9ae9 100644
--- a/lib/DebugInfo/DWARF/DWARFUnit.cpp
+++ b/lib/DebugInfo/DWARF/DWARFUnit.cpp
@@ -349,18 +349,18 @@ void DWARFUnit::updateAddressDieMap(DWARFDie Die) {
if (Die.isSubroutineDIE()) {
for (const auto &R : Die.getAddressRanges()) {
// Ignore 0-sized ranges.
- if (R.first == R.second)
+ if (R.LowPC == R.HighPC)
continue;
- auto B = AddrDieMap.upper_bound(R.first);
- if (B != AddrDieMap.begin() && R.first < (--B)->second.first) {
+ auto B = AddrDieMap.upper_bound(R.LowPC);
+ if (B != AddrDieMap.begin() && R.LowPC < (--B)->second.first) {
// The range is a sub-range of existing ranges, we need to split the
// existing range.
- if (R.second < B->second.first)
- AddrDieMap[R.second] = B->second;
- if (R.first > B->first)
- AddrDieMap[B->first].first = R.first;
+ if (R.HighPC < B->second.first)
+ AddrDieMap[R.HighPC] = B->second;
+ if (R.LowPC > B->first)
+ AddrDieMap[B->first].first = R.LowPC;
}
- AddrDieMap[R.first] = std::make_pair(R.second, Die);
+ AddrDieMap[R.LowPC] = std::make_pair(R.HighPC, Die);
}
}
// Parent DIEs are added to the AddrDieMap prior to the Children DIEs to
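The updateAddressDieMap change above is an interval-insertion routine over a
std::map keyed by start address: a child sub-range splits the enclosing parent
range so lookups inside the child resolve to the child. A reduced sketch with
an int payload standing in for the (HighPC, DIE) pair's DIE (assumes
well-formed ranges, Lo < Hi):

#include <cstdint>
#include <map>
#include <utility>

// LowPC -> (HighPC, payload)
using IntervalMap = std::map<uint64_t, std::pair<uint64_t, int>>;

void insertSubRange(IntervalMap &M, uint64_t Lo, uint64_t Hi, int Payload) {
  auto B = M.upper_bound(Lo);
  if (B != M.begin() && Lo < (--B)->second.first) {
    // [Lo, Hi) nests inside *B: keep the parent's tail beyond Hi...
    if (Hi < B->second.first)
      M[Hi] = B->second;
    // ...and truncate the parent's head so it ends where the child begins.
    if (Lo > B->first)
      M[B->first].first = Lo;
  }
  M[Lo] = {Hi, Payload}; // The child now owns [Lo, Hi).
}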
diff --git a/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/lib/DebugInfo/DWARF/DWARFVerifier.cpp
index 9494e876da15..8a544296f65c 100644
--- a/lib/DebugInfo/DWARF/DWARFVerifier.cpp
+++ b/lib/DebugInfo/DWARF/DWARFVerifier.cpp
@@ -23,7 +23,7 @@ using namespace llvm;
using namespace dwarf;
using namespace object;
-void DWARFVerifier::verifyDebugInfoAttribute(DWARFDie &Die,
+void DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die,
DWARFAttribute &AttrValue) {
const auto Attr = AttrValue.Attr;
switch (Attr) {
@@ -68,7 +68,7 @@ void DWARFVerifier::verifyDebugInfoAttribute(DWARFDie &Die,
}
}
-void DWARFVerifier::verifyDebugInfoForm(DWARFDie &Die,
+void DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die,
DWARFAttribute &AttrValue) {
const auto Form = AttrValue.Value.getForm();
switch (Form) {
@@ -136,7 +136,7 @@ void DWARFVerifier::verifyDebugInfoForm(DWARFDie &Die,
}
}
-void DWARFVerifier::veifyDebugInfoReferences() {
+void DWARFVerifier::verifyDebugInfoReferences() {
// Take all references and make sure they point to an actual DIE by
// getting the DIE by offset and emitting an error
OS << "Verifying .debug_info references...\n";
@@ -172,7 +172,7 @@ bool DWARFVerifier::handleDebugInfo() {
}
}
}
- veifyDebugInfoReferences();
+ verifyDebugInfoReferences();
return NumDebugInfoErrors == 0;
}
diff --git a/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp b/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp
index 375c35b11145..701a318511b8 100644
--- a/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp
+++ b/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp
@@ -109,7 +109,7 @@ uint32_t TpiStreamBuilder::calculateHashBufferSize() const {
}
uint32_t TpiStreamBuilder::calculateIndexOffsetSize() const {
- return TypeIndexOffsets.size() * sizeof(TypeIndexOffset);
+ return TypeIndexOffsets.size() * sizeof(codeview::TypeIndexOffset);
}
Error TpiStreamBuilder::finalizeMsfLayout() {
diff --git a/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h b/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h
index a5100a56bcf1..a27573f93b97 100644
--- a/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h
+++ b/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h
@@ -94,9 +94,8 @@ class OrcMCJITReplacement : public ExecutionEngine {
return ClientMM->registerEHFrames(Addr, LoadAddr, Size);
}
- void deregisterEHFrames(uint8_t *Addr, uint64_t LoadAddr,
- size_t Size) override {
- return ClientMM->deregisterEHFrames(Addr, LoadAddr, Size);
+ void deregisterEHFrames() override {
+ return ClientMM->deregisterEHFrames();
}
void notifyObjectLoaded(RuntimeDyld &RTDyld,
diff --git a/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp b/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp
index de73fbde8eb7..99e84b7496d4 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp
@@ -134,6 +134,18 @@ void RTDyldMemoryManager::deregisterEHFramesInProcess(uint8_t *Addr,
#endif
+void RTDyldMemoryManager::registerEHFrames(uint8_t *Addr, uint64_t LoadAddr,
+ size_t Size) {
+ registerEHFramesInProcess(Addr, Size);
+ EHFrames.push_back({Addr, Size});
+}
+
+void RTDyldMemoryManager::deregisterEHFrames() {
+ for (auto &Frame : EHFrames)
+ deregisterEHFramesInProcess(Frame.Addr, Frame.Size);
+ EHFrames.clear();
+}
+
static int jit_noop() {
return 0;
}
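The two functions added above centralize EH-frame bookkeeping in the memory
manager: registration appends to one list, and teardown drains it. The shape
of the pattern as a standalone sketch (the unwinder calls are stand-ins, not
real API):

#include <cstddef>
#include <cstdint>
#include <vector>

static void registerWithUnwinder(uint8_t *, size_t) {}   // stand-in
static void deregisterWithUnwinder(uint8_t *, size_t) {} // stand-in

class FrameRegistry {
  struct EHFrame { uint8_t *Addr; size_t Size; };
  std::vector<EHFrame> EHFrames; // every registered frame, in order
public:
  void registerFrame(uint8_t *Addr, size_t Size) {
    registerWithUnwinder(Addr, Size);
    EHFrames.push_back({Addr, Size}); // remember it for teardown
  }
  void deregisterAll() {
    for (const EHFrame &F : EHFrames)
      deregisterWithUnwinder(F.Addr, F.Size);
    EHFrames.clear(); // safe to call deregisterAll twice
  }
};

This is why the ELF and COFF subclasses below can drop their per-target
deregisterEHFrames overrides entirely.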
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
index df9d2ceba329..e9a4b71c903d 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
@@ -73,7 +73,9 @@ namespace llvm {
void RuntimeDyldImpl::registerEHFrames() {}
-void RuntimeDyldImpl::deregisterEHFrames() {}
+void RuntimeDyldImpl::deregisterEHFrames() {
+ MemMgr.deregisterEHFrames();
+}
#ifndef NDEBUG
static void dumpSectionMemory(const SectionEntry &S, StringRef State) {
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
index 50f63fb8dd39..660843765b3f 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
@@ -221,22 +221,10 @@ void RuntimeDyldELF::registerEHFrames() {
uint64_t EHFrameLoadAddr = Sections[EHFrameSID].getLoadAddress();
size_t EHFrameSize = Sections[EHFrameSID].getSize();
MemMgr.registerEHFrames(EHFrameAddr, EHFrameLoadAddr, EHFrameSize);
- RegisteredEHFrameSections.push_back(EHFrameSID);
}
UnregisteredEHFrameSections.clear();
}
-void RuntimeDyldELF::deregisterEHFrames() {
- for (int i = 0, e = RegisteredEHFrameSections.size(); i != e; ++i) {
- SID EHFrameSID = RegisteredEHFrameSections[i];
- uint8_t *EHFrameAddr = Sections[EHFrameSID].getAddress();
- uint64_t EHFrameLoadAddr = Sections[EHFrameSID].getLoadAddress();
- size_t EHFrameSize = Sections[EHFrameSID].getSize();
- MemMgr.deregisterEHFrames(EHFrameAddr, EHFrameLoadAddr, EHFrameSize);
- }
- RegisteredEHFrameSections.clear();
-}
-
std::unique_ptr<RuntimeDyldELF>
llvm::RuntimeDyldELF::create(Triple::ArchType Arch,
RuntimeDyld::MemoryManager &MemMgr,
@@ -802,20 +790,35 @@ void RuntimeDyldELF::resolveSystemZRelocation(const SectionEntry &Section,
writeInt32BE(LocalAddress, Delta / 2);
break;
}
+ case ELF::R_390_PC16: {
+ int64_t Delta = (Value + Addend) - Section.getLoadAddressWithOffset(Offset);
+ assert(int16_t(Delta) == Delta && "R_390_PC16 overflow");
+ writeInt16BE(LocalAddress, Delta);
+ break;
+ }
case ELF::R_390_PC32: {
int64_t Delta = (Value + Addend) - Section.getLoadAddressWithOffset(Offset);
assert(int32_t(Delta) == Delta && "R_390_PC32 overflow");
writeInt32BE(LocalAddress, Delta);
break;
}
- case ELF::R_390_64:
- writeInt64BE(LocalAddress, Value + Addend);
- break;
case ELF::R_390_PC64: {
int64_t Delta = (Value + Addend) - Section.getLoadAddressWithOffset(Offset);
writeInt64BE(LocalAddress, Delta);
break;
}
+ case ELF::R_390_8:
+ *LocalAddress = (uint8_t)(Value + Addend);
+ break;
+ case ELF::R_390_16:
+ writeInt16BE(LocalAddress, Value + Addend);
+ break;
+ case ELF::R_390_32:
+ writeInt32BE(LocalAddress, Value + Addend);
+ break;
+ case ELF::R_390_64:
+ writeInt64BE(LocalAddress, Value + Addend);
+ break;
}
}
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
index 84dd810101f3..fb5da6dd8bbb 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
@@ -152,7 +152,6 @@ private:
// in a table until we receive a request to register all unregistered
// EH frame sections with the memory manager.
SmallVector<SID, 2> UnregisteredEHFrameSections;
- SmallVector<SID, 2> RegisteredEHFrameSections;
// Map between GOT relocation value and corresponding GOT offset
std::map<RelocationValueRef, uint64_t> GOTOffsetMap;
@@ -180,7 +179,6 @@ public:
StubMap &Stubs) override;
bool isCompatibleFile(const object::ObjectFile &Obj) const override;
void registerEHFrames() override;
- void deregisterEHFrames() override;
Error finalizeLoad(const ObjectFile &Obj,
ObjSectionToIDMap &SectionMap) override;
};
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
index f5cc883d98fd..18c23c5a2a5d 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
@@ -515,7 +515,7 @@ public:
virtual void registerEHFrames();
- virtual void deregisterEHFrames();
+ void deregisterEHFrames();
virtual Error finalizeLoad(const ObjectFile &ObjImg,
ObjSectionToIDMap &SectionMap) {
diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h
index 0398413e1532..6aa1a2bdb926 100644
--- a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h
+++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h
@@ -217,7 +217,6 @@ public:
}
void registerEHFrames() override {}
- void deregisterEHFrames() override {}
};
}
diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h
index 8c6af0bd9c6d..318afa21a88b 100644
--- a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h
+++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h
@@ -316,7 +316,6 @@ public:
}
void registerEHFrames() override {}
- void deregisterEHFrames() override {}
};
}
diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h
index 109beb36f1ee..26e73989d7ed 100644
--- a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h
+++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h
@@ -194,9 +194,6 @@ public:
}
UnregisteredEHFrameSections.clear();
}
- void deregisterEHFrames() override {
- // Stub
- }
Error finalizeLoad(const ObjectFile &Obj,
ObjSectionToIDMap &SectionMap) override {
// Look for and record the EH frame section IDs.
diff --git a/lib/Fuzzer/FuzzerDriver.cpp b/lib/Fuzzer/FuzzerDriver.cpp
index b85ba210afb3..e93c79cfcec6 100644
--- a/lib/Fuzzer/FuzzerDriver.cpp
+++ b/lib/Fuzzer/FuzzerDriver.cpp
@@ -656,7 +656,8 @@ int FuzzerDriver(int *argc, char ***argv, UserCallback Callback) {
SMR.WaitClient();
size_t Size = SMR.ReadByteArraySize();
SMR.WriteByteArray(nullptr, 0);
- F->RunOne(SMR.GetByteArray(), Size);
+ const Unit tmp(SMR.GetByteArray(), SMR.GetByteArray() + Size);
+ F->RunOne(tmp.data(), tmp.size());
SMR.PostServer();
}
return 0;
diff --git a/lib/Fuzzer/FuzzerFlags.def b/lib/Fuzzer/FuzzerFlags.def
index 0a1ff1b1df6a..7ff196c8fa96 100644
--- a/lib/Fuzzer/FuzzerFlags.def
+++ b/lib/Fuzzer/FuzzerFlags.def
@@ -92,10 +92,10 @@ FUZZER_FLAG_INT(print_pcs, 0, "If 1, print out newly covered PCs.")
FUZZER_FLAG_INT(print_final_stats, 0, "If 1, print statistics at exit.")
FUZZER_FLAG_INT(print_corpus_stats, 0,
"If 1, print statistics on corpus elements at exit.")
-FUZZER_FLAG_INT(print_coverage, 0, "If 1, print coverage information at exit."
- " Experimental, only with trace-pc-guard")
-FUZZER_FLAG_INT(dump_coverage, 0, "If 1, dump coverage information at exit."
- " Experimental, only with trace-pc-guard")
+FUZZER_FLAG_INT(print_coverage, 0, "If 1, print coverage information as text"
+ " at exit.")
+FUZZER_FLAG_INT(dump_coverage, 0, "If 1, dump coverage information as a"
+ " .sancov file at exit.")
FUZZER_FLAG_INT(handle_segv, 1, "If 1, try to intercept SIGSEGV.")
FUZZER_FLAG_INT(handle_bus, 1, "If 1, try to intercept SIGBUS.")
FUZZER_FLAG_INT(handle_abrt, 1, "If 1, try to intercept SIGABRT.")
diff --git a/lib/Fuzzer/FuzzerInternal.h b/lib/Fuzzer/FuzzerInternal.h
index ad067ee2c0d9..5f184c2316e2 100644
--- a/lib/Fuzzer/FuzzerInternal.h
+++ b/lib/Fuzzer/FuzzerInternal.h
@@ -91,6 +91,7 @@ public:
private:
void AlarmCallback();
void CrashCallback();
+ void CrashOnOverwrittenData();
void InterruptCallback();
void MutateAndTestOne();
void ReportNewCoverage(InputInfo *II, const Unit &U);
diff --git a/lib/Fuzzer/FuzzerLoop.cpp b/lib/Fuzzer/FuzzerLoop.cpp
index d84c3dbdaf77..14caa203c5ef 100644
--- a/lib/Fuzzer/FuzzerLoop.cpp
+++ b/lib/Fuzzer/FuzzerLoop.cpp
@@ -422,6 +422,24 @@ size_t Fuzzer::GetCurrentUnitInFuzzingThead(const uint8_t **Data) const {
return CurrentUnitSize;
}
+void Fuzzer::CrashOnOverwrittenData() {
+  Printf("==%d== ERROR: libFuzzer: fuzz target overwrites its const input\n",
+ GetPid());
+ DumpCurrentUnit("crash-");
+  Printf("SUMMARY: libFuzzer: overwrites-const-input\n");
+ _Exit(Options.ErrorExitCode); // Stop right now.
+}
+
+// Compare two arrays, but not all bytes if the arrays are large.
+static bool LooseMemeq(const uint8_t *A, const uint8_t *B, size_t Size) {
+ const size_t Limit = 64;
+  if (Size <= Limit)
+ return !memcmp(A, B, Size);
+ // Compare first and last Limit/2 bytes.
+ return !memcmp(A, B, Limit / 2) &&
+ !memcmp(A + Size - Limit / 2, B + Size - Limit / 2, Limit / 2);
+}
+
void Fuzzer::ExecuteCallback(const uint8_t *Data, size_t Size) {
assert(InFuzzingThread());
if (SMR.IsClient())
@@ -443,6 +461,8 @@ void Fuzzer::ExecuteCallback(const uint8_t *Data, size_t Size) {
(void)Res;
assert(Res == 0);
HasMoreMallocsThanFrees = AllocTracer.Stop();
+ if (!LooseMemeq(DataCopy, Data, Size))
+ CrashOnOverwrittenData();
CurrentUnitSize = 0;
delete[] DataCopy;
}
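LooseMemeq above trades completeness for speed: buffers up to 64 bytes are
compared in full, while for larger buffers only the first and last 32 bytes
are checked, so an overwrite confined to the middle goes unnoticed by design.
A standalone check of that behavior (function body mirrored from the diff):

#include <cassert>
#include <cstdint>
#include <cstring>

static bool LooseMemeq(const uint8_t *A, const uint8_t *B, size_t Size) {
  const size_t Limit = 64;
  if (Size <= Limit)
    return !memcmp(A, B, Size);
  return !memcmp(A, B, Limit / 2) &&
         !memcmp(A + Size - Limit / 2, B + Size - Limit / 2, Limit / 2);
}

int main() {
  uint8_t A[128] = {0}, B[128] = {0};
  B[64] = 1;                      // a middle byte differs...
  assert(LooseMemeq(A, B, 128));  // ...and is intentionally not detected
  B[127] = 1;                     // a byte within the last 32 differs
  assert(!LooseMemeq(A, B, 128)); // detected
  return 0;
}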
diff --git a/lib/Fuzzer/FuzzerMutate.cpp b/lib/Fuzzer/FuzzerMutate.cpp
index cd846c7deec5..e60d4130de10 100644
--- a/lib/Fuzzer/FuzzerMutate.cpp
+++ b/lib/Fuzzer/FuzzerMutate.cpp
@@ -217,11 +217,12 @@ DictionaryEntry MutationDispatcher::MakeDictionaryEntryFromCMP(
size_t NumPositions = 0;
for (const uint8_t *Cur = Data;
Cur < End && NumPositions < kMaxNumPositions; Cur++) {
- Cur = (uint8_t *)SearchMemory(Cur, End - Cur, ExistingBytes, ArgSize);
+ Cur =
+ (const uint8_t *)SearchMemory(Cur, End - Cur, ExistingBytes, ArgSize);
if (!Cur) break;
Positions[NumPositions++] = Cur - Data;
}
- if (!NumPositions) break;
+ if (!NumPositions) continue;
return DictionaryEntry(W, Positions[Rand(NumPositions)]);
}
DictionaryEntry DE(W);
diff --git a/lib/Fuzzer/afl/afl_driver.cpp b/lib/Fuzzer/afl/afl_driver.cpp
index b3a54e57fceb..3815ed11cf60 100644
--- a/lib/Fuzzer/afl/afl_driver.cpp
+++ b/lib/Fuzzer/afl/afl_driver.cpp
@@ -59,6 +59,11 @@ statistics from the file. If that fails then the process will quit.
#include <signal.h>
#include <sys/resource.h>
#include <sys/time.h>
+
+#include <iostream>
+#include <fstream>
+#include <vector>
+
// Platform detection. Copied from FuzzerInternal.h
#ifdef __linux__
#define LIBFUZZER_LINUX 1
@@ -245,17 +250,39 @@ extern "C" size_t LLVMFuzzerMutate(uint8_t *Data, size_t Size, size_t MaxSize) {
return 0;
}
+// Execute any files provided as parameters.
+int ExecuteFilesOneByOne(int argc, char **argv) {
+ for (int i = 1; i < argc; i++) {
+ std::ifstream in(argv[i]);
+ in.seekg(0, in.end);
+ size_t length = in.tellg();
+    in.seekg(0, in.beg);
+ std::cout << "Reading " << length << " bytes from " << argv[i] << std::endl;
+ // Allocate exactly length bytes so that we reliably catch buffer overflows.
+ std::vector<char> bytes(length);
+ in.read(bytes.data(), bytes.size());
+ assert(in);
+ LLVMFuzzerTestOneInput(reinterpret_cast<const uint8_t *>(bytes.data()),
+ bytes.size());
+    std::cout << "Execution successful" << std::endl;
+ }
+ return 0;
+}
+
int main(int argc, char **argv) {
- fprintf(stderr, "======================= INFO =========================\n"
- "This binary is built for AFL-fuzz.\n"
- "To run the target function on a single input execute this:\n"
- " %s < INPUT_FILE\n"
- "To run the fuzzing execute this:\n"
- " afl-fuzz [afl-flags] %s [N] "
- "-- run N fuzzing iterations before "
- "re-spawning the process (default: 1000)\n"
- "======================================================\n",
- argv[0], argv[0]);
+ fprintf(stderr,
+ "======================= INFO =========================\n"
+ "This binary is built for AFL-fuzz.\n"
+ "To run the target function on individual input(s) execute this:\n"
+ " %s < INPUT_FILE\n"
+ "or\n"
+ " %s INPUT_FILE1 [INPUT_FILE2 ... ]\n"
+ "To fuzz with afl-fuzz execute this:\n"
+ " afl-fuzz [afl-flags] %s [-N]\n"
+ "afl-fuzz will run N iterations before "
+ "re-spawning the process (default: 1000)\n"
+ "======================================================\n",
+ argv[0], argv[0], argv[0]);
if (LLVMFuzzerInitialize)
LLVMFuzzerInitialize(&argc, &argv);
// Do any other expensive one-time initialization here.
@@ -266,8 +293,14 @@ int main(int argc, char **argv) {
__afl_manual_init();
int N = 1000;
- if (argc >= 2)
- N = atoi(argv[1]);
+ if (argc == 2 && argv[1][0] == '-')
+ N = atoi(argv[1] + 1);
+  else if (argc == 2 && (N = atoi(argv[1])) > 0)
+ fprintf(stderr, "WARNING: using the deprecated call style `%s %d`\n",
+ argv[0], N);
+ else if (argc > 1)
+    return ExecuteFilesOneByOne(argc, argv);
+
assert(N > 0);
time_t unit_time_secs;
int num_runs = 0;
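The new argument handling above distinguishes three spellings. A condensed
standalone sketch of the dispatch (hypothetical driver, same logic):

#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv) {
  int N = 1000; // default persistent-mode iteration count
  if (argc == 2 && argv[1][0] == '-')
    N = atoi(argv[1] + 1);  // "prog -500": run 500 iterations
  else if (argc == 2 && (N = atoi(argv[1])) > 0)
    fprintf(stderr, "deprecated: bare count, use -%d\n", N); // "prog 500"
  else if (argc > 1)
    return 0;               // "prog f1 f2": treat arguments as input files
  fprintf(stderr, "running %d iterations\n", N);
  return 0;
}

Note the precedence: a leading dash always wins, so an input file whose name
starts with '-' cannot be passed positionally.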
diff --git a/lib/Fuzzer/test/AFLDriverTest.cpp b/lib/Fuzzer/test/AFLDriverTest.cpp
index 3dd0b6117305..e3f5f7100883 100644
--- a/lib/Fuzzer/test/AFLDriverTest.cpp
+++ b/lib/Fuzzer/test/AFLDriverTest.cpp
@@ -4,19 +4,25 @@
// Contains dummy functions used to avoid dependency on AFL.
#include <stdint.h>
#include <stdlib.h>
+#include <stdio.h>
extern "C" void __afl_manual_init() {}
-extern "C" int __afl_persistent_loop(unsigned int) {
+extern "C" int __afl_persistent_loop(unsigned int N) {
+ static int Count = N;
+  fprintf(stderr, "__afl_persistent_loop called, Count = %d\n", Count);
+ if (Count--) return 1;
return 0;
}
// This declaration exists to prevent the Darwin linker
// from complaining about this being a missing weak symbol.
extern "C" int LLVMFuzzerInitialize(int *argc, char ***argv) {
+ fprintf(stderr, "LLVMFuzzerInitialize called\n");
return 0;
}
extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+  fprintf(stderr, "LLVMFuzzerTestOneInput called; Size = %zu\n", Size);
return 0;
}
diff --git a/lib/Fuzzer/test/CMakeLists.txt b/lib/Fuzzer/test/CMakeLists.txt
index cd049d3f03d8..b39938a705f6 100644
--- a/lib/Fuzzer/test/CMakeLists.txt
+++ b/lib/Fuzzer/test/CMakeLists.txt
@@ -104,6 +104,7 @@ set(Tests
OneHugeAllocTest
OutOfMemoryTest
OutOfMemorySingleLargeMallocTest
+ OverwriteInputTest
RepeatedMemcmp
RepeatedBytesTest
SimpleCmpTest
diff --git a/lib/Fuzzer/test/OverwriteInputTest.cpp b/lib/Fuzzer/test/OverwriteInputTest.cpp
new file mode 100644
index 000000000000..e688682346a6
--- /dev/null
+++ b/lib/Fuzzer/test/OverwriteInputTest.cpp
@@ -0,0 +1,13 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
+// Simple test for a fuzzer. Make sure we abort if Data is overwritten.
+#include <cstdint>
+#include <cstddef>
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+ if (Size)
+ *const_cast<uint8_t*>(Data) = 1;
+ return 0;
+}
+
diff --git a/lib/Fuzzer/test/afl-driver.test b/lib/Fuzzer/test/afl-driver.test
new file mode 100644
index 000000000000..6eab23cc3636
--- /dev/null
+++ b/lib/Fuzzer/test/afl-driver.test
@@ -0,0 +1,26 @@
+REQUIRES: linux
+RUN: echo -n "abc" > %t.file3
+RUN: echo -n "abcd" > %t.file4
+
+RUN: AFLDriverTest < %t.file3 2>&1 | FileCheck %s --check-prefix=CHECK1
+CHECK1: __afl_persistent_loop called, Count = 1000
+CHECK1: LLVMFuzzerTestOneInput called; Size = 3
+
+
+RUN: AFLDriverTest < %t.file3 -42 2>&1 | FileCheck %s --check-prefix=CHECK2
+CHECK2: __afl_persistent_loop called, Count = 42
+CHECK2: LLVMFuzzerTestOneInput called; Size = 3
+
+
+RUN: AFLDriverTest < %t.file3 666 2>&1 | FileCheck %s --check-prefix=CHECK3
+CHECK3: WARNING: using the deprecated call style
+CHECK3: __afl_persistent_loop called, Count = 666
+CHECK3: LLVMFuzzerTestOneInput called; Size = 3
+
+
+RUN: AFLDriverTest %t.file3 2>&1 | FileCheck %s --check-prefix=CHECK4
+CHECK4: LLVMFuzzerTestOneInput called; Size = 3
+
+RUN: AFLDriverTest %t.file3 %t.file4 2>&1 | FileCheck %s --check-prefix=CHECK5
+CHECK5: LLVMFuzzerTestOneInput called; Size = 3
+CHECK5: LLVMFuzzerTestOneInput called; Size = 4
diff --git a/lib/Fuzzer/test/overwrite-input.test b/lib/Fuzzer/test/overwrite-input.test
new file mode 100644
index 000000000000..81c27909e8df
--- /dev/null
+++ b/lib/Fuzzer/test/overwrite-input.test
@@ -0,0 +1,2 @@
+RUN: not LLVMFuzzer-OverwriteInputTest 2>&1 | FileCheck %s
+CHECK: ERROR: libFuzzer: fuzz target overwrites its const input
diff --git a/lib/IR/AsmWriter.cpp b/lib/IR/AsmWriter.cpp
index 4c6e3e3788bd..ec4663018bd4 100644
--- a/lib/IR/AsmWriter.cpp
+++ b/lib/IR/AsmWriter.cpp
@@ -805,6 +805,9 @@ void SlotTracker::processModule() {
if (!Var.hasName())
CreateModuleSlot(&Var);
processGlobalObjectMetadata(Var);
+ auto Attrs = Var.getAttributes();
+ if (Attrs.hasAttributes())
+ CreateAttributeSetSlot(Attrs);
}
for (const GlobalAlias &A : TheModule->aliases()) {
@@ -2502,6 +2505,10 @@ void AssemblyWriter::printGlobal(const GlobalVariable *GV) {
GV->getAllMetadata(MDs);
printMetadataAttachments(MDs, ", ");
+ auto Attrs = GV->getAttributes();
+ if (Attrs.hasAttributes())
+ Out << " #" << Machine.getAttributeGroupSlot(Attrs);
+
printInfoComment(*GV);
}
diff --git a/lib/IR/AttributeImpl.h b/lib/IR/AttributeImpl.h
index cf2925254695..acfac316e91e 100644
--- a/lib/IR/AttributeImpl.h
+++ b/lib/IR/AttributeImpl.h
@@ -1,4 +1,4 @@
-//===-- AttributeImpl.h - Attribute Internals -------------------*- C++ -*-===//
+//===- AttributeImpl.h - Attribute Internals --------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -21,9 +21,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Attributes.h"
#include "llvm/Support/TrailingObjects.h"
-#include <algorithm>
#include <cassert>
-#include <climits>
#include <cstddef>
#include <cstdint>
#include <string>
@@ -80,11 +78,13 @@ public:
else
Profile(ID, getKindAsString(), getValueAsString());
}
+
static void Profile(FoldingSetNodeID &ID, Attribute::AttrKind Kind,
uint64_t Val) {
ID.AddInteger(Kind);
if (Val) ID.AddInteger(Val);
}
+
static void Profile(FoldingSetNodeID &ID, StringRef Kind, StringRef Values) {
ID.AddString(Kind);
if (!Values.empty()) ID.AddString(Values);
@@ -114,9 +114,10 @@ public:
};
class IntAttributeImpl : public EnumAttributeImpl {
- void anchor() override;
uint64_t Val;
+ void anchor() override;
+
public:
IntAttributeImpl(Attribute::AttrKind Kind, uint64_t Val)
: EnumAttributeImpl(IntAttrEntry, Kind), Val(Val) {
@@ -188,20 +189,22 @@ public:
std::pair<unsigned, Optional<unsigned>> getAllocSizeArgs() const;
std::string getAsString(bool InAttrGrp) const;
- typedef const Attribute *iterator;
+ using iterator = const Attribute *;
+
iterator begin() const { return getTrailingObjects<Attribute>(); }
iterator end() const { return begin() + NumAttrs; }
void Profile(FoldingSetNodeID &ID) const {
Profile(ID, makeArrayRef(begin(), end()));
}
+
static void Profile(FoldingSetNodeID &ID, ArrayRef<Attribute> AttrList) {
for (const auto &Attr : AttrList)
Attr.Profile(ID);
}
};
-typedef std::pair<unsigned, AttributeSet> IndexAttrPair;
+using IndexAttrPair = std::pair<unsigned, AttributeSet>;
//===----------------------------------------------------------------------===//
/// \class
@@ -265,7 +268,8 @@ public:
return AvailableFunctionAttrs & ((uint64_t)1) << Kind;
}
- typedef AttributeSet::iterator iterator;
+ using iterator = AttributeSet::iterator;
+
iterator begin(unsigned Slot) const {
return getSlotAttributes(Slot).begin();
}
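The paired Profile methods above (one member, one static per key shape) exist
so attribute nodes can be uniqued in a FoldingSet: the static forms build a
lookup key before any node exists, and the member form rebuilds it from a live
node. A minimal sketch of that uniquing discipline with a toy node type (the
pattern, not the AttributeImpl internals):

#include "llvm/ADT/FoldingSet.h"
#include <cstdint>
using namespace llvm;

struct AlignNode : FoldingSetNode {
  uint64_t Align;
  explicit AlignNode(uint64_t A) : Align(A) {}
  void Profile(FoldingSetNodeID &ID) const { ID.AddInteger(Align); }
};

static FoldingSet<AlignNode> Interned;

AlignNode *getOrCreate(uint64_t Align) {
  FoldingSetNodeID ID;
  ID.AddInteger(Align); // must mirror AlignNode::Profile exactly
  void *InsertPos;
  if (AlignNode *N = Interned.FindNodeOrInsertPos(ID, InsertPos))
    return N;           // same key -> same interned node
  AlignNode *N = new AlignNode(Align);
  Interned.InsertNode(N, InsertPos);
  return N;
}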
diff --git a/lib/IR/Attributes.cpp b/lib/IR/Attributes.cpp
index 3b1140ab542c..ce60367a6c8b 100644
--- a/lib/IR/Attributes.cpp
+++ b/lib/IR/Attributes.cpp
@@ -34,6 +34,8 @@
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
+#include <climits>
+#include <cstddef>
#include <cstdint>
#include <limits>
#include <map>
@@ -504,16 +506,74 @@ AttributeSet AttributeSet::get(LLVMContext &C, ArrayRef<Attribute> Attrs) {
return AttributeSet(AttributeSetNode::get(C, Attrs));
}
+AttributeSet AttributeSet::addAttribute(LLVMContext &C,
+ Attribute::AttrKind Kind) const {
+ if (hasAttribute(Kind)) return *this;
+ AttrBuilder B;
+ B.addAttribute(Kind);
+ return addAttributes(C, AttributeSet::get(C, B));
+}
+
+AttributeSet AttributeSet::addAttribute(LLVMContext &C, StringRef Kind,
+ StringRef Value) const {
+ AttrBuilder B;
+ B.addAttribute(Kind, Value);
+ return addAttributes(C, AttributeSet::get(C, B));
+}
+
+AttributeSet AttributeSet::addAttributes(LLVMContext &C,
+ const AttributeSet AS) const {
+ if (!hasAttributes())
+ return AS;
+
+ if (!AS.hasAttributes())
+ return *this;
+
+ AttrBuilder B(AS);
+ for (Attribute I : *this)
+ B.addAttribute(I);
+
+ return get(C, B);
+}
+
+AttributeSet AttributeSet::removeAttribute(LLVMContext &C,
+ Attribute::AttrKind Kind) const {
+ if (!hasAttribute(Kind)) return *this;
+ AttrBuilder B;
+ B.addAttribute(Kind);
+ return removeAttributes(C, B);
+}
+
+AttributeSet AttributeSet::removeAttribute(LLVMContext &C,
+ StringRef Kind) const {
+ if (!hasAttribute(Kind)) return *this;
+ AttrBuilder B;
+ B.addAttribute(Kind);
+ return removeAttributes(C, B);
+}
+
+AttributeSet AttributeSet::removeAttributes(LLVMContext &C,
+ const AttrBuilder &Attrs) const {
+
+ // FIXME it is not obvious how this should work for alignment.
+ // For now, say we can't pass in alignment, which no current use does.
+ assert(!Attrs.hasAlignmentAttr() && "Attempt to change alignment!");
+
+ AttrBuilder B(*this);
+ B.remove(Attrs);
+ return get(C, B);
+}
+
unsigned AttributeSet::getNumAttributes() const {
return SetNode ? SetNode->getNumAttributes() : 0;
}
bool AttributeSet::hasAttribute(Attribute::AttrKind Kind) const {
- return SetNode ? SetNode->hasAttribute(Kind) : 0;
+ return SetNode ? SetNode->hasAttribute(Kind) : false;
}
bool AttributeSet::hasAttribute(StringRef Kind) const {
- return SetNode ? SetNode->hasAttribute(Kind) : 0;
+ return SetNode ? SetNode->hasAttribute(Kind) : false;
}
Attribute AttributeSet::getAttribute(Attribute::AttrKind Kind) const {
@@ -557,6 +617,14 @@ AttributeSet::iterator AttributeSet::end() const {
return SetNode ? SetNode->end() : nullptr;
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void AttributeSet::dump() const {
+ dbgs() << "AS =\n";
+ dbgs() << " { ";
+ dbgs() << getAsString(true) << " }\n";
+}
+#endif
+
//===----------------------------------------------------------------------===//
// AttributeSetNode Definition
//===----------------------------------------------------------------------===//
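The AttributeSet methods added above form a pure value-semantics interface:
each add/remove returns a fresh set and never mutates the receiver. A brief
usage sketch (assumes an LLVMContext named Ctx is in scope; the string key is
hypothetical):

AttributeSet AS; // empty
AS = AS.addAttribute(Ctx, Attribute::NoUnwind);    // enum attribute
AS = AS.addAttribute(Ctx, "my-string-attr", "on"); // string attribute
assert(AS.hasAttribute(Attribute::NoUnwind));

AttributeSet Smaller = AS.removeAttribute(Ctx, Attribute::NoUnwind);
assert(!Smaller.hasAttribute(Attribute::NoUnwind));
assert(AS.hasAttribute(Attribute::NoUnwind)); // the original is untouched

Per the FIXME in removeAttributes, alignment cannot be removed this way: the
assert there fires if an AttrBuilder carrying an alignment is passed in.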
diff --git a/lib/IR/ConstantFold.cpp b/lib/IR/ConstantFold.cpp
index 80b117015ede..a20f3f811c8d 100644
--- a/lib/IR/ConstantFold.cpp
+++ b/lib/IR/ConstantFold.cpp
@@ -2041,9 +2041,6 @@ Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C,
Optional<unsigned> InRangeIndex,
ArrayRef<Value *> Idxs) {
if (Idxs.empty()) return C;
- Constant *Idx0 = cast<Constant>(Idxs[0]);
- if ((Idxs.size() == 1 && Idx0->isNullValue()))
- return C;
if (isa<UndefValue>(C)) {
Type *GEPTy = GetElementPtrInst::getGEPReturnType(
@@ -2051,10 +2048,15 @@ Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C,
return UndefValue::get(GEPTy);
}
+ Constant *Idx0 = cast<Constant>(Idxs[0]);
+ if (Idxs.size() == 1 && (Idx0->isNullValue() || isa<UndefValue>(Idx0)))
+ return C;
+
if (C->isNullValue()) {
bool isNull = true;
for (unsigned i = 0, e = Idxs.size(); i != e; ++i)
- if (!cast<Constant>(Idxs[i])->isNullValue()) {
+ if (!isa<UndefValue>(Idxs[i]) &&
+ !cast<Constant>(Idxs[i])->isNullValue()) {
isNull = false;
break;
}
diff --git a/lib/IR/ConstantRange.cpp b/lib/IR/ConstantRange.cpp
index aeb1257754f3..509caba3acd4 100644
--- a/lib/IR/ConstantRange.cpp
+++ b/lib/IR/ConstantRange.cpp
@@ -278,7 +278,7 @@ APInt ConstantRange::getUnsignedMax() const {
}
APInt ConstantRange::getUnsignedMin() const {
- if (isFullSet() || (isWrappedSet() && getUpper() != 0))
+ if (isFullSet() || (isWrappedSet() && !getUpper().isNullValue()))
return APInt::getMinValue(getBitWidth());
return getLower();
}
@@ -442,7 +442,7 @@ ConstantRange ConstantRange::unionWith(const ConstantRange &CR) const {
APInt L = CR.Lower.ult(Lower) ? CR.Lower : Lower;
APInt U = (CR.Upper - 1).ugt(Upper - 1) ? CR.Upper : Upper;
- if (L == 0 && U == 0)
+ if (L.isNullValue() && U.isNullValue())
return ConstantRange(getBitWidth());
return ConstantRange(std::move(L), std::move(U));
@@ -757,7 +757,8 @@ ConstantRange::multiply(const ConstantRange &Other) const {
// from one positive number to another which is as good as we can generate.
// In this case, skip the extra work of generating signed ranges which aren't
// going to be better than this range.
- if (!UR.isWrappedSet() && UR.getLower().isNonNegative())
+ if (!UR.isWrappedSet() &&
+ (UR.getUpper().isNonNegative() || UR.getUpper().isMinSignedValue()))
return UR;
// Now the signed range. Because we could be dealing with negative numbers
@@ -834,7 +835,7 @@ ConstantRange::umin(const ConstantRange &Other) const {
ConstantRange
ConstantRange::udiv(const ConstantRange &RHS) const {
- if (isEmptySet() || RHS.isEmptySet() || RHS.getUnsignedMax() == 0)
+ if (isEmptySet() || RHS.isEmptySet() || RHS.getUnsignedMax().isNullValue())
return ConstantRange(getBitWidth(), /*isFullSet=*/false);
if (RHS.isFullSet())
return ConstantRange(getBitWidth(), /*isFullSet=*/true);
@@ -842,7 +843,7 @@ ConstantRange::udiv(const ConstantRange &RHS) const {
APInt Lower = getUnsignedMin().udiv(RHS.getUnsignedMax());
APInt RHS_umin = RHS.getUnsignedMin();
- if (RHS_umin == 0) {
+ if (RHS_umin.isNullValue()) {
// We want the lowest value in RHS excluding zero. Usually that would be 1
// except for a range in the form of [X, 1) in which case it would be X.
if (RHS.getUpper() == 1)
@@ -892,29 +893,33 @@ ConstantRange::shl(const ConstantRange &Other) const {
if (isEmptySet() || Other.isEmptySet())
return ConstantRange(getBitWidth(), /*isFullSet=*/false);
- APInt min = getUnsignedMin().shl(Other.getUnsignedMin());
- APInt max = getUnsignedMax().shl(Other.getUnsignedMax());
+ APInt max = getUnsignedMax();
+ APInt Other_umax = Other.getUnsignedMax();
- // there's no overflow!
- APInt Zeros(getBitWidth(), getUnsignedMax().countLeadingZeros());
- if (Zeros.ugt(Other.getUnsignedMax()))
- return ConstantRange(std::move(min), std::move(max) + 1);
+  // The largest shift may push set bits off the top; be conservative.
+ if (Other_umax.uge(max.countLeadingZeros()))
+ return ConstantRange(getBitWidth(), /*isFullSet=*/true);
// FIXME: implement the other tricky cases
- return ConstantRange(getBitWidth(), /*isFullSet=*/true);
+
+ APInt min = getUnsignedMin();
+ min <<= Other.getUnsignedMin();
+ max <<= Other_umax;
+
+ return ConstantRange(std::move(min), std::move(max) + 1);
}
ConstantRange
ConstantRange::lshr(const ConstantRange &Other) const {
if (isEmptySet() || Other.isEmptySet())
return ConstantRange(getBitWidth(), /*isFullSet=*/false);
-
- APInt max = getUnsignedMax().lshr(Other.getUnsignedMin());
+
+ APInt max = getUnsignedMax().lshr(Other.getUnsignedMin()) + 1;
APInt min = getUnsignedMin().lshr(Other.getUnsignedMax());
- if (min == max + 1)
+ if (min == max)
return ConstantRange(getBitWidth(), /*isFullSet=*/true);
- return ConstantRange(std::move(min), std::move(max) + 1);
+ return ConstantRange(std::move(min), std::move(max));
}
ConstantRange ConstantRange::inverse() const {
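With the corrected shl above, any shift amount large enough to push set bits
past the top yields the conservative full set rather than a bogus tight range.
Two worked i8 instances (library API as in this file):

ConstantRange X(APInt(8, 128), APInt(8, 129)); // exactly {128} = 0b10000000
ConstantRange S(APInt(8, 1), APInt(8, 2));     // shift amounts {1}
assert(X.shl(S).isFullSet());                  // 128 << 1 overflows i8

ConstantRange Y(APInt(8, 3), APInt(8, 4));     // {3}
ConstantRange T(APInt(8, 2), APInt(8, 3));     // {2}
assert(Y.shl(T) == ConstantRange(APInt(8, 12), APInt(8, 13))); // precise {12}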
diff --git a/lib/IR/Constants.cpp b/lib/IR/Constants.cpp
index ffc8f2e4303b..4b9d89cda539 100644
--- a/lib/IR/Constants.cpp
+++ b/lib/IR/Constants.cpp
@@ -30,7 +30,7 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
-#include <cstdarg>
+
using namespace llvm;
//===----------------------------------------------------------------------===//
@@ -966,16 +966,6 @@ Constant *ConstantStruct::get(StructType *ST, ArrayRef<Constant*> V) {
return ST->getContext().pImpl->StructConstants.getOrCreate(ST, V);
}
-Constant *ConstantStruct::get(StructType *T, ...) {
- va_list ap;
- SmallVector<Constant*, 8> Values;
- va_start(ap, T);
- while (Constant *Val = va_arg(ap, llvm::Constant*))
- Values.push_back(Val);
- va_end(ap);
- return get(T, Values);
-}
-
ConstantVector::ConstantVector(VectorType *T, ArrayRef<Constant *> V)
: ConstantAggregate(T, ConstantVectorVal, V) {
assert(V.size() == T->getNumElements() &&
@@ -1810,8 +1800,7 @@ Constant *ConstantExpr::getSizeOf(Type* Ty) {
Constant *ConstantExpr::getAlignOf(Type* Ty) {
// alignof is implemented as: (i64) gep ({i1,Ty}*)null, 0, 1
// Note that a non-inbounds gep is used, as null isn't within any object.
- Type *AligningTy =
- StructType::get(Type::getInt1Ty(Ty->getContext()), Ty, nullptr);
+ Type *AligningTy = StructType::get(Type::getInt1Ty(Ty->getContext()), Ty);
Constant *NullPtr = Constant::getNullValue(AligningTy->getPointerTo(0));
Constant *Zero = ConstantInt::get(Type::getInt64Ty(Ty->getContext()), 0);
Constant *One = ConstantInt::get(Type::getInt32Ty(Ty->getContext()), 1);
diff --git a/lib/IR/ConstantsContext.h b/lib/IR/ConstantsContext.h
index eda751d8af4a..25eb9452d9d0 100644
--- a/lib/IR/ConstantsContext.h
+++ b/lib/IR/ConstantsContext.h
@@ -22,6 +22,7 @@
#include "llvm/ADT/None.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/InlineAsm.h"
@@ -387,31 +388,34 @@ struct ConstantExprKeyType;
template <class ConstantClass> struct ConstantInfo;
template <> struct ConstantInfo<ConstantExpr> {
- typedef ConstantExprKeyType ValType;
- typedef Type TypeClass;
+ using ValType = ConstantExprKeyType;
+ using TypeClass = Type;
};
template <> struct ConstantInfo<InlineAsm> {
- typedef InlineAsmKeyType ValType;
- typedef PointerType TypeClass;
+ using ValType = InlineAsmKeyType;
+ using TypeClass = PointerType;
};
template <> struct ConstantInfo<ConstantArray> {
- typedef ConstantAggrKeyType<ConstantArray> ValType;
- typedef ArrayType TypeClass;
+ using ValType = ConstantAggrKeyType<ConstantArray>;
+ using TypeClass = ArrayType;
};
template <> struct ConstantInfo<ConstantStruct> {
- typedef ConstantAggrKeyType<ConstantStruct> ValType;
- typedef StructType TypeClass;
+ using ValType = ConstantAggrKeyType<ConstantStruct>;
+ using TypeClass = StructType;
};
template <> struct ConstantInfo<ConstantVector> {
- typedef ConstantAggrKeyType<ConstantVector> ValType;
- typedef VectorType TypeClass;
+ using ValType = ConstantAggrKeyType<ConstantVector>;
+ using TypeClass = VectorType;
};
template <class ConstantClass> struct ConstantAggrKeyType {
ArrayRef<Constant *> Operands;
+
ConstantAggrKeyType(ArrayRef<Constant *> Operands) : Operands(Operands) {}
+
ConstantAggrKeyType(ArrayRef<Constant *> Operands, const ConstantClass *)
: Operands(Operands) {}
+
ConstantAggrKeyType(const ConstantClass *C,
SmallVectorImpl<Constant *> &Storage) {
assert(Storage.empty() && "Expected empty storage");
@@ -437,7 +441,8 @@ template <class ConstantClass> struct ConstantAggrKeyType {
return hash_combine_range(Operands.begin(), Operands.end());
}
- typedef typename ConstantInfo<ConstantClass>::TypeClass TypeClass;
+ using TypeClass = typename ConstantInfo<ConstantClass>::TypeClass;
+
ConstantClass *create(TypeClass *Ty) const {
return new (Operands.size()) ConstantClass(Ty, Operands);
}
@@ -457,6 +462,7 @@ struct InlineAsmKeyType {
: AsmString(AsmString), Constraints(Constraints), FTy(FTy),
HasSideEffects(HasSideEffects), IsAlignStack(IsAlignStack),
AsmDialect(AsmDialect) {}
+
InlineAsmKeyType(const InlineAsm *Asm, SmallVectorImpl<Constant *> &)
: AsmString(Asm->getAsmString()), Constraints(Asm->getConstraintString()),
FTy(Asm->getFunctionType()), HasSideEffects(Asm->hasSideEffects()),
@@ -483,7 +489,8 @@ struct InlineAsmKeyType {
AsmDialect, FTy);
}
- typedef ConstantInfo<InlineAsm>::TypeClass TypeClass;
+ using TypeClass = ConstantInfo<InlineAsm>::TypeClass;
+
InlineAsm *create(TypeClass *Ty) const {
assert(PointerType::getUnqual(FTy) == Ty);
return new InlineAsm(FTy, AsmString, Constraints, HasSideEffects,
@@ -507,11 +514,13 @@ struct ConstantExprKeyType {
: Opcode(Opcode), SubclassOptionalData(SubclassOptionalData),
SubclassData(SubclassData), Ops(Ops), Indexes(Indexes),
ExplicitTy(ExplicitTy) {}
+
ConstantExprKeyType(ArrayRef<Constant *> Operands, const ConstantExpr *CE)
: Opcode(CE->getOpcode()),
SubclassOptionalData(CE->getRawSubclassOptionalData()),
SubclassData(CE->isCompare() ? CE->getPredicate() : 0), Ops(Operands),
Indexes(CE->hasIndices() ? CE->getIndices() : ArrayRef<unsigned>()) {}
+
ConstantExprKeyType(const ConstantExpr *CE,
SmallVectorImpl<Constant *> &Storage)
: Opcode(CE->getOpcode()),
@@ -553,7 +562,8 @@ struct ConstantExprKeyType {
hash_combine_range(Indexes.begin(), Indexes.end()));
}
- typedef ConstantInfo<ConstantExpr>::TypeClass TypeClass;
+ using TypeClass = ConstantInfo<ConstantExpr>::TypeClass;
+
ConstantExpr *create(TypeClass *Ty) const {
switch (Opcode) {
default:
@@ -594,16 +604,17 @@ struct ConstantExprKeyType {
template <class ConstantClass> class ConstantUniqueMap {
public:
- typedef typename ConstantInfo<ConstantClass>::ValType ValType;
- typedef typename ConstantInfo<ConstantClass>::TypeClass TypeClass;
- typedef std::pair<TypeClass *, ValType> LookupKey;
+ using ValType = typename ConstantInfo<ConstantClass>::ValType;
+ using TypeClass = typename ConstantInfo<ConstantClass>::TypeClass;
+ using LookupKey = std::pair<TypeClass *, ValType>;
/// Key and hash together, so that we compute the hash only once and reuse it.
- typedef std::pair<unsigned, LookupKey> LookupKeyHashed;
+ using LookupKeyHashed = std::pair<unsigned, LookupKey>;
private:
struct MapInfo {
- typedef DenseMapInfo<ConstantClass *> ConstantClassInfo;
+ using ConstantClassInfo = DenseMapInfo<ConstantClass *>;
+
static inline ConstantClass *getEmptyKey() {
return ConstantClassInfo::getEmptyKey();
}
@@ -643,7 +654,7 @@ private:
};
public:
- typedef DenseSet<ConstantClass *, MapInfo> MapTy;
+ using MapTy = DenseSet<ConstantClass *, MapInfo>;
private:
MapTy Map;
diff --git a/lib/IR/DebugInfoMetadata.cpp b/lib/IR/DebugInfoMetadata.cpp
index cdbe237766a3..e6c49cad0722 100644
--- a/lib/IR/DebugInfoMetadata.cpp
+++ b/lib/IR/DebugInfoMetadata.cpp
@@ -672,6 +672,24 @@ void DIExpression::appendOffset(SmallVectorImpl<uint64_t> &Ops,
}
}
+bool DIExpression::extractIfOffset(int64_t &Offset) const {
+ if (getNumElements() == 0) {
+ Offset = 0;
+ return true;
+ }
+ if (getNumElements() != 2)
+ return false;
+ if (Elements[0] == dwarf::DW_OP_plus) {
+ Offset = Elements[1];
+ return true;
+ }
+ if (Elements[0] == dwarf::DW_OP_minus) {
+ Offset = -Elements[1];
+ return true;
+ }
+ return false;
+}
+
DIExpression *DIExpression::prepend(const DIExpression *Expr, bool Deref,
int64_t Offset, bool StackValue) {
SmallVector<uint64_t, 8> Ops;
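extractIfOffset above accepts exactly three expression shapes and rejects
everything else, letting callers take a fast path for pure constant offsets.
The contract, sketched (Expr and both consumers are hypothetical):

// {}                -> true,  Offset = 0
// {DW_OP_plus,  8}  -> true,  Offset = +8
// {DW_OP_minus, 8}  -> true,  Offset = -8
// anything else     -> false, Offset left untouched
int64_t Offset;
if (Expr->extractIfOffset(Offset))
  applyConstantOffset(Offset); // hypothetical fast path
else
  emitFullExpression(Expr);    // hypothetical general path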
diff --git a/lib/IR/DebugLoc.cpp b/lib/IR/DebugLoc.cpp
index f31074a7ad44..3168ec6944a3 100644
--- a/lib/IR/DebugLoc.cpp
+++ b/lib/IR/DebugLoc.cpp
@@ -8,6 +8,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "LLVMContextImpl.h"
#include "llvm/IR/DebugInfo.h"
using namespace llvm;
@@ -66,6 +67,119 @@ DebugLoc DebugLoc::get(unsigned Line, unsigned Col, const MDNode *Scope,
const_cast<MDNode *>(InlinedAt));
}
+DebugLoc DebugLoc::appendInlinedAt(DebugLoc DL, DILocation *InlinedAt,
+ LLVMContext &Ctx,
+ DenseMap<const MDNode *, MDNode *> &Cache,
+ bool ReplaceLast) {
+ SmallVector<DILocation *, 3> InlinedAtLocations;
+ DILocation *Last = InlinedAt;
+ DILocation *CurInlinedAt = DL;
+
+ // Gather all the inlined-at nodes.
+ while (DILocation *IA = CurInlinedAt->getInlinedAt()) {
+ // Skip any we've already built nodes for.
+ if (auto *Found = Cache[IA]) {
+ Last = cast<DILocation>(Found);
+ break;
+ }
+
+ if (ReplaceLast && !IA->getInlinedAt())
+ break;
+ InlinedAtLocations.push_back(IA);
+ CurInlinedAt = IA;
+ }
+
+ // Starting from the top, rebuild the nodes to point to the new inlined-at
+ // location (then rebuilding the rest of the chain behind it) and update the
+ // map of already-constructed inlined-at nodes.
+ for (const DILocation *MD : reverse(InlinedAtLocations))
+ Cache[MD] = Last = DILocation::getDistinct(
+ Ctx, MD->getLine(), MD->getColumn(), MD->getScope(), Last);
+
+ return Last;
+}
+
+/// Reparent \c Scope from \c OrigSP to \c NewSP.
+static DIScope *reparentScope(LLVMContext &Ctx, DIScope *Scope,
+ DISubprogram *OrigSP, DISubprogram *NewSP,
+ DenseMap<const MDNode *, MDNode *> &Cache) {
+ SmallVector<DIScope *, 3> ScopeChain;
+ DIScope *Last = NewSP;
+ DIScope *CurScope = Scope;
+ do {
+ if (auto *SP = dyn_cast<DISubprogram>(CurScope)) {
+ // Don't rewrite this scope chain if it doesn't lead to the replaced SP.
+ if (SP != OrigSP)
+ return Scope;
+ Cache.insert({OrigSP, NewSP});
+ break;
+ }
+ if (auto *Found = Cache[CurScope]) {
+ Last = cast<DIScope>(Found);
+ break;
+ }
+ ScopeChain.push_back(CurScope);
+ } while ((CurScope = CurScope->getScope().resolve()));
+
+ // Starting from the top, rebuild the nodes to point to the new inlined-at
+ // location (then rebuilding the rest of the chain behind it) and update the
+ // map of already-constructed inlined-at nodes.
+ for (const DIScope *MD : reverse(ScopeChain)) {
+ if (auto *LB = dyn_cast<DILexicalBlock>(MD))
+ Cache[MD] = Last = DILexicalBlock::getDistinct(
+ Ctx, Last, LB->getFile(), LB->getLine(), LB->getColumn());
+ else if (auto *LB = dyn_cast<DILexicalBlockFile>(MD))
+ Cache[MD] = Last = DILexicalBlockFile::getDistinct(
+ Ctx, Last, LB->getFile(), LB->getDiscriminator());
+ else
+ llvm_unreachable("illegal parent scope");
+ }
+ return Last;
+}
+
+void DebugLoc::reparentDebugInfo(Instruction &I, DISubprogram *OrigSP,
+ DISubprogram *NewSP,
+ DenseMap<const MDNode *, MDNode *> &Cache) {
+ auto DL = I.getDebugLoc();
+ if (!OrigSP || !NewSP || OrigSP == NewSP || !DL)
+ return;
+
+ // Reparent the debug location.
+ auto &Ctx = I.getContext();
+ DILocation *InlinedAt = DL->getInlinedAt();
+ if (InlinedAt) {
+ while (auto *IA = InlinedAt->getInlinedAt())
+ InlinedAt = IA;
+ auto NewScope =
+ reparentScope(Ctx, InlinedAt->getScope(), OrigSP, NewSP, Cache);
+ InlinedAt =
+ DebugLoc::get(InlinedAt->getLine(), InlinedAt->getColumn(), NewScope);
+ }
+ I.setDebugLoc(
+ DebugLoc::get(DL.getLine(), DL.getCol(),
+ reparentScope(Ctx, DL->getScope(), OrigSP, NewSP, Cache),
+ DebugLoc::appendInlinedAt(DL, InlinedAt, Ctx, Cache,
+ ReplaceLastInlinedAt)));
+
+ // Fix up debug variables to point to NewSP.
+ auto reparentVar = [&](DILocalVariable *Var) {
+ return DILocalVariable::getDistinct(
+ Ctx,
+ cast<DILocalScope>(
+ reparentScope(Ctx, Var->getScope(), OrigSP, NewSP, Cache)),
+ Var->getName(), Var->getFile(), Var->getLine(), Var->getType(),
+ Var->getArg(), Var->getFlags(), Var->getAlignInBits());
+ };
+ if (auto *DbgValue = dyn_cast<DbgValueInst>(&I)) {
+ auto *Var = DbgValue->getVariable();
+ I.setOperand(2, MetadataAsValue::get(Ctx, reparentVar(Var)));
+ } else if (auto *DbgDeclare = dyn_cast<DbgDeclareInst>(&I)) {
+ auto *Var = DbgDeclare->getVariable();
+ I.setOperand(1, MetadataAsValue::get(Ctx, reparentVar(Var)));
+ }
+}
+
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void DebugLoc::dump() const {
if (!Loc)
diff --git a/lib/IR/DiagnosticInfo.cpp b/lib/IR/DiagnosticInfo.cpp
index 395b6158e0c8..e73f53f3202d 100644
--- a/lib/IR/DiagnosticInfo.cpp
+++ b/lib/IR/DiagnosticInfo.cpp
@@ -12,20 +12,31 @@
// Diagnostics reporting is still done as part of the LLVMContext.
//===----------------------------------------------------------------------===//
-#include "llvm/IR/DiagnosticInfo.h"
-#include "LLVMContextImpl.h"
+#include "llvm/ADT/iterator_range.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
-#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instruction.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/Regex.h"
#include <atomic>
+#include <cassert>
+#include <memory>
#include <string>
using namespace llvm;
@@ -53,6 +64,8 @@ struct PassRemarksOpt {
}
};
+} // end anonymous namespace
+
static PassRemarksOpt PassRemarksOptLoc;
static PassRemarksOpt PassRemarksMissedOptLoc;
static PassRemarksOpt PassRemarksAnalysisOptLoc;
@@ -85,7 +98,6 @@ PassRemarksAnalysis(
"the given regular expression"),
cl::Hidden, cl::location(PassRemarksAnalysisOptLoc), cl::ValueRequired,
cl::ZeroOrMore);
-}
int llvm::getNextAvailablePluginDiagnosticKind() {
static std::atomic<int> PluginKindID(DK_FirstPluginKind);
@@ -97,8 +109,7 @@ const char *OptimizationRemarkAnalysis::AlwaysPrint = "";
DiagnosticInfoInlineAsm::DiagnosticInfoInlineAsm(const Instruction &I,
const Twine &MsgStr,
DiagnosticSeverity Severity)
- : DiagnosticInfo(DK_InlineAsm, Severity), LocCookie(0), MsgStr(MsgStr),
- Instr(&I) {
+ : DiagnosticInfo(DK_InlineAsm, Severity), MsgStr(MsgStr), Instr(&I) {
if (const MDNode *SrcLoc = I.getMetadata("srcloc")) {
if (SrcLoc->getNumOperands() != 0)
if (const auto *CI =
@@ -193,7 +204,7 @@ DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, const Value *V
// Only include names that correspond to user variables. FIXME: we should use
// debug info if available to get the name of the user variable.
if (isa<llvm::Argument>(V) || isa<GlobalValue>(V))
- Val = GlobalValue::getRealLinkageName(V->getName());
+ Val = GlobalValue::dropLLVMManglingEscape(V->getName());
else if (isa<Constant>(V)) {
raw_string_ostream OS(Val);
V->printAsOperand(OS, /*PrintType=*/false);
diff --git a/lib/IR/Function.cpp b/lib/IR/Function.cpp
index 58c060550322..16a9e51b8306 100644
--- a/lib/IR/Function.cpp
+++ b/lib/IR/Function.cpp
@@ -1,4 +1,4 @@
-//===-- Function.cpp - Implement the Global object classes ----------------===//
+//===- Function.cpp - Implement the Global object classes -----------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,21 +11,51 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/IR/Function.h"
#include "LLVMContextImpl.h"
#include "SymbolTableListTraitsImpl.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/SymbolTableListTraits.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <string>
+
using namespace llvm;
// Explicit instantiations of SymbolTableListTraits since some of the methods
@@ -36,7 +66,7 @@ template class llvm::SymbolTableListTraits<BasicBlock>;
// Argument Implementation
//===----------------------------------------------------------------------===//
-void Argument::anchor() { }
+void Argument::anchor() {}
Argument::Argument(Type *Ty, const Twine &Name, Function *Par, unsigned ArgNo)
: Value(Ty, Value::ArgumentVal), Parent(Par), ArgNo(ArgNo) {
@@ -186,7 +216,7 @@ Function::Function(FunctionType *Ty, LinkageTypes Linkage, const Twine &name,
Module *ParentModule)
: GlobalObject(Ty, Value::FunctionVal,
OperandTraits<Function>::op_begin(this), 0, Linkage, name),
- Arguments(nullptr), NumArgs(Ty->getNumParams()) {
+ NumArgs(Ty->getNumParams()) {
assert(FunctionType::isValidReturnType(getReturnType()) &&
"invalid return type");
setGlobalObjectSubClassData(0);
@@ -386,24 +416,20 @@ void Function::clearGC() {
/// Copy all additional attributes (those not needed to create a Function) from
/// the Function Src to this one.
-void Function::copyAttributesFrom(const GlobalValue *Src) {
+void Function::copyAttributesFrom(const Function *Src) {
GlobalObject::copyAttributesFrom(Src);
- const Function *SrcF = dyn_cast<Function>(Src);
- if (!SrcF)
- return;
-
- setCallingConv(SrcF->getCallingConv());
- setAttributes(SrcF->getAttributes());
- if (SrcF->hasGC())
- setGC(SrcF->getGC());
+ setCallingConv(Src->getCallingConv());
+ setAttributes(Src->getAttributes());
+ if (Src->hasGC())
+ setGC(Src->getGC());
else
clearGC();
- if (SrcF->hasPersonalityFn())
- setPersonalityFn(SrcF->getPersonalityFn());
- if (SrcF->hasPrefixData())
- setPrefixData(SrcF->getPrefixData());
- if (SrcF->hasPrologueData())
- setPrologueData(SrcF->getPrologueData());
+ if (Src->hasPersonalityFn())
+ setPersonalityFn(Src->getPersonalityFn());
+ if (Src->hasPrefixData())
+ setPrefixData(Src->getPrefixData());
+ if (Src->hasPrologueData())
+ setPrologueData(Src->getPrologueData());
}
/// Table of string intrinsic names indexed by enum value.
@@ -486,10 +512,10 @@ void Function::recalculateIntrinsicID() {
static std::string getMangledTypeStr(Type* Ty) {
std::string Result;
if (PointerType* PTyp = dyn_cast<PointerType>(Ty)) {
- Result += "p" + llvm::utostr(PTyp->getAddressSpace()) +
+ Result += "p" + utostr(PTyp->getAddressSpace()) +
getMangledTypeStr(PTyp->getElementType());
} else if (ArrayType* ATyp = dyn_cast<ArrayType>(Ty)) {
- Result += "a" + llvm::utostr(ATyp->getNumElements()) +
+ Result += "a" + utostr(ATyp->getNumElements()) +
getMangledTypeStr(ATyp->getElementType());
} else if (StructType *STyp = dyn_cast<StructType>(Ty)) {
if (!STyp->isLiteral()) {
@@ -534,7 +560,6 @@ std::string Intrinsic::getName(ID id, ArrayRef<Type*> Tys) {
return Result;
}
-
/// IIT_Info - These are enumerators that describe the entries returned by the
/// getIntrinsicInfoTableEntries function.
///
@@ -585,9 +610,10 @@ enum IIT_Info {
static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
SmallVectorImpl<Intrinsic::IITDescriptor> &OutputTable) {
+ using namespace Intrinsic;
+
IIT_Info Info = IIT_Info(Infos[NextElt++]);
unsigned StructElts = 2;
- using namespace Intrinsic;
switch (Info) {
case IIT_Done:
@@ -742,7 +768,6 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
llvm_unreachable("unhandled");
}
-
#define GET_INTRINSIC_GENERATOR_GLOBAL
#include "llvm/IR/Intrinsics.gen"
#undef GET_INTRINSIC_GENERATOR_GLOBAL
@@ -780,10 +805,10 @@ void Intrinsic::getIntrinsicInfoTableEntries(ID id,
DecodeIITType(NextElt, IITEntries, T);
}
-
static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
ArrayRef<Type*> Tys, LLVMContext &Context) {
using namespace Intrinsic;
+
IITDescriptor D = Infos.front();
Infos = Infos.slice(1);
@@ -855,12 +880,10 @@ static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
case IITDescriptor::VecOfAnyPtrsToElt:
// Return the overloaded type (which determines the pointers address space)
return Tys[D.getOverloadArgNumber()];
- }
+ }
llvm_unreachable("unhandled");
}
-
-
FunctionType *Intrinsic::getType(LLVMContext &Context,
ID id, ArrayRef<Type*> Tys) {
SmallVector<IITDescriptor, 8> Table;
diff --git a/lib/IR/Globals.cpp b/lib/IR/Globals.cpp
index 5f338f58d940..17d27b016cf2 100644
--- a/lib/IR/Globals.cpp
+++ b/lib/IR/Globals.cpp
@@ -69,6 +69,30 @@ void GlobalValue::copyAttributesFrom(const GlobalValue *Src) {
setDLLStorageClass(Src->getDLLStorageClass());
}
+void GlobalValue::removeFromParent() {
+ switch (getValueID()) {
+#define HANDLE_GLOBAL_VALUE(NAME) \
+ case Value::NAME##Val: \
+ return static_cast<NAME *>(this)->removeFromParent();
+#include "llvm/IR/Value.def"
+ default:
+ break;
+ }
+ llvm_unreachable("not a global");
+}
+
+void GlobalValue::eraseFromParent() {
+ switch (getValueID()) {
+#define HANDLE_GLOBAL_VALUE(NAME) \
+ case Value::NAME##Val: \
+ return static_cast<NAME *>(this)->eraseFromParent();
+#include "llvm/IR/Value.def"
+ default:
+ break;
+ }
+ llvm_unreachable("not a global");
+}
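
The two dispatchers above (and the TerminatorInst successor accessors later in this diff) follow LLVM's .def-file idiom: a single X-macro list expands into each switch, so every subclass is dispatched by its value ID without a virtual call, and a newly added subclass cannot silently miss a case. A self-contained toy of the pattern (simplified; not LLVM code — the real list lives in llvm/IR/Value.def):

    #include <cstdio>

    #define TOY_GLOBAL_VALUES(X) X(GlobalVariable) X(Function)

    struct GlobalValue {
      enum ValueID {
    #define HANDLE(NAME) NAME##Val,
        TOY_GLOBAL_VALUES(HANDLE)
    #undef HANDLE
      };
      ValueID ID;
      explicit GlobalValue(ValueID ID) : ID(ID) {}
      void removeFromParent(); // dispatched by ID below, no vtable required
    };

    struct GlobalVariable : GlobalValue {
      GlobalVariable() : GlobalValue(GlobalVariableVal) {}
      void removeFromParent() { std::puts("variable unlinked"); }
    };

    struct Function : GlobalValue {
      Function() : GlobalValue(FunctionVal) {}
      void removeFromParent() { std::puts("function unlinked"); }
    };

    void GlobalValue::removeFromParent() {
      switch (ID) {
    #define HANDLE(NAME) \
      case NAME##Val:    \
        return static_cast<NAME *>(this)->removeFromParent();
        TOY_GLOBAL_VALUES(HANDLE)
    #undef HANDLE
      }
    }

    int main() { Function F; static_cast<GlobalValue &>(F).removeFromParent(); }
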
+
unsigned GlobalValue::getAlignment() const {
if (auto *GA = dyn_cast<GlobalAlias>(this)) {
// In general we cannot compute this at the IR level, but we try.
@@ -93,12 +117,10 @@ void GlobalObject::setAlignment(unsigned Align) {
assert(getAlignment() == Align && "Alignment representation error!");
}
-void GlobalObject::copyAttributesFrom(const GlobalValue *Src) {
+void GlobalObject::copyAttributesFrom(const GlobalObject *Src) {
GlobalValue::copyAttributesFrom(Src);
- if (const auto *GV = dyn_cast<GlobalObject>(Src)) {
- setAlignment(GV->getAlignment());
- setSection(GV->getSection());
- }
+ setAlignment(Src->getAlignment());
+ setSection(Src->getSection());
}
std::string GlobalValue::getGlobalIdentifier(StringRef Name,
@@ -233,7 +255,7 @@ bool GlobalValue::canIncreaseAlignment() const {
const GlobalObject *GlobalValue::getBaseObject() const {
if (auto *GO = dyn_cast<GlobalObject>(this))
return GO;
- if (auto *GA = dyn_cast<GlobalAlias>(this))
+ if (auto *GA = dyn_cast<GlobalIndirectSymbol>(this))
return GA->getBaseObject();
return nullptr;
}
@@ -333,12 +355,11 @@ void GlobalVariable::setInitializer(Constant *InitVal) {
/// Copy all additional attributes (those not needed to create a GlobalVariable)
/// from the GlobalVariable Src to this one.
-void GlobalVariable::copyAttributesFrom(const GlobalValue *Src) {
+void GlobalVariable::copyAttributesFrom(const GlobalVariable *Src) {
GlobalObject::copyAttributesFrom(Src);
- if (const GlobalVariable *SrcVar = dyn_cast<GlobalVariable>(Src)) {
- setThreadLocalMode(SrcVar->getThreadLocalMode());
- setExternallyInitialized(SrcVar->isExternallyInitialized());
- }
+ setThreadLocalMode(Src->getThreadLocalMode());
+ setExternallyInitialized(Src->isExternallyInitialized());
+ setAttributes(Src->getAttributes());
}
void GlobalVariable::dropAllReferences() {
diff --git a/lib/IR/IRBuilder.cpp b/lib/IR/IRBuilder.cpp
index e265a823687f..3477c087967f 100644
--- a/lib/IR/IRBuilder.cpp
+++ b/lib/IR/IRBuilder.cpp
@@ -161,6 +161,94 @@ CreateMemMove(Value *Dst, Value *Src, Value *Size, unsigned Align,
return CI;
}
+static CallInst *getReductionIntrinsic(IRBuilderBase *Builder, Intrinsic::ID ID,
+ Value *Src) {
+ Module *M = Builder->GetInsertBlock()->getParent()->getParent();
+ Value *Ops[] = {Src};
+ Type *Tys[] = { Src->getType()->getVectorElementType(), Src->getType() };
+ auto Decl = Intrinsic::getDeclaration(M, ID, Tys);
+ return createCallHelper(Decl, Ops, Builder);
+}
+
+CallInst *IRBuilderBase::CreateFAddReduce(Value *Acc, Value *Src) {
+ Module *M = GetInsertBlock()->getParent()->getParent();
+ Value *Ops[] = {Acc, Src};
+ Type *Tys[] = {Src->getType()->getVectorElementType(), Acc->getType(),
+ Src->getType()};
+ auto Decl = Intrinsic::getDeclaration(
+ M, Intrinsic::experimental_vector_reduce_fadd, Tys);
+ return createCallHelper(Decl, Ops, this);
+}
+
+CallInst *IRBuilderBase::CreateFMulReduce(Value *Acc, Value *Src) {
+ Module *M = GetInsertBlock()->getParent()->getParent();
+ Value *Ops[] = {Acc, Src};
+ Type *Tys[] = {Src->getType()->getVectorElementType(), Acc->getType(),
+ Src->getType()};
+ auto Decl = Intrinsic::getDeclaration(
+ M, Intrinsic::experimental_vector_reduce_fmul, Tys);
+ return createCallHelper(Decl, Ops, this);
+}
+
+CallInst *IRBuilderBase::CreateAddReduce(Value *Src) {
+ return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_add,
+ Src);
+}
+
+CallInst *IRBuilderBase::CreateMulReduce(Value *Src) {
+ return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_mul,
+ Src);
+}
+
+CallInst *IRBuilderBase::CreateAndReduce(Value *Src) {
+ return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_and,
+ Src);
+}
+
+CallInst *IRBuilderBase::CreateOrReduce(Value *Src) {
+ return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_or,
+ Src);
+}
+
+CallInst *IRBuilderBase::CreateXorReduce(Value *Src) {
+ return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_xor,
+ Src);
+}
+
+CallInst *IRBuilderBase::CreateIntMaxReduce(Value *Src, bool IsSigned) {
+ auto ID = IsSigned ? Intrinsic::experimental_vector_reduce_smax
+ : Intrinsic::experimental_vector_reduce_umax;
+ return getReductionIntrinsic(this, ID, Src);
+}
+
+CallInst *IRBuilderBase::CreateIntMinReduce(Value *Src, bool IsSigned) {
+ auto ID = IsSigned ? Intrinsic::experimental_vector_reduce_smin
+ : Intrinsic::experimental_vector_reduce_umin;
+ return getReductionIntrinsic(this, ID, Src);
+}
+
+CallInst *IRBuilderBase::CreateFPMaxReduce(Value *Src, bool NoNaN) {
+ auto Rdx = getReductionIntrinsic(
+ this, Intrinsic::experimental_vector_reduce_fmax, Src);
+ if (NoNaN) {
+ FastMathFlags FMF;
+ FMF.setNoNaNs();
+ Rdx->setFastMathFlags(FMF);
+ }
+ return Rdx;
+}
+
+CallInst *IRBuilderBase::CreateFPMinReduce(Value *Src, bool NoNaN) {
+ auto Rdx = getReductionIntrinsic(
+ this, Intrinsic::experimental_vector_reduce_fmin, Src);
+ if (NoNaN) {
+ FastMathFlags FMF;
+ FMF.setNoNaNs();
+ Rdx->setFastMathFlags(FMF);
+ }
+ return Rdx;
+}
+
CallInst *IRBuilderBase::CreateLifetimeStart(Value *Ptr, ConstantInt *Size) {
assert(isa<PointerType>(Ptr->getType()) &&
"lifetime.start only applies to pointers.");
diff --git a/lib/IR/Instruction.cpp b/lib/IR/Instruction.cpp
index 906a28a5c887..91b9d9232b54 100644
--- a/lib/IR/Instruction.cpp
+++ b/lib/IR/Instruction.cpp
@@ -534,6 +534,30 @@ bool Instruction::isAtomic() const {
}
}
+bool Instruction::hasAtomicLoad() const {
+ assert(isAtomic());
+ switch (getOpcode()) {
+ default:
+ return false;
+ case Instruction::AtomicCmpXchg:
+ case Instruction::AtomicRMW:
+ case Instruction::Load:
+ return true;
+ }
+}
+
+bool Instruction::hasAtomicStore() const {
+ assert(isAtomic());
+ switch (getOpcode()) {
+ default:
+ return false;
+ case Instruction::AtomicCmpXchg:
+ case Instruction::AtomicRMW:
+ case Instruction::Store:
+ return true;
+ }
+}
+
bool Instruction::mayThrow() const {
if (const CallInst *CI = dyn_cast<CallInst>(this))
return !CI->doesNotThrow();
diff --git a/lib/IR/Instructions.cpp b/lib/IR/Instructions.cpp
index a60cc375d568..5a5b9c0d06bb 100644
--- a/lib/IR/Instructions.cpp
+++ b/lib/IR/Instructions.cpp
@@ -1,4 +1,4 @@
-//===-- Instructions.cpp - Implement the LLVM instructions ----------------===//
+//===- Instructions.cpp - Implement the LLVM instructions -----------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,18 +12,36 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/IR/Instructions.h"
#include "LLVMContextImpl.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
-#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <vector>
+
using namespace llvm;
//===----------------------------------------------------------------------===//
@@ -42,7 +60,42 @@ User::op_iterator CallSite::getCallee() const {
//===----------------------------------------------------------------------===//
// Out of line virtual method, so the vtable, etc has a home.
-TerminatorInst::~TerminatorInst() {
+TerminatorInst::~TerminatorInst() = default;
+
+unsigned TerminatorInst::getNumSuccessors() const {
+ switch (getOpcode()) {
+#define HANDLE_TERM_INST(N, OPC, CLASS) \
+ case Instruction::OPC: \
+ return static_cast<const CLASS *>(this)->getNumSuccessorsV();
+#include "llvm/IR/Instruction.def"
+ default:
+ break;
+ }
+ llvm_unreachable("not a terminator");
+}
+
+BasicBlock *TerminatorInst::getSuccessor(unsigned idx) const {
+ switch (getOpcode()) {
+#define HANDLE_TERM_INST(N, OPC, CLASS) \
+ case Instruction::OPC: \
+ return static_cast<const CLASS *>(this)->getSuccessorV(idx);
+#include "llvm/IR/Instruction.def"
+ default:
+ break;
+ }
+ llvm_unreachable("not a terminator");
+}
+
+void TerminatorInst::setSuccessor(unsigned idx, BasicBlock *B) {
+ switch (getOpcode()) {
+#define HANDLE_TERM_INST(N, OPC, CLASS) \
+ case Instruction::OPC: \
+ return static_cast<CLASS *>(this)->setSuccessorV(idx, B);
+#include "llvm/IR/Instruction.def"
+ default:
+ break;
+ }
+ llvm_unreachable("not a terminator");
}
//===----------------------------------------------------------------------===//
@@ -50,8 +103,7 @@ TerminatorInst::~TerminatorInst() {
//===----------------------------------------------------------------------===//
// Out of line virtual method, so the vtable, etc has a home.
-UnaryInstruction::~UnaryInstruction() {
-}
+UnaryInstruction::~UnaryInstruction() = default;
//===----------------------------------------------------------------------===//
// SelectInst Class
@@ -82,7 +134,6 @@ const char *SelectInst::areInvalidOperands(Value *Op0, Value *Op1, Value *Op2) {
return nullptr;
}
-
//===----------------------------------------------------------------------===//
// PHINode Class
//===----------------------------------------------------------------------===//
@@ -242,8 +293,7 @@ void LandingPadInst::addClause(Constant *Val) {
// CallInst Implementation
//===----------------------------------------------------------------------===//
-CallInst::~CallInst() {
-}
+CallInst::~CallInst() = default;
void CallInst::init(FunctionType *FTy, Value *Func, ArrayRef<Value *> Args,
ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr) {
@@ -541,7 +591,6 @@ Instruction *CallInst::CreateMalloc(Instruction *InsertBefore,
ArraySize, OpB, MallocF, Name);
}
-
/// CreateMalloc - Generate the IR for a call to malloc:
/// 1. Compute the malloc call's argument as the specified type's size,
/// possibly multiplied by the array size if the array size is not
@@ -692,9 +741,11 @@ InvokeInst *InvokeInst::Create(InvokeInst *II, ArrayRef<OperandBundleDef> OpB,
BasicBlock *InvokeInst::getSuccessorV(unsigned idx) const {
return getSuccessor(idx);
}
+
unsigned InvokeInst::getNumSuccessorsV() const {
return getNumSuccessors();
}
+
void InvokeInst::setSuccessorV(unsigned idx, BasicBlock *B) {
return setSuccessor(idx, B);
}
@@ -821,6 +872,7 @@ ReturnInst::ReturnInst(LLVMContext &C, Value *retVal, Instruction *InsertBefore)
if (retVal)
Op<0>() = retVal;
}
+
ReturnInst::ReturnInst(LLVMContext &C, Value *retVal, BasicBlock *InsertAtEnd)
: TerminatorInst(Type::getVoidTy(C), Instruction::Ret,
OperandTraits<ReturnInst>::op_end(this) - !!retVal, !!retVal,
@@ -828,6 +880,7 @@ ReturnInst::ReturnInst(LLVMContext &C, Value *retVal, BasicBlock *InsertAtEnd)
if (retVal)
Op<0>() = retVal;
}
+
ReturnInst::ReturnInst(LLVMContext &Context, BasicBlock *InsertAtEnd)
: TerminatorInst(Type::getVoidTy(Context), Instruction::Ret,
OperandTraits<ReturnInst>::op_end(this), 0, InsertAtEnd) {
@@ -847,8 +900,7 @@ BasicBlock *ReturnInst::getSuccessorV(unsigned idx) const {
llvm_unreachable("ReturnInst has no successors!");
}
-ReturnInst::~ReturnInst() {
-}
+ReturnInst::~ReturnInst() = default;
//===----------------------------------------------------------------------===//
// ResumeInst Implementation
@@ -930,9 +982,11 @@ BasicBlock *CleanupReturnInst::getSuccessorV(unsigned Idx) const {
assert(Idx == 0);
return getUnwindDest();
}
+
unsigned CleanupReturnInst::getNumSuccessorsV() const {
return getNumSuccessors();
}
+
void CleanupReturnInst::setSuccessorV(unsigned Idx, BasicBlock *B) {
assert(Idx == 0);
setUnwindDest(B);
@@ -973,9 +1027,11 @@ BasicBlock *CatchReturnInst::getSuccessorV(unsigned Idx) const {
assert(Idx < getNumSuccessors() && "Successor # out of range for catchret!");
return getSuccessor();
}
+
unsigned CatchReturnInst::getNumSuccessorsV() const {
return getNumSuccessors();
}
+
void CatchReturnInst::setSuccessorV(unsigned Idx, BasicBlock *B) {
assert(Idx < getNumSuccessors() && "Successor # out of range for catchret!");
setSuccessor(B);
@@ -1067,9 +1123,11 @@ void CatchSwitchInst::removeHandler(handler_iterator HI) {
BasicBlock *CatchSwitchInst::getSuccessorV(unsigned idx) const {
return getSuccessor(idx);
}
+
unsigned CatchSwitchInst::getNumSuccessorsV() const {
return getNumSuccessors();
}
+
void CatchSwitchInst::setSuccessorV(unsigned idx, BasicBlock *B) {
setSuccessor(idx, B);
}
@@ -1155,6 +1213,7 @@ BranchInst::BranchInst(BasicBlock *IfTrue, Instruction *InsertBefore)
assert(IfTrue && "Branch destination may not be null!");
Op<-1>() = IfTrue;
}
+
BranchInst::BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond,
Instruction *InsertBefore)
: TerminatorInst(Type::getVoidTy(IfTrue->getContext()), Instruction::Br,
@@ -1189,7 +1248,6 @@ BranchInst::BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond,
#endif
}
-
BranchInst::BranchInst(const BranchInst &BI) :
TerminatorInst(Type::getVoidTy(BI.getContext()), Instruction::Br,
OperandTraits<BranchInst>::op_end(this) - BI.getNumOperands(),
@@ -1216,14 +1274,15 @@ void BranchInst::swapSuccessors() {
BasicBlock *BranchInst::getSuccessorV(unsigned idx) const {
return getSuccessor(idx);
}
+
unsigned BranchInst::getNumSuccessorsV() const {
return getNumSuccessors();
}
+
void BranchInst::setSuccessorV(unsigned idx, BasicBlock *B) {
setSuccessor(idx, B);
}
-
//===----------------------------------------------------------------------===//
// AllocaInst Implementation
//===----------------------------------------------------------------------===//
@@ -1279,8 +1338,7 @@ AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize,
}
// Out of line virtual method, so the vtable, etc has a home.
-AllocaInst::~AllocaInst() {
-}
+AllocaInst::~AllocaInst() = default;
void AllocaInst::setAlignment(unsigned Align) {
assert((Align & (Align-1)) == 0 && "Alignment is not a power of 2!");
@@ -1543,8 +1601,7 @@ AtomicCmpXchgInst::AtomicCmpXchgInst(Value *Ptr, Value *Cmp, Value *NewVal,
SynchronizationScope SynchScope,
Instruction *InsertBefore)
: Instruction(
- StructType::get(Cmp->getType(), Type::getInt1Ty(Cmp->getContext()),
- nullptr),
+ StructType::get(Cmp->getType(), Type::getInt1Ty(Cmp->getContext())),
AtomicCmpXchg, OperandTraits<AtomicCmpXchgInst>::op_begin(this),
OperandTraits<AtomicCmpXchgInst>::operands(this), InsertBefore) {
Init(Ptr, Cmp, NewVal, SuccessOrdering, FailureOrdering, SynchScope);
@@ -1556,8 +1613,7 @@ AtomicCmpXchgInst::AtomicCmpXchgInst(Value *Ptr, Value *Cmp, Value *NewVal,
SynchronizationScope SynchScope,
BasicBlock *InsertAtEnd)
: Instruction(
- StructType::get(Cmp->getType(), Type::getInt1Ty(Cmp->getContext()),
- nullptr),
+ StructType::get(Cmp->getType(), Type::getInt1Ty(Cmp->getContext())),
AtomicCmpXchg, OperandTraits<AtomicCmpXchgInst>::op_begin(this),
OperandTraits<AtomicCmpXchgInst>::operands(this), InsertAtEnd) {
Init(Ptr, Cmp, NewVal, SuccessOrdering, FailureOrdering, SynchScope);
@@ -1771,14 +1827,12 @@ ExtractElementInst::ExtractElementInst(Value *Val, Value *Index,
setName(Name);
}
-
bool ExtractElementInst::isValidOperands(const Value *Val, const Value *Index) {
if (!Val->getType()->isVectorTy() || !Index->getType()->isIntegerTy())
return false;
return true;
}
-
//===----------------------------------------------------------------------===//
// InsertElementInst Implementation
//===----------------------------------------------------------------------===//
@@ -1825,7 +1879,6 @@ bool InsertElementInst::isValidOperands(const Value *Vec, const Value *Elt,
return true;
}
-
//===----------------------------------------------------------------------===//
// ShuffleVectorInst Implementation
//===----------------------------------------------------------------------===//
@@ -1938,7 +1991,6 @@ void ShuffleVectorInst::getShuffleMask(Constant *Mask,
}
}
-
//===----------------------------------------------------------------------===//
// InsertValueInst Class
//===----------------------------------------------------------------------===//
@@ -1951,7 +2003,7 @@ void InsertValueInst::init(Value *Agg, Value *Val, ArrayRef<unsigned> Idxs,
// (other than weirdness with &*IdxBegin being invalid; see
// getelementptr's init routine for example). But there's no
// present need to support it.
- assert(Idxs.size() > 0 && "InsertValueInst must have at least one index");
+ assert(!Idxs.empty() && "InsertValueInst must have at least one index");
assert(ExtractValueInst::getIndexedType(Agg->getType(), Idxs) ==
Val->getType() && "Inserted value must match indexed type!");
@@ -1980,7 +2032,7 @@ void ExtractValueInst::init(ArrayRef<unsigned> Idxs, const Twine &Name) {
// There's no fundamental reason why we require at least one index.
// But there's no present need to support it.
- assert(Idxs.size() > 0 && "ExtractValueInst must have at least one index");
+ assert(!Idxs.empty() && "ExtractValueInst must have at least one index");
Indices.append(Idxs.begin(), Idxs.end());
setName(Name);
@@ -2053,7 +2105,6 @@ BinaryOperator::BinaryOperator(BinaryOps iType, Value *S1, Value *S2,
setName(Name);
}
-
void BinaryOperator::init(BinaryOps iType) {
Value *LHS = getOperand(0), *RHS = getOperand(1);
(void)LHS; (void)RHS; // Silence warnings.
@@ -2213,7 +2264,6 @@ BinaryOperator *BinaryOperator::CreateNot(Value *Op, const Twine &Name,
Op->getType(), Name, InsertAtEnd);
}
-
// isConstantAllOnes - Helper function for several functions below
static inline bool isConstantAllOnes(const Value *V) {
if (const Constant *C = dyn_cast<Constant>(V))
@@ -2279,7 +2329,6 @@ const Value *BinaryOperator::getNotArgument(const Value *BinOp) {
return getNotArgument(const_cast<Value*>(BinOp));
}
-
// Exchange the two operands to this instruction. This instruction is safe to
// use on any binary instruction and does not modify the semantics of the
// instruction. If the instruction is order-dependent (e.g. SetLT), the opcode
@@ -2291,7 +2340,6 @@ bool BinaryOperator::swapOperands() {
return false;
}
-
//===----------------------------------------------------------------------===//
// FPMathOperator Class
//===----------------------------------------------------------------------===//
@@ -2305,7 +2353,6 @@ float FPMathOperator::getFPAccuracy() const {
return Accuracy->getValueAPF().convertToFloat();
}
-
//===----------------------------------------------------------------------===//
// CastInst Class
//===----------------------------------------------------------------------===//
@@ -2567,13 +2614,12 @@ unsigned CastInst::isEliminableCastPair(
return Instruction::BitCast;
return 0;
}
- case 12: {
+ case 12:
// addrspacecast, addrspacecast -> bitcast, if SrcAS == DstAS
// addrspacecast, addrspacecast -> addrspacecast, if SrcAS != DstAS
if (SrcTy->getPointerAddressSpace() != DstTy->getPointerAddressSpace())
return Instruction::AddrSpaceCast;
return Instruction::BitCast;
- }
case 13:
// FIXME: this state can be merged with (1), but the following assert
// is useful to check the correctness of the sequence due to semantic
@@ -2594,7 +2640,6 @@ unsigned CastInst::isEliminableCastPair(
DstTy->getScalarType()->getPointerElementType())
return Instruction::AddrSpaceCast;
return 0;
-
case 15:
// FIXME: this state can be merged with (1), but the following assert
// is useful to check the correctness of the sequence due to semantic
@@ -3070,7 +3115,6 @@ CastInst::getCastOpcode(
/// of the types involved.
bool
CastInst::castIsValid(Instruction::CastOps op, Value *S, Type *DstTy) {
-
// Check for type sanity on the arguments
Type *SrcTy = S->getType();
@@ -3419,7 +3463,6 @@ bool CmpInst::isEquality() const {
return cast<FCmpInst>(this)->isEquality();
}
-
CmpInst::Predicate CmpInst::getInversePredicate(Predicate pred) {
switch (pred) {
default: llvm_unreachable("Unknown cmp predicate!");
@@ -3743,9 +3786,11 @@ void SwitchInst::growOperands() {
BasicBlock *SwitchInst::getSuccessorV(unsigned idx) const {
return getSuccessor(idx);
}
+
unsigned SwitchInst::getNumSuccessorsV() const {
return getNumSuccessors();
}
+
void SwitchInst::setSuccessorV(unsigned idx, BasicBlock *B) {
setSuccessor(idx, B);
}
@@ -3832,9 +3877,11 @@ void IndirectBrInst::removeDestination(unsigned idx) {
BasicBlock *IndirectBrInst::getSuccessorV(unsigned idx) const {
return getSuccessor(idx);
}
+
unsigned IndirectBrInst::getNumSuccessorsV() const {
return getNumSuccessors();
}
+
void IndirectBrInst::setSuccessorV(unsigned idx, BasicBlock *B) {
setSuccessor(idx, B);
}
diff --git a/lib/IR/LegacyPassManager.cpp b/lib/IR/LegacyPassManager.cpp
index 628a67bd639c..b2b12289f871 100644
--- a/lib/IR/LegacyPassManager.cpp
+++ b/lib/IR/LegacyPassManager.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/IR/IRPrintingPasses.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LegacyPassManagers.h"
@@ -465,6 +466,11 @@ public:
// null. It may be called multiple times.
static void createTheTimeInfo();
+ // print - Prints out timing information and then resets the timers.
+ void print() {
+ TG.print(*CreateInfoOutputFile());
+ }
+
/// getPassTimer - Return the timer for the specified pass if it exists.
Timer *getPassTimer(Pass *P) {
if (P->getAsPMDataManager())
@@ -1752,6 +1758,13 @@ Timer *llvm::getPassTimer(Pass *P) {
return nullptr;
}
+/// If timing is enabled, report the times collected up to now and then reset
+/// them.
+void llvm::reportAndResetTimings() {
+ if (TheTimeInfo)
+ TheTimeInfo->print();
+}
+
//===----------------------------------------------------------------------===//
// PMStack implementation
//
diff --git a/lib/IR/Module.cpp b/lib/IR/Module.cpp
index fec9df193685..12c258d95f52 100644
--- a/lib/IR/Module.cpp
+++ b/lib/IR/Module.cpp
@@ -1,4 +1,4 @@
-//===-- Module.cpp - Implement the Module class ---------------------------===//
+//===- Module.cpp - Implement the Module class ----------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,27 +11,46 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/IR/Module.h"
#include "SymbolTableListTraitsImpl.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Comdat.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalIFunc.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/GVMaterializer.h"
-#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/SymbolTableListTraits.h"
+#include "llvm/IR/Type.h"
#include "llvm/IR/TypeFinder.h"
-#include "llvm/Support/Dwarf.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/RandomNumberGenerator.h"
#include <algorithm>
-#include <cstdarg>
-#include <cstdlib>
+#include <cassert>
+#include <cstdint>
+#include <memory>
+#include <utility>
+#include <vector>
using namespace llvm;
diff --git a/lib/IR/Type.cpp b/lib/IR/Type.cpp
index b67b0a307861..c9f957c244f8 100644
--- a/lib/IR/Type.cpp
+++ b/lib/IR/Type.cpp
@@ -1,4 +1,4 @@
-//===-- Type.cpp - Implement the Type class -------------------------------===//
+//===- Type.cpp - Implement the Type class --------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,12 +11,25 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/IR/Type.h"
#include "LLVMContextImpl.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/None.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
-#include <algorithm>
-#include <cstdarg>
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <utility>
+
using namespace llvm;
//===----------------------------------------------------------------------===//
@@ -220,7 +233,6 @@ PointerType *Type::getInt64PtrTy(LLVMContext &C, unsigned AS) {
return getInt64Ty(C)->getPointerTo(AS);
}
-
//===----------------------------------------------------------------------===//
// IntegerType Implementation
//===----------------------------------------------------------------------===//
@@ -362,7 +374,8 @@ void StructType::setName(StringRef Name) {
if (Name == getName()) return;
StringMap<StructType *> &SymbolTable = getContext().pImpl->NamedStructTypes;
- typedef StringMap<StructType *>::MapEntryTy EntryTy;
+
+ using EntryTy = StringMap<StructType *>::MapEntryTy;
// If this struct already had a name, remove its symbol table entry. Don't
// delete the data yet because it may be part of the new name.
@@ -419,21 +432,6 @@ StructType *StructType::get(LLVMContext &Context, bool isPacked) {
return get(Context, None, isPacked);
}
-StructType *StructType::get(Type *type, ...) {
- assert(type && "Cannot create a struct type with no elements with this");
- LLVMContext &Ctx = type->getContext();
- va_list ap;
- SmallVector<llvm::Type*, 8> StructFields;
- va_start(ap, type);
- while (type) {
- StructFields.push_back(type);
- type = va_arg(ap, llvm::Type*);
- }
- auto *Ret = llvm::StructType::get(Ctx, StructFields);
- va_end(ap);
- return Ret;
-}
-
StructType *StructType::create(LLVMContext &Context, ArrayRef<Type*> Elements,
StringRef Name, bool isPacked) {
StructType *ST = create(Context, Name);
@@ -462,21 +460,6 @@ StructType *StructType::create(ArrayRef<Type*> Elements) {
return create(Elements[0]->getContext(), Elements, StringRef());
}
-StructType *StructType::create(StringRef Name, Type *type, ...) {
- assert(type && "Cannot create a struct type with no elements with this");
- LLVMContext &Ctx = type->getContext();
- va_list ap;
- SmallVector<llvm::Type*, 8> StructFields;
- va_start(ap, type);
- while (type) {
- StructFields.push_back(type);
- type = va_arg(ap, llvm::Type*);
- }
- auto *Ret = llvm::StructType::create(Ctx, StructFields, Name);
- va_end(ap);
- return Ret;
-}
-
bool StructType::isSized(SmallPtrSetImpl<Type*> *Visited) const {
if ((getSubclassData() & SCDB_IsSized) != 0)
return true;
@@ -508,19 +491,6 @@ StringRef StructType::getName() const {
return ((StringMapEntry<StructType*> *)SymbolTableEntry)->getKey();
}
-void StructType::setBody(Type *type, ...) {
- assert(type && "Cannot create a struct type with no elements with this");
- va_list ap;
- SmallVector<llvm::Type*, 8> StructFields;
- va_start(ap, type);
- while (type) {
- StructFields.push_back(type);
- type = va_arg(ap, llvm::Type*);
- }
- setBody(StructFields);
- va_end(ap);
-}
-
bool StructType::isValidElementType(Type *ElemTy) {
return !ElemTy->isVoidTy() && !ElemTy->isLabelTy() &&
!ElemTy->isMetadataTy() && !ElemTy->isFunctionTy() &&
@@ -540,7 +510,6 @@ StructType *Module::getTypeByName(StringRef Name) const {
return getContext().pImpl->NamedStructTypes.lookup(Name);
}
-
//===----------------------------------------------------------------------===//
// CompositeType Implementation
//===----------------------------------------------------------------------===//
@@ -589,7 +558,6 @@ bool CompositeType::indexValid(unsigned Idx) const {
return true;
}
-
//===----------------------------------------------------------------------===//
// ArrayType Implementation
//===----------------------------------------------------------------------===//
@@ -661,7 +629,6 @@ PointerType *PointerType::get(Type *EltTy, unsigned AddressSpace) {
return Entry;
}
-
PointerType::PointerType(Type *E, unsigned AddrSpace)
: Type(E->getContext(), PointerTyID), PointeeTy(E) {
ContainedTys = &PointeeTy;
diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp
index 65e124562493..3b68d6365872 100644
--- a/lib/IR/Verifier.cpp
+++ b/lib/IR/Verifier.cpp
@@ -267,6 +267,9 @@ class Verifier : public InstVisitor<Verifier>, VerifierSupport {
/// \brief Keep track of the metadata nodes that have been checked already.
SmallPtrSet<const Metadata *, 32> MDNodes;
+ /// Keep track of which DISubprogram is attached to which function.
+ DenseMap<const DISubprogram *, const Function *> DISubprogramAttachments;
+
/// Track all DICompileUnits visited.
SmallPtrSet<const Metadata *, 2> CUVisited;
@@ -386,7 +389,7 @@ public:
verifyCompileUnits();
verifyDeoptimizeCallingConvs();
-
+ DISubprogramAttachments.clear();
return !Broken;
}
@@ -2085,13 +2088,19 @@ void Verifier::visitFunction(const Function &F) {
switch (I.first) {
default:
break;
- case LLVMContext::MD_dbg:
+ case LLVMContext::MD_dbg: {
++NumDebugAttachments;
AssertDI(NumDebugAttachments == 1,
"function must have a single !dbg attachment", &F, I.second);
AssertDI(isa<DISubprogram>(I.second),
"function !dbg attachment must be a subprogram", &F, I.second);
+ auto *SP = cast<DISubprogram>(I.second);
+ const Function *&AttachedTo = DISubprogramAttachments[SP];
+ AssertDI(!AttachedTo || AttachedTo == &F,
+ "DISubprogram attached to more than one function", SP, &F);
+ AttachedTo = &F;
break;
+ }
case LLVMContext::MD_prof:
++NumProfAttachments;
Assert(NumProfAttachments == 1,
diff --git a/lib/LLVMBuild.txt b/lib/LLVMBuild.txt
index 684b378c93e5..89ddd0fc1af3 100644
--- a/lib/LLVMBuild.txt
+++ b/lib/LLVMBuild.txt
@@ -24,7 +24,6 @@ subdirectories =
DebugInfo
Demangle
ExecutionEngine
- LibDriver
LineEditor
Linker
IR
@@ -39,6 +38,7 @@ subdirectories =
Support
TableGen
Target
+ ToolDrivers
Transforms
[component_0]
diff --git a/lib/LTO/LTO.cpp b/lib/LTO/LTO.cpp
index 2d2dcdec05fb..c73b6b6b15c1 100644
--- a/lib/LTO/LTO.cpp
+++ b/lib/LTO/LTO.cpp
@@ -973,7 +973,7 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache,
// this value. If not, no need to preserve any ThinLTO copies.
!Res.second.IRName.empty())
GUIDPreservedSymbols.insert(GlobalValue::getGUID(
- GlobalValue::getRealLinkageName(Res.second.IRName)));
+ GlobalValue::dropLLVMManglingEscape(Res.second.IRName)));
}
auto DeadSymbols =
@@ -993,7 +993,7 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache,
if (Res.second.IRName.empty())
continue;
auto GUID = GlobalValue::getGUID(
- GlobalValue::getRealLinkageName(Res.second.IRName));
+ GlobalValue::dropLLVMManglingEscape(Res.second.IRName));
// Mark exported unless index-based analysis determined it to be dead.
if (!DeadSymbols.count(GUID))
ExportedGUIDs.insert(GUID);
diff --git a/lib/LTO/LTOCodeGenerator.cpp b/lib/LTO/LTOCodeGenerator.cpp
index 86fba843e980..6a275560dc92 100644
--- a/lib/LTO/LTOCodeGenerator.cpp
+++ b/lib/LTO/LTOCodeGenerator.cpp
@@ -495,17 +495,14 @@ void LTOCodeGenerator::verifyMergedModuleOnce() {
return;
HasVerifiedInput = true;
- if (LTOStripInvalidDebugInfo) {
- bool BrokenDebugInfo = false;
- if (verifyModule(*MergedModule, &dbgs(), &BrokenDebugInfo))
- report_fatal_error("Broken module found, compilation aborted!");
- if (BrokenDebugInfo) {
- emitWarning("Invalid debug info found, debug info will be stripped");
- StripDebugInfo(*MergedModule);
- }
- }
- if (verifyModule(*MergedModule, &dbgs()))
+ bool BrokenDebugInfo = false;
+ if (verifyModule(*MergedModule, &dbgs(),
+ LTOStripInvalidDebugInfo ? &BrokenDebugInfo : nullptr))
report_fatal_error("Broken module found, compilation aborted!");
+ if (BrokenDebugInfo) {
+ emitWarning("Invalid debug info found, debug info will be stripped");
+ StripDebugInfo(*MergedModule);
+ }
}
void LTOCodeGenerator::finishOptimizationRemarks() {
@@ -600,6 +597,7 @@ bool LTOCodeGenerator::compileOptimized(ArrayRef<raw_pwrite_stream *> Out) {
// If statistics were requested, print them out after codegen.
if (llvm::AreStatisticsEnabled())
llvm::PrintStatistics();
+ reportAndResetTimings();
finishOptimizationRemarks();
diff --git a/lib/LTO/ThinLTOCodeGenerator.cpp b/lib/LTO/ThinLTOCodeGenerator.cpp
index b4ee7c2b2fbc..65a7994325bc 100644
--- a/lib/LTO/ThinLTOCodeGenerator.cpp
+++ b/lib/LTO/ThinLTOCodeGenerator.cpp
@@ -446,7 +446,7 @@ ProcessThinLTOModule(Module &TheModule, ModuleSummaryIndex &Index,
{
raw_svector_ostream OS(OutputBuffer);
ProfileSummaryInfo PSI(TheModule);
- auto Index = buildModuleSummaryIndex(TheModule, nullptr, nullptr);
+ auto Index = buildModuleSummaryIndex(TheModule, nullptr, &PSI);
WriteBitcodeToFile(&TheModule, OS, true, &Index);
}
return make_unique<ObjectMemoryBuffer>(std::move(OutputBuffer));
@@ -1024,4 +1024,5 @@ void ThinLTOCodeGenerator::run() {
// If statistics were requested, print them out now.
if (llvm::AreStatisticsEnabled())
llvm::PrintStatistics();
+ reportAndResetTimings();
}
diff --git a/lib/Linker/IRMover.cpp b/lib/Linker/IRMover.cpp
index 15a46a2d0420..ecef1efda1a2 100644
--- a/lib/Linker/IRMover.cpp
+++ b/lib/Linker/IRMover.cpp
@@ -602,6 +602,7 @@ GlobalVariable *IRLinker::copyGlobalVariableProto(const GlobalVariable *SGVar) {
/*insertbefore*/ nullptr, SGVar->getThreadLocalMode(),
SGVar->getType()->getAddressSpace());
NewDGV->setAlignment(SGVar->getAlignment());
+ NewDGV->copyAttributesFrom(SGVar);
return NewDGV;
}
@@ -610,8 +611,11 @@ GlobalVariable *IRLinker::copyGlobalVariableProto(const GlobalVariable *SGVar) {
Function *IRLinker::copyFunctionProto(const Function *SF) {
// If there is no linkage to be performed or we are linking from the source,
// bring SF over.
- return Function::Create(TypeMap.get(SF->getFunctionType()),
- GlobalValue::ExternalLinkage, SF->getName(), &DstM);
+ auto *F =
+ Function::Create(TypeMap.get(SF->getFunctionType()),
+ GlobalValue::ExternalLinkage, SF->getName(), &DstM);
+ F->copyAttributesFrom(SF);
+ return F;
}
/// Set up prototypes for any aliases that come over from the source module.
@@ -619,9 +623,11 @@ GlobalValue *IRLinker::copyGlobalAliasProto(const GlobalAlias *SGA) {
// If there is no linkage to be performed or we're linking from the source,
// bring over SGA.
auto *Ty = TypeMap.get(SGA->getValueType());
- return GlobalAlias::create(Ty, SGA->getType()->getPointerAddressSpace(),
- GlobalValue::ExternalLinkage, SGA->getName(),
- &DstM);
+ auto *GA =
+ GlobalAlias::create(Ty, SGA->getType()->getPointerAddressSpace(),
+ GlobalValue::ExternalLinkage, SGA->getName(), &DstM);
+ GA->copyAttributesFrom(SGA);
+ return GA;
}
GlobalValue *IRLinker::copyGlobalValueProto(const GlobalValue *SGV,
@@ -648,8 +654,6 @@ GlobalValue *IRLinker::copyGlobalValueProto(const GlobalValue *SGV,
else if (SGV->hasExternalWeakLinkage())
NewGV->setLinkage(GlobalValue::ExternalWeakLinkage);
- NewGV->copyAttributesFrom(SGV);
-
if (auto *NewGO = dyn_cast<GlobalObject>(NewGV)) {
// Metadata for global variables and function declarations is copied eagerly.
if (isa<GlobalVariable>(SGV) || SGV->isDeclaration())
diff --git a/lib/MC/MCObjectStreamer.cpp b/lib/MC/MCObjectStreamer.cpp
index f7f2253256eb..174397e27396 100644
--- a/lib/MC/MCObjectStreamer.cpp
+++ b/lib/MC/MCObjectStreamer.cpp
@@ -133,6 +133,11 @@ void MCObjectStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
// Avoid fixups when possible.
int64_t AbsValue;
if (Value->evaluateAsAbsolute(AbsValue, getAssembler())) {
+ if (!isUIntN(8 * Size, AbsValue) && !isIntN(8 * Size, AbsValue)) {
+ getContext().reportError(
+ Loc, "value evaluated as " + Twine(AbsValue) + " is out of range.");
+ return;
+ }
EmitIntValue(AbsValue, Size);
return;
}
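
With this check, an absolute value that does not fit in the directive's size is diagnosed instead of being silently truncated. As a hypothetical input (not from the patch's tests): ".byte 0x1ff" evaluates to 511, which is representable neither as an unsigned nor as a signed 8-bit value, so the streamer now reports "value evaluated as 511 is out of range."
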
diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp
index 66ba853da2fe..3b213ef4ce09 100644
--- a/lib/MC/MCParser/AsmParser.cpp
+++ b/lib/MC/MCParser/AsmParser.cpp
@@ -288,6 +288,7 @@ public:
private:
bool isAltmacroString(SMLoc &StrLoc, SMLoc &EndLoc);
+ void altMacroString(StringRef AltMacroStr, std::string &Res);
bool parseStatement(ParseStatementInfo &Info,
MCAsmParserSemaCallback *SI);
bool parseCurlyBlockScope(SmallVectorImpl<AsmRewrite>& AsmStrRewrites);
@@ -1209,6 +1210,8 @@ bool AsmParser::isAltmacroString(SMLoc &StrLoc, SMLoc &EndLoc) {
const char *CharPtr = StrLoc.getPointer();
while ((*CharPtr != '>') && (*CharPtr != '\n') &&
(*CharPtr != '\r') && (*CharPtr != '\0')){
+ if (*CharPtr == '!')
+ CharPtr++;
CharPtr++;
}
if (*CharPtr == '>') {
@@ -1218,6 +1221,15 @@ bool AsmParser::isAltmacroString(SMLoc &StrLoc, SMLoc &EndLoc) {
return false;
}
+/// \brief Create a string without the escape characters '!'.
+void AsmParser::altMacroString(StringRef AltMacroStr, std::string &Res) {
+ for (size_t Pos = 0; Pos < AltMacroStr.size(); Pos++) {
+ if (AltMacroStr[Pos] == '!')
+ Pos++;
+ Res += AltMacroStr[Pos];
+ }
+}
+
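
For example (an illustrative input, not from the patch): under altmacro mode an argument written as <a!>b> is lexed up to the matching unescaped '>', since isAltmacroString skips the character following each '!', and altMacroString then drops the '!' escapes, so the expanded macro body sees a>b.
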
/// \brief Parse an expression and return it.
///
/// expr ::= expr &&,|| expr -> lowest.
@@ -2309,6 +2321,15 @@ bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body,
(*(Token.getString().begin()) == '%') && Token.is(AsmToken::Integer))
// Emit an integer value to the buffer.
OS << Token.getIntVal();
+ // Only a token that was validated as a string and begins with '<'
+ // is treated as an altmacro string.
+ else if ((Lexer.IsaAltMacroMode()) &&
+ (*(Token.getString().begin()) == '<') &&
+ Token.is(AsmToken::String)) {
+ std::string Res;
+ altMacroString(Token.getStringContents(), Res);
+ OS << Res;
+ }
// We expect no quotes around the string's contents when
// parsing for varargs.
else if (Token.isNot(AsmToken::String) || VarargParameter)
diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp
index b1223e81be43..28531feccfe1 100644
--- a/lib/Object/COFFObjectFile.cpp
+++ b/lib/Object/COFFObjectFile.cpp
@@ -1062,7 +1062,7 @@ COFFObjectFile::getSectionContents(const coff_section *Sec,
// In COFF, a virtual section won't have any in-file
// content, so the file pointer to the content will be zero.
if (Sec->PointerToRawData == 0)
- return object_error::parse_failed;
+ return std::error_code();
// The only thing that we need to verify is that the contents is contained
// within the file bounds. We don't need to make sure it doesn't cover other
// data, as there's nothing that says that is not allowed.
@@ -1602,8 +1602,6 @@ ErrorOr<ArrayRef<UTF16>> ResourceSectionRef::getDirStringAtOffset(uint32_t Offse
uint16_t Length;
RETURN_IF_ERROR(Reader.readInteger(Length));
ArrayRef<UTF16> RawDirString;
- // Strings are stored as 2-byte aligned unicode characters but readFixedString
- // assumes byte string, so we double length.
RETURN_IF_ERROR(Reader.readArray(RawDirString, Length));
return RawDirString;
}
diff --git a/lib/Object/WasmObjectFile.cpp b/lib/Object/WasmObjectFile.cpp
index 39f8704aacf2..058686e4db9e 100644
--- a/lib/Object/WasmObjectFile.cpp
+++ b/lib/Object/WasmObjectFile.cpp
@@ -168,6 +168,13 @@ static wasm::WasmLimits readLimits(const uint8_t *&Ptr) {
return Result;
}
+static wasm::WasmTable readTable(const uint8_t *&Ptr) {
+ wasm::WasmTable Table;
+ Table.ElemType = readVarint7(Ptr);
+ Table.Limits = readLimits(Ptr);
+ return Table;
+}
+
static Error readSection(WasmSection &Section, const uint8_t *&Ptr,
const uint8_t *Start) {
// TODO(sbc): Avoid reading past EOF in the case of malformed files.
@@ -397,13 +404,22 @@ Error WasmObjectFile::parseImportSection(const uint8_t *Ptr, const uint8_t *End)
Sections.size(), i);
break;
case wasm::WASM_EXTERNAL_GLOBAL:
- Im.GlobalType = readVarint7(Ptr);
- Im.GlobalMutable = readVaruint1(Ptr);
+ Im.Global.Type = readVarint7(Ptr);
+ Im.Global.Mutable = readVaruint1(Ptr);
Symbols.emplace_back(Im.Field, WasmSymbol::SymbolType::GLOBAL_IMPORT,
Sections.size(), i);
break;
+ case wasm::WASM_EXTERNAL_MEMORY:
+ Im.Memory = readLimits(Ptr);
+ break;
+ case wasm::WASM_EXTERNAL_TABLE:
+ Im.Table = readTable(Ptr);
+ if (Im.Table.ElemType != wasm::WASM_TYPE_ANYFUNC) {
+ return make_error<GenericBinaryError>("Invalid table element type",
+ object_error::parse_failed);
+ }
+ break;
default:
- // TODO(sbc): Handle other kinds of imports
return make_error<GenericBinaryError>(
"Unexpected import kind", object_error::parse_failed);
}
@@ -431,14 +447,11 @@ Error WasmObjectFile::parseTableSection(const uint8_t *Ptr, const uint8_t *End)
uint32_t Count = readVaruint32(Ptr);
Tables.reserve(Count);
while (Count--) {
- wasm::WasmTable Table;
- Table.ElemType = readVarint7(Ptr);
- if (Table.ElemType != wasm::WASM_TYPE_ANYFUNC) {
+ Tables.push_back(readTable(Ptr));
+ if (Tables.back().ElemType != wasm::WASM_TYPE_ANYFUNC) {
return make_error<GenericBinaryError>("Invalid table element type",
object_error::parse_failed);
}
- Table.Limits = readLimits(Ptr);
- Tables.push_back(Table);
}
if (Ptr != End)
return make_error<GenericBinaryError>("Table section ended prematurely",
@@ -493,8 +506,10 @@ Error WasmObjectFile::parseExportSection(const uint8_t *Ptr, const uint8_t *End)
Symbols.emplace_back(Ex.Name, WasmSymbol::SymbolType::GLOBAL_EXPORT,
Sections.size(), i);
break;
+ case wasm::WASM_EXTERNAL_MEMORY:
+ case wasm::WASM_EXTERNAL_TABLE:
+ break;
default:
- // TODO(sbc): Handle other kinds of exports
return make_error<GenericBinaryError>(
"Unexpected export kind", object_error::parse_failed);
}
@@ -507,7 +522,7 @@ Error WasmObjectFile::parseExportSection(const uint8_t *Ptr, const uint8_t *End)
Error WasmObjectFile::parseStartSection(const uint8_t *Ptr, const uint8_t *End) {
StartFunction = readVaruint32(Ptr);
- if (StartFunction < FunctionTypes.size())
+ if (StartFunction >= FunctionTypes.size())
return make_error<GenericBinaryError>("Invalid start function",
object_error::parse_failed);
return Error::success();
@@ -638,10 +653,14 @@ basic_symbol_iterator WasmObjectFile::symbol_end() const {
return BasicSymbolRef(Ref, this);
}
-const WasmSymbol &WasmObjectFile::getWasmSymbol(DataRefImpl Symb) const {
+const WasmSymbol &WasmObjectFile::getWasmSymbol(const DataRefImpl &Symb) const {
return Symbols[Symb.d.a];
}
+const WasmSymbol &WasmObjectFile::getWasmSymbol(const SymbolRef &Symb) const {
+ return getWasmSymbol(Symb.getRawDataRefImpl());
+}
+
Expected<StringRef> WasmObjectFile::getSymbolName(DataRefImpl Symb) const {
const WasmSymbol &Sym = getWasmSymbol(Symb);
return Sym.Name;
diff --git a/lib/ObjectYAML/WasmYAML.cpp b/lib/ObjectYAML/WasmYAML.cpp
index c5d1b438ee2a..910d32f16af9 100644
--- a/lib/ObjectYAML/WasmYAML.cpp
+++ b/lib/ObjectYAML/WasmYAML.cpp
@@ -265,8 +265,12 @@ void MappingTraits<WasmYAML::Import>::mapping(IO &IO,
if (Import.Kind == wasm::WASM_EXTERNAL_FUNCTION) {
IO.mapRequired("SigIndex", Import.SigIndex);
} else if (Import.Kind == wasm::WASM_EXTERNAL_GLOBAL) {
- IO.mapRequired("GlobalType", Import.GlobalType);
- IO.mapRequired("GlobalMutable", Import.GlobalMutable);
+ IO.mapRequired("GlobalType", Import.GlobalImport.Type);
+ IO.mapRequired("GlobalMutable", Import.GlobalImport.Mutable);
+ } else if (Import.Kind == wasm::WASM_EXTERNAL_TABLE) {
+ IO.mapRequired("Table", Import.TableImport);
+ } else if (Import.Kind == wasm::WASM_EXTERNAL_MEMORY) {
+ IO.mapRequired("Memory", Import.Memory);
} else {
llvm_unreachable("unhandled import type");
}
diff --git a/lib/ProfileData/SampleProfWriter.cpp b/lib/ProfileData/SampleProfWriter.cpp
index b91b6fb7c7ad..b05efa7417b9 100644
--- a/lib/ProfileData/SampleProfWriter.cpp
+++ b/lib/ProfileData/SampleProfWriter.cpp
@@ -29,6 +29,7 @@
#include <algorithm>
#include <cstdint>
#include <memory>
+#include <set>
#include <system_error>
#include <utility>
#include <vector>
@@ -36,6 +37,32 @@
using namespace llvm;
using namespace sampleprof;
+std::error_code
+SampleProfileWriter::write(const StringMap<FunctionSamples> &ProfileMap) {
+ if (std::error_code EC = writeHeader(ProfileMap))
+ return EC;
+
+ // Sort the ProfileMap by total samples.
+ typedef std::pair<StringRef, const FunctionSamples *> NameFunctionSamples;
+ std::vector<NameFunctionSamples> V;
+ for (const auto &I : ProfileMap)
+ V.push_back(std::make_pair(I.getKey(), &I.second));
+
+ std::stable_sort(
+ V.begin(), V.end(),
+ [](const NameFunctionSamples &A, const NameFunctionSamples &B) {
+ if (A.second->getTotalSamples() == B.second->getTotalSamples())
+ return A.first > B.first;
+ return A.second->getTotalSamples() > B.second->getTotalSamples();
+ });
+
+ for (const auto &I : V) {
+ if (std::error_code EC = write(*I.second))
+ return EC;
+ }
+ return sampleprof_error::success;
+}
+
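
The stable sort above writes the hottest functions first and breaks sample-count ties by name (descending), making the output order fully deterministic across runs. With hypothetical profiles bar:100, foo:100, baz:50, the functions are emitted as foo, bar, baz.
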
/// \brief Write samples to a text file.
///
/// Note: it may be tempting to implement this in terms of
@@ -97,8 +124,7 @@ std::error_code SampleProfileWriterBinary::writeNameIdx(StringRef FName) {
}
void SampleProfileWriterBinary::addName(StringRef FName) {
- auto NextIdx = NameTable.size();
- NameTable.insert(std::make_pair(FName, NextIdx));
+ NameTable.insert(std::make_pair(FName, 0));
}
void SampleProfileWriterBinary::addNames(const FunctionSamples &S) {
@@ -136,10 +162,18 @@ std::error_code SampleProfileWriterBinary::writeHeader(
addNames(I.second);
}
+ // Sort the names so that the NameTable is deterministic.
+ std::set<StringRef> V;
+ for (const auto &I : NameTable)
+ V.insert(I.first);
+ int i = 0;
+ for (const StringRef &N : V)
+ NameTable[N] = i++;
+
// Write out the name table.
encodeULEB128(NameTable.size(), OS);
- for (auto N : NameTable) {
- OS << N.first;
+ for (auto N : V) {
+ OS << N;
encodeULEB128(0, OS);
}
return sampleprof_error::success;
diff --git a/lib/Support/APInt.cpp b/lib/Support/APInt.cpp
index caa0691f9205..17144522db82 100644
--- a/lib/Support/APInt.cpp
+++ b/lib/Support/APInt.cpp
@@ -122,35 +122,38 @@ APInt::APInt(unsigned numbits, StringRef Str, uint8_t radix)
fromString(numbits, Str, radix);
}
+void APInt::reallocate(unsigned NewBitWidth) {
+ // If the number of words is the same we can just change the width and stop.
+ if (getNumWords() == getNumWords(NewBitWidth)) {
+ BitWidth = NewBitWidth;
+ return;
+ }
+
+ // If we have an allocation, delete it.
+ if (!isSingleWord())
+ delete [] U.pVal;
+
+ // Update BitWidth.
+ BitWidth = NewBitWidth;
+
+ // If we are supposed to have an allocation, create it.
+ if (!isSingleWord())
+ U.pVal = getMemory(getNumWords());
+}
+
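
Note that reallocate only adjusts the storage to fit NewBitWidth; it makes no attempt to preserve the old contents. Both callers account for this: AssignSlowCase copies from RHS immediately afterwards, and divide() clears all bits before reconstituting the quotient and remainder.
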
void APInt::AssignSlowCase(const APInt& RHS) {
// Don't do anything for X = X
if (this == &RHS)
return;
- if (BitWidth == RHS.getBitWidth()) {
- // assume same bit-width single-word case is already handled
- assert(!isSingleWord());
- memcpy(U.pVal, RHS.U.pVal, getNumWords() * APINT_WORD_SIZE);
- return;
- }
+ // Adjust the bit width and handle allocations as necessary.
+ reallocate(RHS.getBitWidth());
- if (isSingleWord()) {
- // assume case where both are single words is already handled
- assert(!RHS.isSingleWord());
- U.pVal = getMemory(RHS.getNumWords());
- memcpy(U.pVal, RHS.U.pVal, RHS.getNumWords() * APINT_WORD_SIZE);
- } else if (getNumWords() == RHS.getNumWords())
- memcpy(U.pVal, RHS.U.pVal, RHS.getNumWords() * APINT_WORD_SIZE);
- else if (RHS.isSingleWord()) {
- delete [] U.pVal;
+ // Copy the data.
+ if (isSingleWord())
U.VAL = RHS.U.VAL;
- } else {
- delete [] U.pVal;
- U.pVal = getMemory(RHS.getNumWords());
- memcpy(U.pVal, RHS.U.pVal, RHS.getNumWords() * APINT_WORD_SIZE);
- }
- BitWidth = RHS.BitWidth;
- clearUnusedBits();
+ else
+ memcpy(U.pVal, RHS.U.pVal, getNumWords() * APINT_WORD_SIZE);
}
/// This method 'profiles' an APInt for use with FoldingSet.
@@ -1138,10 +1141,13 @@ APInt APInt::multiplicativeInverse(const APInt& modulo) const {
return APInt(BitWidth, 0);
// The next-to-last t is the multiplicative inverse. However, we are
- // interested in a positive inverse. Calcuate a positive one from a negative
+ // interested in a positive inverse. Calculate a positive one from a negative
// one if necessary. A simple addition of the modulo suffices because
// abs(t[i]) is known to be less than *this/2 (see the link above).
- return t[i].isNegative() ? t[i] + modulo : t[i];
+ if (t[i].isNegative())
+ t[i] += modulo;
+
+ return std::move(t[i]);
}
/// Calculate the magic numbers required to implement a signed integer division
@@ -1240,7 +1246,7 @@ APInt::mu APInt::magicu(unsigned LeadingZeros) const {
/// from "Art of Computer Programming, Volume 2", section 4.3.1, p. 272. The
/// variables here have the same names as in the algorithm. Comments explain
/// the algorithm and any deviation from it.
-static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r,
+static void KnuthDiv(uint32_t *u, uint32_t *v, uint32_t *q, uint32_t* r,
unsigned m, unsigned n) {
assert(u && "Must provide dividend");
assert(v && "Must provide divisor");
@@ -1266,16 +1272,16 @@ static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r,
// overflow. Note that this can require an extra word in u so that u must
// be of length m+n+1.
unsigned shift = countLeadingZeros(v[n-1]);
- unsigned v_carry = 0;
- unsigned u_carry = 0;
+ uint32_t v_carry = 0;
+ uint32_t u_carry = 0;
if (shift) {
for (unsigned i = 0; i < m+n; ++i) {
- unsigned u_tmp = u[i] >> (32 - shift);
+ uint32_t u_tmp = u[i] >> (32 - shift);
u[i] = (u[i] << shift) | u_carry;
u_carry = u_tmp;
}
for (unsigned i = 0; i < n; ++i) {
- unsigned v_tmp = v[i] >> (32 - shift);
+ uint32_t v_tmp = v[i] >> (32 - shift);
v[i] = (v[i] << shift) | v_carry;
v_carry = v_tmp;
}
@@ -1296,11 +1302,11 @@ static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r,
// Set qp = (u[j+n]*b + u[j+n-1]) / v[n-1]. (qp=qprime=q')
// Set rp = (u[j+n]*b + u[j+n-1]) % v[n-1]. (rp=rprime=r')
// Now test if qp == b or qp*v[n-2] > b*rp + u[j+n-2]; if so, decrease
- // qp by 1, inrease rp by v[n-1], and repeat this test if rp < b. The test
+ // qp by 1, increase rp by v[n-1], and repeat this test if rp < b. The test
// on v[n-2] determines at high speed most of the cases in which the trial
// value qp is one too large, and it eliminates all cases where qp is two
// too large.
- uint64_t dividend = ((uint64_t(u[j+n]) << 32) + u[j+n-1]);
+ uint64_t dividend = Make_64(u[j+n], u[j+n-1]);
DEBUG(dbgs() << "KnuthDiv: dividend == " << dividend << '\n');
uint64_t qp = dividend / v[n-1];
uint64_t rp = dividend % v[n-1];
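
The KnuthDiv cleanups in this file replace open-coded shifts and masks with the 32/64-bit splitting helpers from llvm/Support/MathExtras.h. For reference, a sketch equivalent to their documented behavior (not the canonical definitions):

    #include <cstdint>

    // Lower and upper 32-bit halves of a 64-bit value, and their inverse.
    constexpr uint32_t Lo_32(uint64_t V) { return static_cast<uint32_t>(V); }
    constexpr uint32_t Hi_32(uint64_t V) { return static_cast<uint32_t>(V >> 32); }
    constexpr uint64_t Make_64(uint32_t High, uint32_t Low) {
      return (uint64_t(High) << 32) | Low;
    }

So Make_64(u[j+n], u[j+n-1]) below computes the same dividend as the old (uint64_t(u[j+n]) << 32) + u[j+n-1], with the intent made explicit.
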
@@ -1323,14 +1329,14 @@ static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r,
int64_t borrow = 0;
for (unsigned i = 0; i < n; ++i) {
uint64_t p = uint64_t(qp) * uint64_t(v[i]);
- int64_t subres = int64_t(u[j+i]) - borrow - (unsigned)p;
- u[j+i] = (unsigned)subres;
- borrow = (p >> 32) - (subres >> 32);
+ int64_t subres = int64_t(u[j+i]) - borrow - Lo_32(p);
+ u[j+i] = Lo_32(subres);
+ borrow = Hi_32(p) - Hi_32(subres);
DEBUG(dbgs() << "KnuthDiv: u[j+i] = " << u[j+i]
<< ", borrow = " << borrow << '\n');
}
bool isNeg = u[j+n] < borrow;
- u[j+n] -= (unsigned)borrow;
+ u[j+n] -= Lo_32(borrow);
DEBUG(dbgs() << "KnuthDiv: after subtraction:");
DEBUG(for (int i = m+n; i >=0; i--) dbgs() << " " << u[i]);
@@ -1338,7 +1344,7 @@ static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r,
// D5. [Test remainder.] Set q[j] = qp. If the result of step D4 was
// negative, go to step D6; otherwise go on to step D7.
- q[j] = (unsigned)qp;
+ q[j] = Lo_32(qp);
if (isNeg) {
// D6. [Add back]. The probability that this step is necessary is very
// small, on the order of only 2/b. Make sure that test data accounts for
@@ -1349,7 +1355,7 @@ static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r,
// since it cancels with the borrow that occurred in D4.
bool carry = false;
for (unsigned i = 0; i < n; i++) {
- unsigned limit = std::min(u[j+i],v[i]);
+ uint32_t limit = std::min(u[j+i],v[i]);
u[j+i] += v[i] + carry;
carry = u[j+i] < limit || (carry && u[j+i] == limit);
}
@@ -1374,7 +1380,7 @@ static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r,
// multiplication by d by using a shift left. So, all we have to do is
// shift right here.
if (shift) {
- unsigned carry = 0;
+ uint32_t carry = 0;
DEBUG(dbgs() << "KnuthDiv: remainder:");
for (int i = n-1; i >= 0; i--) {
r[i] = (u[i] >> shift) | carry;
@@ -1403,17 +1409,16 @@ void APInt::divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS,
// can't use 64-bit operands here because we don't have native results of
// 128-bits. Furthermore, casting the 64-bit values to 32-bit values won't
// work on big-endian machines.
- uint64_t mask = ~0ull >> (sizeof(unsigned)*CHAR_BIT);
unsigned n = rhsWords * 2;
unsigned m = (lhsWords * 2) - n;
// Allocate space for the temporary values we need either on the stack, if
// it will fit, or on the heap if it won't.
- unsigned SPACE[128];
- unsigned *U = nullptr;
- unsigned *V = nullptr;
- unsigned *Q = nullptr;
- unsigned *R = nullptr;
+ uint32_t SPACE[128];
+ uint32_t *U = nullptr;
+ uint32_t *V = nullptr;
+ uint32_t *Q = nullptr;
+ uint32_t *R = nullptr;
if ((Remainder?4:3)*n+2*m+1 <= 128) {
U = &SPACE[0];
V = &SPACE[m+n+1];
@@ -1421,34 +1426,34 @@ void APInt::divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS,
if (Remainder)
R = &SPACE[(m+n+1) + n + (m+n)];
} else {
- U = new unsigned[m + n + 1];
- V = new unsigned[n];
- Q = new unsigned[m+n];
+ U = new uint32_t[m + n + 1];
+ V = new uint32_t[n];
+ Q = new uint32_t[m+n];
if (Remainder)
- R = new unsigned[n];
+ R = new uint32_t[n];
}
// Initialize the dividend
- memset(U, 0, (m+n+1)*sizeof(unsigned));
+ memset(U, 0, (m+n+1)*sizeof(uint32_t));
for (unsigned i = 0; i < lhsWords; ++i) {
- uint64_t tmp = (LHS.getNumWords() == 1 ? LHS.U.VAL : LHS.U.pVal[i]);
- U[i * 2] = (unsigned)(tmp & mask);
- U[i * 2 + 1] = (unsigned)(tmp >> (sizeof(unsigned)*CHAR_BIT));
+ uint64_t tmp = LHS.getRawData()[i];
+ U[i * 2] = Lo_32(tmp);
+ U[i * 2 + 1] = Hi_32(tmp);
}
U[m+n] = 0; // this extra word is for "spill" in the Knuth algorithm.
// Initialize the divisor
- memset(V, 0, (n)*sizeof(unsigned));
+ memset(V, 0, (n)*sizeof(uint32_t));
for (unsigned i = 0; i < rhsWords; ++i) {
- uint64_t tmp = (RHS.getNumWords() == 1 ? RHS.U.VAL : RHS.U.pVal[i]);
- V[i * 2] = (unsigned)(tmp & mask);
- V[i * 2 + 1] = (unsigned)(tmp >> (sizeof(unsigned)*CHAR_BIT));
+ uint64_t tmp = RHS.getRawData()[i];
+ V[i * 2] = Lo_32(tmp);
+ V[i * 2 + 1] = Hi_32(tmp);
}
// initialize the quotient and remainder
- memset(Q, 0, (m+n) * sizeof(unsigned));
+ memset(Q, 0, (m+n) * sizeof(uint32_t));
if (Remainder)
- memset(R, 0, n * sizeof(unsigned));
+ memset(R, 0, n * sizeof(uint32_t));
// Now, adjust m and n for the Knuth division. n is the number of words in
// the divisor. m is the number of words by which the dividend exceeds the
@@ -1469,22 +1474,22 @@ void APInt::divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS,
// are using base 2^32 instead of base 10.
assert(n != 0 && "Divide by zero?");
if (n == 1) {
- unsigned divisor = V[0];
- unsigned remainder = 0;
- for (int i = m+n-1; i >= 0; i--) {
- uint64_t partial_dividend = uint64_t(remainder) << 32 | U[i];
+ uint32_t divisor = V[0];
+ uint32_t remainder = 0;
+ for (int i = m; i >= 0; i--) {
+ uint64_t partial_dividend = Make_64(remainder, U[i]);
if (partial_dividend == 0) {
Q[i] = 0;
remainder = 0;
} else if (partial_dividend < divisor) {
Q[i] = 0;
- remainder = (unsigned)partial_dividend;
+ remainder = Lo_32(partial_dividend);
} else if (partial_dividend == divisor) {
Q[i] = 1;
remainder = 0;
} else {
- Q[i] = (unsigned)(partial_dividend / divisor);
- remainder = (unsigned)(partial_dividend - (Q[i] * divisor));
+ Q[i] = Lo_32(partial_dividend / divisor);
+ remainder = Lo_32(partial_dividend - (Q[i] * divisor));
}
}
if (R)
@@ -1498,24 +1503,16 @@ void APInt::divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS,
// If the caller wants the quotient
if (Quotient) {
// Set up the Quotient value's memory.
- if (Quotient->BitWidth != LHS.BitWidth) {
- if (Quotient->isSingleWord())
- Quotient->U.VAL = 0;
- else
- delete [] Quotient->U.pVal;
- Quotient->BitWidth = LHS.BitWidth;
- if (!Quotient->isSingleWord())
- Quotient->U.pVal = getClearedMemory(Quotient->getNumWords());
- } else
- Quotient->clearAllBits();
+ Quotient->reallocate(LHS.BitWidth);
+ // Clear out any previous bits.
+ Quotient->clearAllBits();
// The quotient is in Q. Reconstitute the quotient into Quotient's low
// order words.
// This case is currently dead as all users of divide() handle trivial cases
// earlier.
if (lhsWords == 1) {
- uint64_t tmp =
- uint64_t(Q[0]) | (uint64_t(Q[1]) << (APINT_BITS_PER_WORD / 2));
+ uint64_t tmp = Make_64(Q[1], Q[0]);
if (Quotient->isSingleWord())
Quotient->U.VAL = tmp;
else
@@ -1523,30 +1520,21 @@ void APInt::divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS,
} else {
assert(!Quotient->isSingleWord() && "Quotient APInt not large enough");
for (unsigned i = 0; i < lhsWords; ++i)
- Quotient->U.pVal[i] =
- uint64_t(Q[i*2]) | (uint64_t(Q[i*2+1]) << (APINT_BITS_PER_WORD / 2));
+ Quotient->U.pVal[i] = Make_64(Q[i*2+1], Q[i*2]);
}
}
// If the caller wants the remainder
if (Remainder) {
// Set up the Remainder value's memory.
- if (Remainder->BitWidth != RHS.BitWidth) {
- if (Remainder->isSingleWord())
- Remainder->U.VAL = 0;
- else
- delete [] Remainder->U.pVal;
- Remainder->BitWidth = RHS.BitWidth;
- if (!Remainder->isSingleWord())
- Remainder->U.pVal = getClearedMemory(Remainder->getNumWords());
- } else
- Remainder->clearAllBits();
+ Remainder->reallocate(RHS.BitWidth);
+ // Clear out any previous bits.
+ Remainder->clearAllBits();
// The remainder is in R. Reconstitute the remainder into Remainder's low
// order words.
if (rhsWords == 1) {
- uint64_t tmp =
- uint64_t(R[0]) | (uint64_t(R[1]) << (APINT_BITS_PER_WORD / 2));
+ uint64_t tmp = Make_64(R[1], R[0]);
if (Remainder->isSingleWord())
Remainder->U.VAL = tmp;
else
@@ -1554,8 +1542,7 @@ void APInt::divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS,
} else {
assert(!Remainder->isSingleWord() && "Remainder APInt not large enough");
for (unsigned i = 0; i < rhsWords; ++i)
- Remainder->U.pVal[i] =
- uint64_t(R[i*2]) | (uint64_t(R[i*2+1]) << (APINT_BITS_PER_WORD / 2));
+ Remainder->U.pVal[i] = Make_64(R[i*2+1], R[i*2]);
}
}
@@ -1578,29 +1565,30 @@ APInt APInt::udiv(const APInt& RHS) const {
}
// Get some facts about the LHS and RHS number of bits and words
- unsigned rhsBits = RHS.getActiveBits();
- unsigned rhsWords = !rhsBits ? 0 : (APInt::whichWord(rhsBits - 1) + 1);
+ unsigned lhsWords = getNumWords(getActiveBits());
+ unsigned rhsBits = RHS.getActiveBits();
+ unsigned rhsWords = getNumWords(rhsBits);
assert(rhsWords && "Divided by zero???");
- unsigned lhsBits = this->getActiveBits();
- unsigned lhsWords = !lhsBits ? 0 : (APInt::whichWord(lhsBits - 1) + 1);
// Deal with some degenerate cases
if (!lhsWords)
// 0 / X ===> 0
return APInt(BitWidth, 0);
- else if (lhsWords < rhsWords || this->ult(RHS)) {
+ if (rhsBits == 1)
+ // X / 1 ===> X
+ return *this;
+ if (lhsWords < rhsWords || this->ult(RHS))
// X / Y ===> 0, iff X < Y
return APInt(BitWidth, 0);
- } else if (*this == RHS) {
+ if (*this == RHS)
// X / X ===> 1
return APInt(BitWidth, 1);
- } else if (lhsWords == 1 && rhsWords == 1) {
+ if (lhsWords == 1) // rhsWords is 1 if lhsWords is 1.
// All high words are zero, just use native divide
return APInt(BitWidth, this->U.pVal[0] / RHS.U.pVal[0]);
- }
// We have to compute it the hard way. Invoke the Knuth divide algorithm.
- APInt Quotient(1,0); // to hold result.
+ APInt Quotient; // to hold result.
divide(*this, lhsWords, RHS, rhsWords, &Quotient, nullptr);
return Quotient;
}
@@ -1624,31 +1612,32 @@ APInt APInt::urem(const APInt& RHS) const {
}
// Get some facts about the LHS
- unsigned lhsBits = getActiveBits();
- unsigned lhsWords = !lhsBits ? 0 : (whichWord(lhsBits - 1) + 1);
+ unsigned lhsWords = getNumWords(getActiveBits());
// Get some facts about the RHS
unsigned rhsBits = RHS.getActiveBits();
- unsigned rhsWords = !rhsBits ? 0 : (APInt::whichWord(rhsBits - 1) + 1);
+ unsigned rhsWords = getNumWords(rhsBits);
assert(rhsWords && "Performing remainder operation by zero ???");
// Check the degenerate cases
- if (lhsWords == 0) {
+ if (lhsWords == 0)
// 0 % Y ===> 0
return APInt(BitWidth, 0);
- } else if (lhsWords < rhsWords || this->ult(RHS)) {
+ if (rhsBits == 1)
+ // X % 1 ===> 0
+ return APInt(BitWidth, 0);
+ if (lhsWords < rhsWords || this->ult(RHS))
// X % Y ===> X, iff X < Y
return *this;
- } else if (*this == RHS) {
+ if (*this == RHS)
// X % X == 0;
return APInt(BitWidth, 0);
- } else if (lhsWords == 1) {
+ if (lhsWords == 1)
// All high words are zero, just use native remainder
return APInt(BitWidth, U.pVal[0] % RHS.U.pVal[0]);
- }
// We have to compute it the hard way. Invoke the Knuth divide algorithm.
- APInt Remainder(1,0);
+ APInt Remainder;
divide(*this, lhsWords, RHS, rhsWords, nullptr, &Remainder);
return Remainder;
}
@@ -1667,22 +1656,23 @@ APInt APInt::srem(const APInt &RHS) const {
void APInt::udivrem(const APInt &LHS, const APInt &RHS,
APInt &Quotient, APInt &Remainder) {
assert(LHS.BitWidth == RHS.BitWidth && "Bit widths must be the same");
+ unsigned BitWidth = LHS.BitWidth;
// First, deal with the easy case
if (LHS.isSingleWord()) {
assert(RHS.U.VAL != 0 && "Divide by zero?");
uint64_t QuotVal = LHS.U.VAL / RHS.U.VAL;
uint64_t RemVal = LHS.U.VAL % RHS.U.VAL;
- Quotient = APInt(LHS.BitWidth, QuotVal);
- Remainder = APInt(LHS.BitWidth, RemVal);
+ Quotient = APInt(BitWidth, QuotVal);
+ Remainder = APInt(BitWidth, RemVal);
return;
}
// Get some size facts about the dividend and divisor
- unsigned lhsBits = LHS.getActiveBits();
- unsigned lhsWords = !lhsBits ? 0 : (APInt::whichWord(lhsBits - 1) + 1);
+ unsigned lhsWords = getNumWords(LHS.getActiveBits());
unsigned rhsBits = RHS.getActiveBits();
- unsigned rhsWords = !rhsBits ? 0 : (APInt::whichWord(rhsBits - 1) + 1);
+ unsigned rhsWords = getNumWords(rhsBits);
+ assert(rhsWords && "Performing divrem operation by zero ???");
// Check the degenerate cases
if (lhsWords == 0) {
@@ -1691,6 +1681,11 @@ void APInt::udivrem(const APInt &LHS, const APInt &RHS,
return;
}
+  if (rhsBits == 1) {
+    Quotient = LHS; // X / 1 ===> X
+    Remainder = 0;  // X % 1 ===> 0
+    return;
+  }
+
if (lhsWords < rhsWords || LHS.ult(RHS)) {
Remainder = LHS; // X % Y ===> X, iff X < Y
Quotient = 0; // X / Y ===> 0, iff X < Y
@@ -1703,12 +1698,15 @@ void APInt::udivrem(const APInt &LHS, const APInt &RHS,
return;
}
- if (lhsWords == 1 && rhsWords == 1) {
+ if (lhsWords == 1) { // rhsWords is 1 if lhsWords is 1.
// There is only one word to consider so use the native versions.
- uint64_t lhsValue = LHS.isSingleWord() ? LHS.U.VAL : LHS.U.pVal[0];
- uint64_t rhsValue = RHS.isSingleWord() ? RHS.U.VAL : RHS.U.pVal[0];
- Quotient = APInt(LHS.getBitWidth(), lhsValue / rhsValue);
- Remainder = APInt(LHS.getBitWidth(), lhsValue % rhsValue);
+ uint64_t lhsValue = LHS.U.pVal[0];
+ uint64_t rhsValue = RHS.U.pVal[0];
+ // Make sure there is enough space to hold the results.
+ Quotient.reallocate(BitWidth);
+ Remainder.reallocate(BitWidth);
+ Quotient = lhsValue / rhsValue;
+ Remainder = lhsValue % rhsValue;
return;
}
@@ -1723,12 +1721,12 @@ void APInt::sdivrem(const APInt &LHS, const APInt &RHS,
APInt::udivrem(-LHS, -RHS, Quotient, Remainder);
else {
APInt::udivrem(-LHS, RHS, Quotient, Remainder);
- Quotient = -Quotient;
+ Quotient.negate();
}
- Remainder = -Remainder;
+ Remainder.negate();
} else if (RHS.isNegative()) {
APInt::udivrem(LHS, -RHS, Quotient, Remainder);
- Quotient = -Quotient;
+ Quotient.negate();
} else {
APInt::udivrem(LHS, RHS, Quotient, Remainder);
}
@@ -1859,10 +1857,8 @@ void APInt::fromString(unsigned numbits, StringRef str, uint8_t radix) {
*this += digit;
}
// If it's negative, put it in two's complement form
- if (isNeg) {
- --(*this);
- this->flipAllBits();
- }
+ if (isNeg)
+ this->negate();
}
void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix,
@@ -1940,8 +1936,7 @@ void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix,
// They want to print the signed version and it is a negative value
// Flip the bits and add one to turn it into the equivalent positive
// value and put a '-' in the result.
- Tmp.flipAllBits();
- ++Tmp;
+ Tmp.negate();
Str.push_back('-');
}
@@ -1961,22 +1956,19 @@ void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix,
unsigned ShiftAmt = (Radix == 16 ? 4 : (Radix == 8 ? 3 : 1));
unsigned MaskAmt = Radix - 1;
- while (Tmp != 0) {
+ while (Tmp.getBoolValue()) {
unsigned Digit = unsigned(Tmp.getRawData()[0]) & MaskAmt;
Str.push_back(Digits[Digit]);
Tmp.lshrInPlace(ShiftAmt);
}
} else {
- APInt divisor(Radix == 10? 4 : 8, Radix);
- while (Tmp != 0) {
- APInt APdigit(1, 0);
- APInt tmp2(Tmp.getBitWidth(), 0);
- divide(Tmp, Tmp.getNumWords(), divisor, divisor.getNumWords(), &tmp2,
- &APdigit);
+ APInt divisor(Tmp.getBitWidth(), Radix);
+ APInt APdigit;
+ while (Tmp.getBoolValue()) {
+ udivrem(Tmp, divisor, Tmp, APdigit);
unsigned Digit = (unsigned)APdigit.getZExtValue();
assert(Digit < Radix && "divide failed");
Str.push_back(Digits[Digit]);
- Tmp = tmp2;
}
}
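The rewritten loop is the classic repeated-divmod radix conversion, now reusing Tmp as both dividend and quotient since udivrem (re)allocates its outputs. On plain integers the algorithm looks like this:

    #include <cstdint>
    #include <string>

    // Emit the digits of V in the given radix (2..36). Mirrors the udivrem
    // loop above: divide, record the remainder as a digit, repeat until zero.
    std::string toRadix(uint64_t V, unsigned Radix) {
      static const char Digits[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
      if (V == 0)
        return "0";
      std::string S;
      while (V != 0) {
        S.insert(S.begin(), Digits[V % Radix]); // remainder is the next digit
        V /= Radix;
      }
      return S;
    }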
@@ -2346,13 +2338,11 @@ int APInt::tcMultiply(WordType *dst, const WordType *lhs,
return overflow;
}
-/* DST = LHS * RHS, where DST has width the sum of the widths of the
- operands. No overflow occurs. DST must be disjoint from both
- operands. Returns the number of parts required to hold the
- result. */
-unsigned APInt::tcFullMultiply(WordType *dst, const WordType *lhs,
- const WordType *rhs, unsigned lhsParts,
- unsigned rhsParts) {
+/// DST = LHS * RHS, where DST has width the sum of the widths of the
+/// operands. No overflow occurs. DST must be disjoint from both operands.
+void APInt::tcFullMultiply(WordType *dst, const WordType *lhs,
+ const WordType *rhs, unsigned lhsParts,
+ unsigned rhsParts) {
/* Put the narrower number on the LHS for fewer loops below. */
if (lhsParts > rhsParts)
return tcFullMultiply(dst, rhs, lhs, rhsParts, lhsParts);
@@ -2363,10 +2353,6 @@ unsigned APInt::tcFullMultiply(WordType *dst, const WordType *lhs,
for (unsigned i = 0; i < lhsParts; i++)
tcMultiplyPart(&dst[i], rhs, lhs[i], 0, rhsParts, rhsParts + 1, true);
-
- unsigned n = lhsParts + rhsParts;
-
- return n - (dst[n - 1] == 0);
}
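tcFullMultiply can drop its return value because the result width is fixed: an m-part number times an n-part number always fits in m + n parts, so the caller knows the width up front. A one-part illustration (assuming a compiler with __uint128_t):

    #include <cstdint>

    // 1 + 1 = 2 words always suffice for a full 64x64 multiply; no "parts
    // actually used" count is needed.
    void mul64x64(uint64_t A, uint64_t B, uint64_t Dst[2]) {
      __uint128_t P = static_cast<__uint128_t>(A) * B;
      Dst[0] = static_cast<uint64_t>(P);       // low word
      Dst[1] = static_cast<uint64_t>(P >> 64); // high word
    }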
/* If RHS is zero, LHS and REMAINDER are left unchanged; return one.
@@ -2400,22 +2386,20 @@ int APInt::tcDivide(WordType *lhs, const WordType *rhs,
/* Loop, subtracting SRHS if REMAINDER is greater and adding that to
the total. */
for (;;) {
- int compare;
-
- compare = tcCompare(remainder, srhs, parts);
- if (compare >= 0) {
- tcSubtract(remainder, srhs, 0, parts);
- lhs[n] |= mask;
- }
+ int compare = tcCompare(remainder, srhs, parts);
+ if (compare >= 0) {
+ tcSubtract(remainder, srhs, 0, parts);
+ lhs[n] |= mask;
+ }
- if (shiftCount == 0)
- break;
- shiftCount--;
- tcShiftRight(srhs, parts, 1);
- if ((mask >>= 1) == 0) {
- mask = (WordType) 1 << (APINT_BITS_PER_WORD - 1);
- n--;
- }
+ if (shiftCount == 0)
+ break;
+ shiftCount--;
+ tcShiftRight(srhs, parts, 1);
+ if ((mask >>= 1) == 0) {
+ mask = (WordType) 1 << (APINT_BITS_PER_WORD - 1);
+ n--;
+ }
}
return false;
diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt
index 63c440037c22..83376284548f 100644
--- a/lib/Support/CMakeLists.txt
+++ b/lib/Support/CMakeLists.txt
@@ -81,6 +81,7 @@ add_llvm_library(LLVMSupport
MD5.cpp
NativeFormatting.cpp
Options.cpp
+ Parallel.cpp
PluginLoader.cpp
PrettyStackTrace.cpp
RandomNumberGenerator.cpp
diff --git a/lib/Support/Parallel.cpp b/lib/Support/Parallel.cpp
new file mode 100644
index 000000000000..ab2cfdebf07d
--- /dev/null
+++ b/lib/Support/Parallel.cpp
@@ -0,0 +1,138 @@
+//===- llvm/Support/Parallel.cpp - Parallel algorithms --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Parallel.h"
+#include "llvm/Config/llvm-config.h"
+
+#include <atomic>
+#include <condition_variable>
+#include <functional>
+#include <mutex>
+#include <stack>
+#include <thread>
+
+using namespace llvm;
+
+namespace {
+
+/// \brief An abstract class that takes closures and runs them asynchronously.
+class Executor {
+public:
+ virtual ~Executor() = default;
+ virtual void add(std::function<void()> func) = 0;
+
+ static Executor *getDefaultExecutor();
+};
+
+#if !LLVM_ENABLE_THREADS
+class SyncExecutor : public Executor {
+public:
+  void add(std::function<void()> F) override { F(); }
+};
+
+Executor *Executor::getDefaultExecutor() {
+ static SyncExecutor Exec;
+ return &Exec;
+}
+
+#elif defined(_MSC_VER)
+/// \brief An Executor that runs tasks via ConcRT.
+class ConcRTExecutor : public Executor {
+ struct Taskish {
+ Taskish(std::function<void()> Task) : Task(Task) {}
+
+ std::function<void()> Task;
+
+ static void run(void *P) {
+ Taskish *Self = static_cast<Taskish *>(P);
+ Self->Task();
+ concurrency::Free(Self);
+ }
+ };
+
+public:
+  void add(std::function<void()> F) override {
+ Concurrency::CurrentScheduler::ScheduleTask(
+ Taskish::run, new (concurrency::Alloc(sizeof(Taskish))) Taskish(F));
+ }
+};
+
+Executor *Executor::getDefaultExecutor() {
+ static ConcRTExecutor exec;
+ return &exec;
+}
+
+#else
+/// \brief An implementation of an Executor that runs closures on a thread pool
+/// in FILO order.
+class ThreadPoolExecutor : public Executor {
+public:
+ explicit ThreadPoolExecutor(
+ unsigned ThreadCount = std::thread::hardware_concurrency())
+ : Done(ThreadCount) {
+    // Spawn the workers from a separate thread, since spawning threads can
+    // take a while; that spawning thread then becomes the final worker.
+ std::thread([&, ThreadCount] {
+ for (size_t i = 1; i < ThreadCount; ++i) {
+ std::thread([=] { work(); }).detach();
+ }
+ work();
+ }).detach();
+ }
+
+ ~ThreadPoolExecutor() override {
+ std::unique_lock<std::mutex> Lock(Mutex);
+ Stop = true;
+ Lock.unlock();
+ Cond.notify_all();
+ // Wait for ~Latch.
+ }
+
+ void add(std::function<void()> F) override {
+ std::unique_lock<std::mutex> Lock(Mutex);
+ WorkStack.push(F);
+ Lock.unlock();
+ Cond.notify_one();
+ }
+
+private:
+ void work() {
+ while (true) {
+ std::unique_lock<std::mutex> Lock(Mutex);
+ Cond.wait(Lock, [&] { return Stop || !WorkStack.empty(); });
+ if (Stop)
+ break;
+ auto Task = WorkStack.top();
+ WorkStack.pop();
+ Lock.unlock();
+ Task();
+ }
+ Done.dec();
+ }
+
+ std::atomic<bool> Stop{false};
+ std::stack<std::function<void()>> WorkStack;
+ std::mutex Mutex;
+ std::condition_variable Cond;
+ parallel::detail::Latch Done;
+};
+
+Executor *Executor::getDefaultExecutor() {
+ static ThreadPoolExecutor exec;
+ return &exec;
+}
+#endif
+} // end anonymous namespace
+
+#if LLVM_ENABLE_THREADS
+void parallel::detail::TaskGroup::spawn(std::function<void()> F) {
+ L.inc();
+ Executor::getDefaultExecutor()->add([&, F] {
+ F();
+ L.dec();
+ });
+}
+#endif
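A sketch of how this executor is reached through the public API, assuming the TaskGroup interface declared in llvm/Support/Parallel.h (spawn plus a destructor that waits on the Latch shown above); doWork is a hypothetical per-item function:

    #include "llvm/Support/Parallel.h"

    void doWork(int Item); // hypothetical per-item work

    // Run N independent closures on the default executor and wait for all
    // of them. Assumes ~TaskGroup blocks until every spawned task has run.
    void runAll(int N) {
      llvm::parallel::detail::TaskGroup TG;
      for (int i = 0; i < N; ++i)
        TG.spawn([i] { doWork(i); });
    } // TG's destructor joins the outstanding tasks here.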
diff --git a/lib/Support/Unix/Path.inc b/lib/Support/Unix/Path.inc
index fa28ba1b6ab6..cdea09be41e0 100644
--- a/lib/Support/Unix/Path.inc
+++ b/lib/Support/Unix/Path.inc
@@ -103,13 +103,16 @@
#define STATVFS_F_FLAG(vfs) (vfs).f_flags
#endif
+#if defined(__FreeBSD__) || defined(__NetBSD__)
+#include <sys/sysctl.h>
+#endif
+
using namespace llvm;
namespace llvm {
namespace sys {
namespace fs {
-#if defined(__FreeBSD__) || defined (__NetBSD__) || defined(__Bitrig__) || \
- defined(__OpenBSD__) || defined(__minix) || defined(__FreeBSD_kernel__) || \
+#if defined(__Bitrig__) || defined(__OpenBSD__) || defined(__minix) || \
defined(__linux__) || defined(__CYGWIN__) || defined(__DragonFly__) || \
defined(_AIX)
static int
@@ -164,7 +167,7 @@ getprogpath(char ret[PATH_MAX], const char *bin)
free(pv);
return nullptr;
}
-#endif // __FreeBSD__ || __NetBSD__ || __FreeBSD_kernel__
+#endif // Bitrig || OpenBSD || minix || linux || CYGWIN || DragonFly || AIX
/// GetMainExecutable - Return the path to the main executable, given the
/// value of argv[0] from program startup.
@@ -180,9 +183,24 @@ std::string getMainExecutable(const char *argv0, void *MainAddr) {
if (realpath(exe_path, link_path))
return link_path;
}
-#elif defined(__FreeBSD__) || defined (__NetBSD__) || defined(__Bitrig__) || \
- defined(__OpenBSD__) || defined(__minix) || defined(__DragonFly__) || \
- defined(__FreeBSD_kernel__) || defined(_AIX)
+#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__) || defined(__NetBSD__)
+ int mib[4];
+ mib[0] = CTL_KERN;
+#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
+ mib[1] = KERN_PROC;
+ mib[2] = KERN_PROC_PATHNAME;
+ mib[3] = -1;
+#else
+ mib[1] = KERN_PROC_ARGS;
+ mib[2] = -1;
+ mib[3] = KERN_PROC_PATHNAME;
+#endif
+ char exe_path[PATH_MAX];
+ size_t cb = sizeof(exe_path);
+ if (sysctl(mib, 4, exe_path, &cb, NULL, 0) == 0)
+ return exe_path;
+#elif defined(__Bitrig__) || defined(__OpenBSD__) || defined(__minix) || \
+ defined(__DragonFly__) || defined(_AIX)
char exe_path[PATH_MAX];
if (getprogpath(exe_path, argv0) != NULL)
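On FreeBSD and NetBSD the kernel can report the executable path directly, which is what the new sysctl block does. A self-contained FreeBSD-flavored sketch of the same call:

    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <limits.h>
    #include <string>

    // Ask the kernel for the current process's executable path; -1 selects
    // the calling process. Returns an empty string on failure.
    std::string currentExePath() {
      int Mib[4] = {CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1};
      char Path[PATH_MAX];
      size_t Len = sizeof(Path);
      if (sysctl(Mib, 4, Path, &Len, nullptr, 0) == 0)
        return Path;
      return std::string();
    }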
diff --git a/lib/Support/Unix/Process.inc b/lib/Support/Unix/Process.inc
index 16f8f5a98e52..1d0143c6716e 100644
--- a/lib/Support/Unix/Process.inc
+++ b/lib/Support/Unix/Process.inc
@@ -347,7 +347,7 @@ static bool terminalHasColors(int fd) {
MutexGuard G(*TermColorMutex);
int errret = 0;
- if (setupterm((char *)nullptr, fd, &errret) != 0)
+ if (setupterm(nullptr, fd, &errret) != 0)
// Regardless of why, if we can't get terminfo, we shouldn't try to print
// colors.
return false;
@@ -369,7 +369,7 @@ static bool terminalHasColors(int fd) {
// Now extract the structure allocated by setupterm and free its memory
// through a really silly dance.
- struct term *termp = set_curterm((struct term *)nullptr);
+ struct term *termp = set_curterm(nullptr);
(void)del_curterm(termp); // Drop any errors here.
// Return true if we found color capabilities for the current terminal.
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td
index 73f2b6a25f66..4af5fef4287c 100644
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@@ -216,6 +216,7 @@ def ProcA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72",
FeatureCRC,
FeatureCrypto,
FeatureFPARMv8,
+ FeatureFuseAES,
FeatureNEON,
FeaturePerfMon
]>;
diff --git a/lib/Target/AArch64/AArch64CallLowering.cpp b/lib/Target/AArch64/AArch64CallLowering.cpp
index ff3e4c40e2c2..29f6d571d6bd 100644
--- a/lib/Target/AArch64/AArch64CallLowering.cpp
+++ b/lib/Target/AArch64/AArch64CallLowering.cpp
@@ -380,7 +380,7 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
MIRBuilder.buildSequence(OrigRet.Reg, SplitRegs, RegOffsets);
}
- CallSeqStart.addImm(Handler.StackSize);
+ CallSeqStart.addImm(Handler.StackSize).addImm(0);
MIRBuilder.buildInstr(AArch64::ADJCALLSTACKUP)
.addImm(Handler.StackSize)
.addImm(0);
diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp
index 083708001757..9ac7ecb9cdb4 100644
--- a/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/lib/Target/AArch64/AArch64FastISel.cpp
@@ -3014,7 +3014,7 @@ bool AArch64FastISel::processCallArgs(CallLoweringInfo &CLI,
// Issue CALLSEQ_START
unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
- .addImm(NumBytes);
+ .addImm(NumBytes).addImm(0);
// Process the args.
for (CCValAssign &VA : ArgLocs) {
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 4b1bb27dce73..4f7c2e122390 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2265,7 +2265,7 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
SDValue Callee =
DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
- StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr);
+ StructType *RetTy = StructType::get(ArgTy, ArgTy);
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl)
.setChain(DAG.getEntryNode())
@@ -3249,9 +3249,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
if (!IsSibCall)
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, DL,
- true),
- DL);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
getPointerTy(DAG.getDataLayout()));
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index cb268828455e..c42738da7ab0 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -3427,6 +3427,10 @@ static bool getFMAPatterns(MachineInstr &Root,
Patterns.push_back(MachineCombinerPattern::FMLSv1i32_indexed_OP2);
Found = true;
}
+ if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULSrr)) {
+ Patterns.push_back(MachineCombinerPattern::FNMULSUBS_OP1);
+ Found = true;
+ }
break;
case AArch64::FSUBDrr:
if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) {
@@ -3441,6 +3445,10 @@ static bool getFMAPatterns(MachineInstr &Root,
Patterns.push_back(MachineCombinerPattern::FMLSv1i64_indexed_OP2);
Found = true;
}
+ if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULDrr)) {
+ Patterns.push_back(MachineCombinerPattern::FNMULSUBD_OP1);
+ Found = true;
+ }
break;
case AArch64::FSUBv2f32:
if (canCombineWithFMUL(MBB, Root.getOperand(2),
@@ -3495,6 +3503,8 @@ AArch64InstrInfo::isThroughputPattern(MachineCombinerPattern Pattern) const {
case MachineCombinerPattern::FMULADDD_OP2:
case MachineCombinerPattern::FMULSUBD_OP1:
case MachineCombinerPattern::FMULSUBD_OP2:
+ case MachineCombinerPattern::FNMULSUBS_OP1:
+ case MachineCombinerPattern::FNMULSUBD_OP1:
case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
@@ -3996,6 +4006,24 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;
}
+
+ case MachineCombinerPattern::FNMULSUBS_OP1:
+ case MachineCombinerPattern::FNMULSUBD_OP1: {
+ // FNMUL I=A,B,0
+ // FSUB R,I,C
+ // ==> FNMADD R,A,B,C // = -A*B - C
+ // --- Create(FNMADD);
+ if (Pattern == MachineCombinerPattern::FNMULSUBS_OP1) {
+ Opc = AArch64::FNMADDSrrr;
+ RC = &AArch64::FPR32RegClass;
+ } else {
+ Opc = AArch64::FNMADDDrrr;
+ RC = &AArch64::FPR64RegClass;
+ }
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+ break;
+ }
+
case MachineCombinerPattern::FMULSUBS_OP2:
case MachineCombinerPattern::FMULSUBD_OP2: {
// FMUL I=A,B,0
@@ -4011,6 +4039,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
}
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
+ }
case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
Opc = AArch64::FMLSv1i32_indexed;
@@ -4067,7 +4096,6 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
FMAInstKind::Accumulator);
}
break;
- }
} // end switch (Pattern)
// Record MUL and ADD/SUB for deletion
DelInstrs.push_back(MUL);
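The new pattern folds a negated multiply feeding a subtract into a single fused FNMADD. The value identity, written out in scalar C++ for reference (the fused form rounds once, so results can differ in the last bit):

    #include <cmath>

    // FNMUL I = -(A*B); FSUB R = I - C; so R = -(A*B) - C, which is what
    // AArch64's FNMADD computes, modulo the single rounding of the fusion.
    double unfused(double A, double B, double C) { return -(A * B) - C; }
    double fused(double A, double B, double C) { return std::fma(-A, B, -C); }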
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index 902b08844216..5ddf66654a67 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -156,7 +156,8 @@ def AArch64adrp : SDNode<"AArch64ISD::ADRP", SDTIntUnaryOp, []>;
def AArch64addlow : SDNode<"AArch64ISD::ADDlow", SDTIntBinOp, []>;
def AArch64LOADgot : SDNode<"AArch64ISD::LOADgot", SDTIntUnaryOp>;
def AArch64callseq_start : SDNode<"ISD::CALLSEQ_START",
- SDCallSeqStart<[ SDTCisVT<0, i32> ]>,
+ SDCallSeqStart<[ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32> ]>,
[SDNPHasChain, SDNPOutGlue]>;
def AArch64callseq_end : SDNode<"ISD::CALLSEQ_END",
SDCallSeqEnd<[ SDTCisVT<0, i32>,
@@ -328,8 +329,9 @@ include "AArch64InstrFormats.td"
let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in {
// We set Sched to the empty list because we expect these instructions to
// simply get removed in most cases.
-def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
- [(AArch64callseq_start timm:$amt)]>, Sched<[]>;
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ [(AArch64callseq_start timm:$amt1, timm:$amt2)]>,
+ Sched<[]>;
def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
[(AArch64callseq_end timm:$amt1, timm:$amt2)]>,
Sched<[]>;
diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
index 5f895903da6f..789270c2a34b 100644
--- a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
+++ b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
@@ -529,9 +529,34 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// for the greedy mode the cost of the cross bank copy will
// offset this number.
// FIXME: Should be derived from the scheduling model.
- if (OpRegBankIdx[0] >= PMI_FirstFPR)
+ if (OpRegBankIdx[0] != PMI_FirstGPR)
Cost = 2;
+ else
+ // Check if that load feeds fp instructions.
+ // In that case, we want the default mapping to be on FPR
+ // instead of blind map every scalar to GPR.
+ for (const MachineInstr &UseMI :
+ MRI.use_instructions(MI.getOperand(0).getReg()))
+ // If we have at least one direct use in a FP instruction,
+ // assume this was a floating point load in the IR.
+ // If it was not, we would have had a bitcast before
+ // reaching that instruction.
+ if (isPreISelGenericFloatingPointOpcode(UseMI.getOpcode())) {
+ OpRegBankIdx[0] = PMI_FirstFPR;
+ break;
+ }
break;
+ case TargetOpcode::G_STORE:
+ // Check if that store is fed by fp instructions.
+ if (OpRegBankIdx[0] == PMI_FirstGPR) {
+ unsigned VReg = MI.getOperand(0).getReg();
+ if (!VReg)
+ break;
+ MachineInstr *DefMI = MRI.getVRegDef(VReg);
+ if (isPreISelGenericFloatingPointOpcode(DefMI->getOpcode()))
+ OpRegBankIdx[0] = PMI_FirstFPR;
+ break;
+ }
}
// Finally construct the computed mapping.
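At the C level, the case this heuristic improves is a scalar FP value that only passes through memory and FP arithmetic; keeping the load's result on FPR avoids a cross-bank copy:

    // Without the use-scan, the G_LOAD result defaults to GPR and the fadd
    // forces a GPR-to-FPR copy; with it, the load maps straight to FPR.
    float addLoaded(const float *P, float X) {
      return *P + X; // the load's only user is an FP instruction
    }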
diff --git a/lib/Target/AArch64/AArch64SchedFalkorDetails.td b/lib/Target/AArch64/AArch64SchedFalkorDetails.td
index 8f8eeef8a6cf..a9b4d44a523e 100644
--- a/lib/Target/AArch64/AArch64SchedFalkorDetails.td
+++ b/lib/Target/AArch64/AArch64SchedFalkorDetails.td
@@ -42,11 +42,11 @@ def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^FCVT(N|M|P|Z|A)(S|U)(v1i32|v1
def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs FCVTXNv1i64)>;
def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^FCVTZ(S|U)(v2i32|v4i16)(_shift)?$")>;
-def : InstRW<[FalkorWr_1VXVY_5cyc], (instregex "^(FMUL|FMULX)(v2f32|v4f16|(v1i16_indexed|v4i16_indexed|v1i32_indexed|v2i32_indexed))$")>;
-def : InstRW<[FalkorWr_1VXVY_5cyc], (instrs FMULX16, FMULX32)>;
+def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], (instregex "^(FMUL|FMULX)(v2f32|v4f16|(v1i16_indexed|v4i16_indexed|v1i32_indexed|v2i32_indexed))$")>;
+def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], (instrs FMULX16, FMULX32)>;
-def : InstRW<[FalkorWr_1VXVY_6cyc], (instregex "^(FMUL|FMULX)v1i64_indexed$")>;
-def : InstRW<[FalkorWr_1VXVY_6cyc], (instrs FMULX64)>;
+def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], (instregex "^(FMUL|FMULX)v1i64_indexed$")>;
+def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], (instrs FMULX64)>;
def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(FABS|FNEG)(v2f64|v4f32|v8f16)$")>;
@@ -62,9 +62,9 @@ def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^FCVT(N|M|P|Z|A)(S|U)(v2f64|v4
def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(FCVTL|FCVTL2)(v2i32|v4i16|v4i32|v8i16)$")>;
def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^FCVTZ(S|U)(v2i64|v4i32|v8i16)(_shift)?$")>;
-def : InstRW<[FalkorWr_2VXVY_5cyc], (instregex "^(FMUL|FMULX)(v2f64|v4f32|v8f16|v8i16_indexed|v4i32_indexed)$")>;
+def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc], (instregex "^(FMUL|FMULX)(v2f64|v4f32|v8f16|v8i16_indexed|v4i32_indexed)$")>;
-def : InstRW<[FalkorWr_2VXVY_6cyc], (instregex "^(FMUL|FMULX)v2i64_indexed$")>;
+def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc], (instregex "^(FMUL|FMULX)v2i64_indexed$")>;
def : InstRW<[FalkorWr_3VXVY_4cyc], (instregex "^(FCVTX?N|FCVTX?N2)(v1i32|v1i64|v1f16|v2f32|v4f16)$")>;
@@ -72,13 +72,14 @@ def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^(FCVTX?N|FCVTX?N2)(v2i32|v4i1
def : InstRW<[FalkorWr_2VX_2VY_2cyc], (instregex "^(FDIV|FSQRT)(v2f64|v4f32|v8f16)$")>;
-def : InstRW<[FalkorWr_1VXVY_4cyc, FalkorReadVMA],(instregex "^ML(A|S)(v8i8|v4i16|v2i32)(_indexed)?$")>;
-def : InstRW<[FalkorWr_2VXVY_4cyc, FalkorReadVMA],(instregex "^ML(A|S)(v16i8|v8i16|v4i32|v2i64)(_indexed)?$")>;
+def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA], (instregex "^ML(A|S)(v8i8|v4i16|v2i32)(_indexed)?$")>;
+def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], (instregex "^ML(A|S)(v16i8|v8i16|v4i32|v2i64)(_indexed)?$")>;
+
+def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc, FalkorReadFMA32], (instregex "^FML(A|S)(v2f32|v4f16|(v1i16_indexed|v4i16_indexed|v1i32_indexed|v2i32_indexed))$")>;
+def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc, FalkorReadFMA64], (instregex "^FML(A|S)v1i64_indexed$")>;
+def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc, FalkorReadFMA32], (instregex "^FML(A|S)(v4f32|v8f16|v8i16_indexed|v4i32_indexed)$")>;
+def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc, FalkorReadFMA64], (instregex "^FML(A|S)(v2f64|v2i64_indexed)$")>;
-def : InstRW<[FalkorWr_1VXVY_5cyc, FalkorReadFMA],(instregex "^FML(A|S)(v2f32|v4f16|(v1i16_indexed|v4i16_indexed|v1i32_indexed|v2i32_indexed))$")>;
-def : InstRW<[FalkorWr_1VXVY_6cyc, FalkorReadFMA],(instregex "^FML(A|S)v1i64_indexed$")>;
-def : InstRW<[FalkorWr_2VXVY_5cyc, FalkorReadFMA],(instregex "^FML(A|S)(v2f64|v4f32|v8f16|v8i16_indexed|v4i32_indexed)$")>;
-def : InstRW<[FalkorWr_2VXVY_6cyc, FalkorReadFMA],(instregex "^FML(A|S)v2i64_indexed$")>;
// SIMD Integer Instructions
// -----------------------------------------------------------------------------
def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^ADD(v1i64|v2i32|v4i16|v8i8)$")>;
@@ -119,10 +120,10 @@ def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^SQNEG(v1i8|v1i16|v1i32|v1i64)
def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)ADDLVv8i8v$")>;
def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)?(MAX|MIN)V(v8i8v|v8i16v)$")>;
def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs ADDVv8i8v)>;
-def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^MUL(v2i32|v4i16|v8i8)(_indexed)?$")>;
-def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^SQR?DMULH(v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>;
-def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^SQDMULL(i16|i32)$")>;
-def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^SQRDML(A|S)?H(v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>;
+def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc], (instregex "^MUL(v2i32|v4i16|v8i8)(_indexed)?$")>;
+def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc], (instregex "^SQR?DMULH(v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>;
+def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc], (instregex "^SQDMULL(i16|i32)$")>;
+def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA], (instregex "^SQRDML(A|S)H(i16|i32|v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>;
def : InstRW<[FalkorWr_1VXVY_5cyc], (instregex "^(S|U)?(MAX|MIN)Vv16i8v$")>;
@@ -169,9 +170,9 @@ def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^PMULL2?(v1i64|v2i64)$")>;
def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^S(L|R)I(v16i8|v8i16|v4i32|v2i64)_shift$")>;
def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^SQ(ABS|NEG)(v16i8|v8i16|v4i32|v2i64)$")>;
-def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>;
-def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^SQDMULLv.*$")>;
-def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^SQRDML(A|S)H(v16i8|v8i16|v4i32)(_indexed)?$")>;
+def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc], (instregex "^(MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>;
+def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc], (instregex "^SQDMULLv.*$")>;
+def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], (instregex "^SQRDML(A|S)H(v16i8|v8i16|v4i32)(_indexed)?$")>;
def : InstRW<[FalkorWr_3VXVY_3cyc], (instregex "^(S|U)ADDLVv4i32v$")>;
@@ -185,8 +186,9 @@ def : InstRW<[FalkorWr_4VXVY_3cyc], (instregex "^(S|U)ABALv.*$")>;
def : InstRW<[FalkorWr_4VXVY_4cyc], (instregex "^(S|U)ABA(v16i8|v8i16|v4i32)$")>;
-def : InstRW<[FalkorWr_1VXVY_4cyc, FalkorReadVMA],(instregex "^SQD(MLAL|MLSL)(i16|i32)$")>;
-def : InstRW<[FalkorWr_2VXVY_4cyc, FalkorReadVMA],(instregex "^SQD(MLAL|MLSL)v.*$")>;
+def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA], (instregex "^SQD(MLAL|MLSL)(i16|i32|v1i32_indexed|v1i64_indexed)$")>;
+def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], (instregex "^SQD(MLAL|MLSL)v[248].*$")>;
+
// SIMD Load Instructions
// -----------------------------------------------------------------------------
def : InstRW<[WriteVLD], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))$")>;
@@ -294,9 +296,9 @@ def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FRECPEv1i32, FRECPEv1i64, FRSQRTEv
def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FRECPXv1i32, FRECPXv1i64)>;
def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs URECPEv2i32, URSQRTEv2i32)>;
-def : InstRW<[FalkorWr_1VXVY_5cyc], (instrs FRECPS32, FRSQRTS32, FRECPSv2f32, FRSQRTSv2f32)>;
+def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], (instrs FRECPS32, FRSQRTS32, FRECPSv2f32, FRSQRTSv2f32)>;
-def : InstRW<[FalkorWr_1VXVY_6cyc], (instrs FRECPS64, FRSQRTS64)>;
+def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], (instrs FRECPS64, FRSQRTS64)>;
def : InstRW<[FalkorWr_1GTOV_1VXVY_2cyc],(instregex "^INSv(i32|i64)(gpr|lane)$")>;
def : InstRW<[FalkorWr_2GTOV_1cyc], (instregex "^DUP(v4i32|v2i64)(gpr|lane)$")>;
@@ -311,9 +313,9 @@ def : InstRW<[FalkorWr_2VXVY_3cyc], (instrs URECPEv4i32, URSQRTEv4i32)>;
def : InstRW<[FalkorWr_2VXVY_4cyc], (instrs TBLv8i8Two)>;
def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^TBX(v8|v16)i8One$")>;
-def : InstRW<[FalkorWr_2VXVY_5cyc], (instrs FRECPSv4f32, FRSQRTSv4f32)>;
+def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc], (instrs FRECPSv4f32, FRSQRTSv4f32)>;
-def : InstRW<[FalkorWr_2VXVY_6cyc], (instrs FRECPSv2f64, FRSQRTSv2f64)>;
+def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc], (instrs FRECPSv2f64, FRSQRTSv2f64)>;
def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^TBL(v8i8Three|v16i8Two)$")>;
def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^TBX(v8i8Two|v16i8Two)$")>;
@@ -416,22 +418,25 @@ def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FCVTSHr, FCVTDHr)>;
def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs FCVTSDr, FCVTDSr)>;
-def : InstRW<[FalkorWr_1VXVY_5cyc], (instregex "^F(N)?MUL(H|S)rr$")>;
+def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], (instregex "^F(N)?MUL(H|S)rr$")>;
-def : InstRW<[FalkorWr_1VXVY_6cyc], (instregex "^F(N)?MULDrr$")>;
+def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], (instregex "^F(N)?MULDrr$")>;
def : InstRW<[FalkorWr_1VX_1VY_10cyc],(instregex "^FDIV(H|S|D)rr$")>;
def : InstRW<[FalkorWr_1VX_1VY_2cyc], (instregex "^FSQRT(H|S|D)r$")>;
-def : InstRW<[FalkorWr_1VXVY_5cyc, FalkorReadFMA],(instregex "^F(N)?M(ADD|SUB)(H|S)rrr$")>;
-def : InstRW<[FalkorWr_1VXVY_6cyc, FalkorReadFMA],(instregex "^F(N)?M(ADD|SUB)Drrr$")>;
+def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc, ReadDefault, ReadDefault, FalkorReadFMA32], (instregex "^F(N)?M(ADD|SUB)(H|S)rrr$")>;
+def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc, ReadDefault, ReadDefault, FalkorReadFMA64], (instregex "^F(N)?M(ADD|SUB)Drrr$")>;
// FP Miscellaneous Instructions
// -----------------------------------------------------------------------------
-def : InstRW<[FalkorWr_FMOV], (instregex "^FMOV(HW|HX|SW|DX|DXHigh)r$")>;
+def : InstRW<[FalkorWr_FMOV], (instregex "^FMOV(H|S|D)i$")>;
+def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^FMOV(HW|HX|SW|DX|DXHigh)r$")>;
def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVTZ(S|U)(S|U)(W|X)(D|S)ri?$")>;
def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FMOV(WH|WS|XH|XD|XDHigh)r$")>;
-def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FMOV(Hi|Hr|S0|Si|Sr|D0|Di|Dr|v.*_ns)$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FMOV(Hr|Sr|Dr|v.*_ns)$")>;
+// FIXME: We are currently generating movi v0.2d, #0 for these, which is worse than fmov 0.0
+def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs FMOVD0, FMOVS0)>;
def : InstRW<[FalkorWr_1GTOV_4cyc], (instregex "^(S|U)CVTF(S|U)(W|X)(D|S)ri$")>;
def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)CVTF(v1i16|v1i32|v2i32|v1i64|v4i16|v2f32|v4f16|d|s)(_shift)?")>;
@@ -475,16 +480,17 @@ def : InstRW<[FalkorWr_2XYZ_2cyc], (instregex "^EXTR(W|X)rri$")>;
// Divide and Multiply Instructions
// -----------------------------------------------------------------------------
-def : InstRW<[FalkorWr_1X_4cyc], (instregex "^(S|U)M(ADD|SUB)Lrrr$")>;
-def : InstRW<[FalkorWr_1X_4cyc], (instregex "^M(ADD|SUB)Wrrr$")>;
+def : InstRW<[FalkorWr_IMUL64_1X_4cyc, ReadDefault, ReadDefault, FalkorReadIMA64], (instregex "^(S|U)M(ADD|SUB)Lrrr$")>;
+def : InstRW<[FalkorWr_IMUL32_1X_2cyc, ReadDefault, ReadDefault, FalkorReadIMA32], (instregex "^M(ADD|SUB)Wrrr$")>;
-def : InstRW<[FalkorWr_1X_5cyc], (instregex "^(S|U)MULHrr$")>;
-def : InstRW<[FalkorWr_1X_5cyc], (instregex "^M(ADD|SUB)Xrrr$")>;
+def : InstRW<[FalkorWr_IMUL64_1X_5cyc], (instregex "^(S|U)MULHrr$")>;
+def : InstRW<[FalkorWr_IMUL64_1X_5cyc, ReadDefault, ReadDefault, FalkorReadIMA64], (instregex "^M(ADD|SUB)Xrrr$")>;
def : InstRW<[FalkorWr_1X_1Z_8cyc], (instregex "^(S|U)DIVWr$")>;
def : InstRW<[FalkorWr_1X_1Z_16cyc], (instregex "^(S|U)DIVXr$")>;
-def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(S|U)(MLAL|MLSL|MULL)v.*$")>;
+def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc], (instregex "^(S|U)MULLv.*$")>;
+def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], (instregex "^(S|U)(MLAL|MLSL)v.*$")>;
// Move and Shift Instructions
// -----------------------------------------------------------------------------
diff --git a/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td b/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td
index e64b2c441a19..6526cc28e806 100644
--- a/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td
+++ b/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td
@@ -29,8 +29,9 @@
// Define 1 micro-op types
def FalkorWr_1X_2cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 2; }
-def FalkorWr_1X_4cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; }
-def FalkorWr_1X_5cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 5; }
+def FalkorWr_IMUL32_1X_2cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; }
+def FalkorWr_IMUL64_1X_4cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; }
+def FalkorWr_IMUL64_1X_5cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 5; }
def FalkorWr_1Z_0cyc : SchedWriteRes<[FalkorUnitZ]> { let Latency = 0; }
def FalkorWr_1ZB_0cyc : SchedWriteRes<[FalkorUnitZB]> { let Latency = 0; }
def FalkorWr_1LD_3cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 3; }
@@ -45,8 +46,10 @@ def FalkorWr_1VXVY_1cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 1; }
def FalkorWr_1VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 2; }
def FalkorWr_1VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 3; }
def FalkorWr_1VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 4; }
+def FalkorWr_VMUL32_1VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 4; }
def FalkorWr_1VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 5; }
-def FalkorWr_1VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 6; }
+def FalkorWr_FMUL32_1VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 5; }
+def FalkorWr_FMUL64_1VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 6; }
def FalkorWr_1LD_0cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 0; }
def FalkorWr_1ST_0cyc : SchedWriteRes<[FalkorUnitST]> { let Latency = 0; }
@@ -75,14 +78,26 @@ def FalkorWr_2VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
let Latency = 4;
let NumMicroOps = 2;
}
+def FalkorWr_VMUL32_2VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
def FalkorWr_2VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
let Latency = 5;
let NumMicroOps = 2;
}
+def FalkorWr_FMUL32_2VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
def FalkorWr_2VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
let Latency = 6;
let NumMicroOps = 2;
}
+def FalkorWr_FMUL64_2VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
def FalkorWr_1LD_1VXVY_4cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY]> {
let Latency = 4;
@@ -350,18 +365,17 @@ def FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD,
let NumMicroOps = 9;
}
-// Forwarding logic is modeled for vector multiply and accumulate
+// Forwarding logic is modeled for multiply add/accumulate.
// -----------------------------------------------------------------------------
-def FalkorReadVMA : SchedReadAdvance<2, [FalkorWr_1VXVY_4cyc,
- FalkorWr_2VXVY_4cyc]>;
-def FalkorReadFMA : SchedReadAdvance<3, [FalkorWr_1VXVY_5cyc,
- FalkorWr_1VXVY_6cyc,
- FalkorWr_2VXVY_5cyc,
- FalkorWr_2VXVY_6cyc]>;
+def FalkorReadIMA32 : SchedReadAdvance<3, [FalkorWr_IMUL32_1X_2cyc]>;
+def FalkorReadIMA64 : SchedReadAdvance<4, [FalkorWr_IMUL64_1X_4cyc, FalkorWr_IMUL64_1X_5cyc]>;
+def FalkorReadVMA : SchedReadAdvance<3, [FalkorWr_VMUL32_1VXVY_4cyc, FalkorWr_VMUL32_2VXVY_4cyc]>;
+def FalkorReadFMA32 : SchedReadAdvance<1, [FalkorWr_FMUL32_1VXVY_5cyc, FalkorWr_FMUL32_2VXVY_5cyc]>;
+def FalkorReadFMA64 : SchedReadAdvance<2, [FalkorWr_FMUL64_1VXVY_6cyc, FalkorWr_FMUL64_2VXVY_6cyc]>;
// SchedPredicates and WriteVariants for Immediate Zero and LSLFast
// -----------------------------------------------------------------------------
-def FalkorImmZPred : SchedPredicate<[{TII->isGPRZero(*MI)}]>;
+def FalkorImmZPred : SchedPredicate<[{MI->getOperand(1).getImm() == 0}]>;
def FalkorLSLFastPred : SchedPredicate<[{TII->isFalkorLSLFast(*MI)}]>;
def FalkorWr_FMOV : SchedWriteVariant<[
@@ -378,7 +392,6 @@ def FalkorWr_LDR : SchedWriteVariant<[
def FalkorWr_ADD : SchedWriteVariant<[
SchedVar<FalkorLSLFastPred, [FalkorWr_1XYZ_1cyc]>,
- SchedVar<FalkorImmZPred, [FalkorWr_1XYZ_1cyc]>,
SchedVar<NoSchedPred, [FalkorWr_2XYZ_2cyc]>]>;
def FalkorWr_PRFM : SchedWriteVariant<[
diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp
index abdeac019a18..1c81d34014fd 100644
--- a/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -91,6 +91,8 @@ void AArch64Subtarget::initializeProperties() {
case Falkor:
MaxInterleaveFactor = 4;
VectorInsertExtractBaseCost = 2;
+ // FIXME: remove this to enable 64-bit SLP if performance looks good.
+ MinVectorRegisterBitWidth = 128;
break;
case Kryo:
MaxInterleaveFactor = 4;
@@ -99,6 +101,8 @@ void AArch64Subtarget::initializeProperties() {
PrefetchDistance = 740;
MinPrefetchStride = 1024;
MaxPrefetchIterationsAhead = 11;
+ // FIXME: remove this to enable 64-bit SLP if performance looks good.
+ MinVectorRegisterBitWidth = 128;
break;
case ThunderX2T99:
CacheLineSize = 64;
@@ -108,6 +112,8 @@ void AArch64Subtarget::initializeProperties() {
PrefetchDistance = 128;
MinPrefetchStride = 1024;
MaxPrefetchIterationsAhead = 4;
+ // FIXME: remove this to enable 64-bit SLP if performance looks good.
+ MinVectorRegisterBitWidth = 128;
break;
case ThunderX:
case ThunderXT88:
@@ -116,6 +122,8 @@ void AArch64Subtarget::initializeProperties() {
CacheLineSize = 128;
PrefFunctionAlignment = 3;
PrefLoopAlignment = 2;
+ // FIXME: remove this to enable 64-bit SLP if performance looks good.
+ MinVectorRegisterBitWidth = 128;
break;
case CortexA35: break;
case CortexA53: break;
diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h
index 5b9bee6e41b8..df54bf3f48e1 100644
--- a/lib/Target/AArch64/AArch64Subtarget.h
+++ b/lib/Target/AArch64/AArch64Subtarget.h
@@ -83,6 +83,9 @@ protected:
// NegativeImmediates - transform instructions with negative immediates
bool NegativeImmediates = true;
+ // Enable 64-bit vectorization in SLP.
+ unsigned MinVectorRegisterBitWidth = 64;
+
bool UseAA = false;
bool PredictableSelectIsExpensive = false;
bool BalanceFPOps = false;
@@ -106,6 +109,7 @@ protected:
unsigned PrefFunctionAlignment = 0;
unsigned PrefLoopAlignment = 0;
unsigned MaxJumpTableSize = 0;
+ unsigned WideningBaseCost = 0;
// ReserveX18 - X18 is not available as a general purpose register.
bool ReserveX18;
@@ -190,6 +194,10 @@ public:
bool isXRaySupported() const override { return true; }
+ unsigned getMinVectorRegisterBitWidth() const {
+ return MinVectorRegisterBitWidth;
+ }
+
bool isX18Reserved() const { return ReserveX18; }
bool hasFPARMv8() const { return HasFPARMv8; }
bool hasNEON() const { return HasNEON; }
@@ -228,6 +236,8 @@ public:
unsigned getMaximumJumpTableSize() const { return MaxJumpTableSize; }
+ unsigned getWideningBaseCost() const { return WideningBaseCost; }
+
/// CPU has TBI (top byte of addresses is ignored during HW address
/// translation) and OS enables it.
bool supportsAddressTopByteIgnored() const;
diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/lib/Target/AArch64/AArch64TargetObjectFile.cpp
index 8875f9b72647..12a2e9a867f0 100644
--- a/lib/Target/AArch64/AArch64TargetObjectFile.cpp
+++ b/lib/Target/AArch64/AArch64TargetObjectFile.cpp
@@ -70,3 +70,11 @@ const MCExpr *AArch64_MachoTargetObjectFile::getIndirectSymViaGOTPCRel(
const MCExpr *PC = MCSymbolRefExpr::create(PCSym, getContext());
return MCBinaryExpr::createSub(Res, PC, getContext());
}
+
+void AArch64_MachoTargetObjectFile::getNameWithPrefix(
+ SmallVectorImpl<char> &OutName, const GlobalValue *GV,
+ const TargetMachine &TM) const {
+  // AArch64 does not use section-relative relocations, so any global symbol
+ // be accessed via at least a linker-private symbol.
+ getMangler().getNameWithPrefix(OutName, GV, /* CannotUsePrivateLabel */ true);
+}
diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.h b/lib/Target/AArch64/AArch64TargetObjectFile.h
index 05e1dfa9e6c9..47e3bce43f6e 100644
--- a/lib/Target/AArch64/AArch64TargetObjectFile.h
+++ b/lib/Target/AArch64/AArch64TargetObjectFile.h
@@ -40,6 +40,9 @@ public:
const MCValue &MV, int64_t Offset,
MachineModuleInfo *MMI,
MCStreamer &Streamer) const override;
+
+ void getNameWithPrefix(SmallVectorImpl<char> &OutName, const GlobalValue *GV,
+ const TargetMachine &TM) const override;
};
} // end namespace llvm
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 4d59da0c646d..7c6f55c06bce 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -176,11 +176,95 @@ AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
return TTI::PSK_Software;
}
+bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
+ ArrayRef<const Value *> Args) {
+
+ // A helper that returns a vector type from the given type. The number of
+ // elements in type Ty determine the vector width.
+ auto toVectorTy = [&](Type *ArgTy) {
+ return VectorType::get(ArgTy->getScalarType(),
+ DstTy->getVectorNumElements());
+ };
+
+ // Exit early if DstTy is not a vector type whose elements are at least
+ // 16-bits wide.
+ if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16)
+ return false;
+
+ // Determine if the operation has a widening variant. We consider both the
+ // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
+ // instructions.
+ //
+ // TODO: Add additional widening operations (e.g., mul, shl, etc.) once we
+ // verify that their extending operands are eliminated during code
+ // generation.
+ switch (Opcode) {
+ case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
+ case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
+ break;
+ default:
+ return false;
+ }
+
+ // To be a widening instruction (either the "wide" or "long" versions), the
+ // second operand must be a sign- or zero extend having a single user. We
+ // only consider extends having a single user because they may otherwise not
+ // be eliminated.
+ if (Args.size() != 2 ||
+ (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])) ||
+ !Args[1]->hasOneUse())
+ return false;
+ auto *Extend = cast<CastInst>(Args[1]);
+
+ // Legalize the destination type and ensure it can be used in a widening
+ // operation.
+ auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy);
+ unsigned DstElTySize = DstTyL.second.getScalarSizeInBits();
+ if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits())
+ return false;
+
+ // Legalize the source type and ensure it can be used in a widening
+ // operation.
+ Type *SrcTy = toVectorTy(Extend->getSrcTy());
+ auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy);
+ unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
+ if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
+ return false;
+
+ // Get the total number of vector elements in the legalized types.
+ unsigned NumDstEls = DstTyL.first * DstTyL.second.getVectorNumElements();
+ unsigned NumSrcEls = SrcTyL.first * SrcTyL.second.getVectorNumElements();
+
+ // Return true if the legalized types have the same number of vector elements
+ // and the destination element type size is twice that of the source type.
+ return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize;
+}
+
int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
+ // If the cast is observable, and it is used by a widening instruction (e.g.,
+ // uaddl, saddw, etc.), it may be free.
+ if (I && I->hasOneUse()) {
+ auto *SingleUser = cast<Instruction>(*I->user_begin());
+ SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
+ if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) {
+ // If the cast is the second operand, it is free. We will generate either
+ // a "wide" or "long" version of the widening instruction.
+ if (I == SingleUser->getOperand(1))
+ return 0;
+ // If the cast is not the second operand, it will be free if it looks the
+ // same as the second operand. In this case, we will generate a "long"
+ // version of the widening instruction.
+ if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1)))
+ if (I->getOpcode() == Cast->getOpcode() &&
+ cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())
+ return 0;
+ }
+ }
+
EVT SrcTy = TLI->getValueType(DL, Src);
EVT DstTy = TLI->getValueType(DL, Dst);
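Concretely, the cast is considered free when its only user is an add or sub that AArch64 can select as a long/wide operation (uaddl, saddw, and friends). A C-level loop that lowers this way once vectorized (a sketch; the exact selection depends on the vectorizer):

    #include <cstdint>

    // Each element is zero-extended from 16 to 32 bits and then added; the
    // extend folds into a UADDW/UADDL-style widening add, so the cost model
    // above charges the zext nothing.
    void widenedAdd(uint32_t *Dst, const uint16_t *Src, int N) {
      for (int i = 0; i < N; ++i)
        Dst[i] += Src[i]; // zext i16 -> i32 feeding the add
    }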
@@ -379,6 +463,16 @@ int AArch64TTIImpl::getArithmeticInstrCost(
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ // If the instruction is a widening instruction (e.g., uaddl, saddw, etc.),
+ // add in the widening overhead specified by the sub-target. Since the
+ // extends feeding widening instructions are performed automatically, they
+ // aren't present in the generated code and have a zero cost. By adding a
+ // widening overhead here, we attach the total cost of the combined operation
+ // to the widening instruction.
+ int Cost = 0;
+ if (isWideningInstruction(Ty, Opcode, Args))
+ Cost += ST->getWideningBaseCost();
+
int ISD = TLI->InstructionOpcodeToISD(Opcode);
if (ISD == ISD::SDIV &&
@@ -388,9 +482,9 @@ int AArch64TTIImpl::getArithmeticInstrCost(
// normally expanded to the sequence ADD + CMP + SELECT + SRA.
// The OperandValue properties may not be the same as those of the previous
// operation; conservatively assume OP_None.
- int Cost = getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
- TargetTransformInfo::OP_None,
- TargetTransformInfo::OP_None);
+ Cost += getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
@@ -405,8 +499,8 @@ int AArch64TTIImpl::getArithmeticInstrCost(
switch (ISD) {
default:
- return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
- Opd1PropInfo, Opd2PropInfo);
+ return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+ Opd1PropInfo, Opd2PropInfo);
case ISD::ADD:
case ISD::MUL:
case ISD::XOR:
@@ -414,7 +508,7 @@ int AArch64TTIImpl::getArithmeticInstrCost(
case ISD::AND:
// These nodes are marked as 'custom' for combining purposes only.
// We know that they are legal. See LowerAdd in ISelLowering.
- return 1 * LT.first;
+ return (Cost + 1) * LT.first;
}
}
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h
index e37c003e064c..280d97f3c502 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -43,6 +43,9 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
VECTOR_LDST_FOUR_ELEMENTS
};
+ bool isWideningInstruction(Type *Ty, unsigned Opcode,
+ ArrayRef<const Value *> Args);
+
public:
explicit AArch64TTIImpl(const AArch64TargetMachine *TM, const Function &F)
: BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
@@ -84,6 +87,10 @@ public:
return 64;
}
+ unsigned getMinVectorRegisterBitWidth() {
+ return ST->getMinVectorRegisterBitWidth();
+ }
+
unsigned getMaxInterleaveFactor(unsigned VF);
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
@@ -134,6 +141,10 @@ public:
unsigned getMinPrefetchStride();
unsigned getMaxPrefetchIterationsAhead();
+
+ bool shouldExpandReduction(const IntrinsicInst *II) const {
+ return false;
+ }
/// @}
};
diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 4dbcc9581a84..449d732a8d44 100644
--- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -3904,10 +3904,14 @@ bool AArch64AsmParser::parseDirectiveArch(SMLoc L) {
return false;
}
+static SMLoc incrementLoc(SMLoc L, int Offset) {
+ return SMLoc::getFromPointer(L.getPointer() + Offset);
+}
+
/// parseDirectiveCPU
/// ::= .cpu id
bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) {
- SMLoc CPULoc = getLoc();
+ SMLoc CurLoc = getLoc();
StringRef CPU, ExtensionString;
std::tie(CPU, ExtensionString) =
@@ -3923,15 +3927,19 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) {
// FIXME This is using tablegen data, but should be moved to ARMTargetParser
// once that is tablegen'ed
if (!getSTI().isCPUStringValid(CPU)) {
- Error(CPULoc, "unknown CPU name");
+ Error(CurLoc, "unknown CPU name");
return false;
}
MCSubtargetInfo &STI = copySTI();
STI.setDefaultFeatures(CPU, "");
+ CurLoc = incrementLoc(CurLoc, CPU.size());
FeatureBitset Features = STI.getFeatureBits();
for (auto Name : RequestedExtensions) {
+ // Advance source location past '+'.
+ CurLoc = incrementLoc(CurLoc, 1);
+
bool EnableFeature = true;
if (Name.startswith_lower("no")) {
@@ -3939,6 +3947,7 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) {
Name = Name.substr(2);
}
+ bool FoundExtension = false;
for (const auto &Extension : ExtensionMap) {
if (Extension.Name != Name)
continue;
@@ -3952,9 +3961,15 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) {
uint64_t Features =
ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures));
setAvailableFeatures(Features);
+ FoundExtension = true;
break;
}
+
+ if (!FoundExtension)
+ Error(CurLoc, "unsupported architectural extension");
+
+ CurLoc = incrementLoc(CurLoc, Name.size());
}
return false;
}
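With the location advanced past the CPU name and each '+', a bad extension in, say, `.cpu generic+bogus` is now reported at `bogus` rather than at the start of the operand. The pointer walk, as a standalone sketch:

    #include <cstring>

    // Walk "cpu+ext1+ext2...": skip the CPU name, then visit each extension,
    // tracking its offset the way incrementLoc advances the SMLoc.
    void visitExtensions(const char *Operand) {
      const char *Cur = Operand + strcspn(Operand, "+"); // past the CPU name
      while (*Cur == '+') {
        ++Cur;                          // past '+'
        size_t Len = strcspn(Cur, "+"); // extension name length
        // A diagnostic for this extension points at offset Cur - Operand.
        Cur += Len;
      }
    }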
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index 94112849f84e..1b28df963b40 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -32,8 +32,9 @@ static cl::opt<AsmWriterVariantTy> AsmWriterVariant(
clEnumValN(Apple, "apple", "Emit Apple-style NEON assembly")));
AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin() {
- // We prefer NEON instructions to be printed in the short form.
- AssemblerDialect = AsmWriterVariant == Default ? 1 : AsmWriterVariant;
+ // We prefer NEON instructions to be printed in the short, Apple-specific
+ // form when targeting Darwin.
+ AssemblerDialect = AsmWriterVariant == Default ? Apple : AsmWriterVariant;
PrivateGlobalPrefix = "L";
PrivateLabelPrefix = "L";
@@ -68,8 +69,9 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) {
if (T.getArch() == Triple::aarch64_be)
IsLittleEndian = false;
- // We prefer NEON instructions to be printed in the short form.
- AssemblerDialect = AsmWriterVariant == Default ? 0 : AsmWriterVariant;
+ // We prefer NEON instructions to be printed in the generic form when
+ // targeting ELF.
+ AssemblerDialect = AsmWriterVariant == Default ? Generic : AsmWriterVariant;
CodePointerSize = 8;
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
index 8f6e1e7d8846..3f89702bed50 100644
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -50,6 +50,10 @@ FunctionPass *createSIDebuggerInsertNopsPass();
FunctionPass *createSIInsertWaitsPass();
FunctionPass *createSIInsertWaitcntsPass();
FunctionPass *createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM = nullptr);
+FunctionPass *createAMDGPUMachineCFGStructurizerPass();
+
+void initializeAMDGPUMachineCFGStructurizerPass(PassRegistry&);
+extern char &AMDGPUMachineCFGStructurizerID;
ModulePass *createAMDGPUAnnotateKernelFeaturesPass(const TargetMachine *TM = nullptr);
void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index 2e5b78bbf7ef..b279bd61e180 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -61,6 +61,24 @@ def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
"Support flat address space"
>;
+def FeatureFlatInstOffsets : SubtargetFeature<"flat-inst-offsets",
+ "FlatInstOffsets",
+ "true",
+ "Flat instructions have immediate offset addressing mode"
+>;
+
+def FeatureFlatGlobalInsts : SubtargetFeature<"flat-global-insts",
+ "FlatGlobalInsts",
+ "true",
+ "Have global_* flat memory instructions"
+>;
+
+def FeatureFlatScratchInsts : SubtargetFeature<"flat-scratch-insts",
+ "FlatScratchInsts",
+ "true",
+ "Have scratch_* flat memory instructions"
+>;
+
def FeatureUnalignedBufferAccess : SubtargetFeature<"unaligned-buffer-access",
"UnalignedBufferAccess",
"true",
@@ -407,7 +425,8 @@ def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9",
FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm,
FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode,
- FeatureFastFMAF32, FeatureDPP
+ FeatureFastFMAF32, FeatureDPP,
+ FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts
]
>;
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index ccae36ced1f8..7c99752b881f 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -136,8 +136,7 @@ private:
bool SelectMUBUFIntrinsicVOffset(SDValue Offset, SDValue &SOffset,
SDValue &ImmOffset, SDValue &VOffset) const;
- bool SelectFlat(SDValue Addr, SDValue &VAddr,
- SDValue &SLC, SDValue &TFE) const;
+ bool SelectFlat(SDValue Addr, SDValue &VAddr, SDValue &SLC) const;
bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
bool &Imm) const;
@@ -1278,10 +1277,9 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicVOffset(SDValue Offset,
bool AMDGPUDAGToDAGISel::SelectFlat(SDValue Addr,
SDValue &VAddr,
- SDValue &SLC,
- SDValue &TFE) const {
+ SDValue &SLC) const {
VAddr = Addr;
- TFE = SLC = CurDAG->getTargetConstant(0, SDLoc(), MVT::i1);
+ SLC = CurDAG->getTargetConstant(0, SDLoc(), MVT::i1);
return true;
}
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 915d1d9e0e68..f80652b87373 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -567,13 +567,19 @@ static bool hasSourceMods(const SDNode *N) {
case AMDGPUISD::INTERP_P1:
case AMDGPUISD::INTERP_P2:
case AMDGPUISD::DIV_SCALE:
+
+ // TODO: Should really be looking at the users of the bitcast. These are
+ // problematic because bitcasts are used to legalize all stores to integer
+ // types.
+ case ISD::BITCAST:
return false;
default:
return true;
}
}
-static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold = 4) {
+bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
+ unsigned CostThreshold) {
// Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
// it is truly free to use a source modifier in all cases. If there are
// multiple users, but each one will necessitate using VOP3, there will be
@@ -2299,7 +2305,7 @@ static bool isU24(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
DAG.computeKnownBits(Op, Known);
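+ // countMinLeadingZeros() returns Known.Zero.countLeadingOnes(), i.e. the
+ // number of high bits known to be zero, so this checks that the value
+ // provably fits in 24 bits.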
- return (VT.getSizeInBits() - Known.Zero.countLeadingOnes()) <= 24;
+ return (VT.getSizeInBits() - Known.countMinLeadingZeros()) <= 24;
}
static bool isI24(SDValue Op, SelectionDAG &DAG) {
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index e1a5a2072418..4c588a7bafd0 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -132,6 +132,8 @@ public:
return false;
}
+ static bool allUsesHaveSourceMods(const SDNode *N,
+ unsigned CostThreshold = 4);
bool isFAbsFree(EVT VT) const override;
bool isFNegFree(EVT VT) const override;
bool isTruncateFree(EVT Src, EVT Dest) const override;
diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 8867ed689a31..a7eac080f885 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -127,9 +127,9 @@ bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const {
.add(I.getOperand(1))
.add(I.getOperand(0))
.addImm(0)
- .addImm(0)
.addImm(0);
+
// Now that we selected an opcode, we need to constrain the register
// operands to use appropriate classes.
bool Ret = constrainSelectedInstRegOperands(*Flat, TII, TRI, RBI);
@@ -393,7 +393,6 @@ bool AMDGPUInstructionSelector::selectG_LOAD(MachineInstr &I) const {
.add(I.getOperand(0))
.addReg(PtrReg)
.addImm(0)
- .addImm(0)
.addImm(0);
bool Ret = constrainSelectedInstRegOperands(*Flat, TII, TRI, RBI);
diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index a2567a549028..9de302994e68 100644
--- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -33,6 +33,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo() {
const LLT P1 = LLT::pointer(1, 64);
const LLT P2 = LLT::pointer(2, 64);
+ setAction({G_CONSTANT, S32}, Legal);
setAction({G_CONSTANT, S64}, Legal);
setAction({G_GEP, P1}, Legal);
diff --git a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
new file mode 100644
index 000000000000..6d2785ba1c60
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
@@ -0,0 +1,2881 @@
+//===- AMDGPUMachineCFGStructurizer.cpp - Machine CFG structurizer pass --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the machine instruction level CFG structurizer pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "SIInstrInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegionInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <tuple>
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpucfgstructurizer"
+
+namespace {
+class PHILinearizeDestIterator;
+
+class PHILinearize {
+ friend class PHILinearizeDestIterator;
+
+public:
+ typedef std::pair<unsigned, MachineBasicBlock *> PHISourceT;
+
+private:
+ typedef DenseSet<PHISourceT> PHISourcesT;
+ typedef struct {
+ unsigned DestReg;
+ DebugLoc DL;
+ PHISourcesT Sources;
+ } PHIInfoElementT;
+ typedef SmallPtrSet<PHIInfoElementT *, 2> PHIInfoT;
+ PHIInfoT PHIInfo;
+
+ static unsigned phiInfoElementGetDest(PHIInfoElementT *Info);
+ static void phiInfoElementSetDef(PHIInfoElementT *Info, unsigned NewDef);
+ static PHISourcesT &phiInfoElementGetSources(PHIInfoElementT *Info);
+ static void phiInfoElementAddSource(PHIInfoElementT *Info, unsigned SourceReg,
+ MachineBasicBlock *SourceMBB);
+ static void phiInfoElementRemoveSource(PHIInfoElementT *Info,
+ unsigned SourceReg,
+ MachineBasicBlock *SourceMBB);
+ PHIInfoElementT *findPHIInfoElement(unsigned DestReg);
+ PHIInfoElementT *findPHIInfoElementFromSource(unsigned SourceReg,
+ MachineBasicBlock *SourceMBB);
+
+public:
+ bool findSourcesFromMBB(MachineBasicBlock *SourceMBB,
+ SmallVector<unsigned, 4> &Sources);
+ void addDest(unsigned DestReg, const DebugLoc &DL);
+ void replaceDef(unsigned OldDestReg, unsigned NewDestReg);
+ void deleteDef(unsigned DestReg);
+ void addSource(unsigned DestReg, unsigned SourceReg,
+ MachineBasicBlock *SourceMBB);
+ void removeSource(unsigned DestReg, unsigned SourceReg,
+ MachineBasicBlock *SourceMBB = nullptr);
+ bool findDest(unsigned SourceReg, MachineBasicBlock *SourceMBB,
+ unsigned &DestReg);
+ bool isSource(unsigned Reg, MachineBasicBlock *SourceMBB = nullptr);
+ unsigned getNumSources(unsigned DestReg);
+ void dump(MachineRegisterInfo *MRI);
+ void clear();
+
+ typedef PHISourcesT::iterator source_iterator;
+ typedef PHILinearizeDestIterator dest_iterator;
+
+ dest_iterator dests_begin();
+ dest_iterator dests_end();
+
+ source_iterator sources_begin(unsigned Reg);
+ source_iterator sources_end(unsigned Reg);
+};
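+
+// PHILinearize is a side table mapping each PHI destination register to
+// the set of (source register, predecessor block) pairs feeding it. The
+// structurizer deletes PHIs while linearizing the CFG and uses this
+// information to rebuild them once the final block layout is known.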
+
+class PHILinearizeDestIterator {
+private:
+ PHILinearize::PHIInfoT::iterator Iter;
+
+public:
+ unsigned operator*() { return PHILinearize::phiInfoElementGetDest(*Iter); }
+ PHILinearizeDestIterator &operator++() {
+ ++Iter;
+ return *this;
+ }
+ bool operator==(const PHILinearizeDestIterator &I) const {
+ return I.Iter == Iter;
+ }
+ bool operator!=(const PHILinearizeDestIterator &I) const {
+ return I.Iter != Iter;
+ }
+
+ PHILinearizeDestIterator(PHILinearize::PHIInfoT::iterator I) : Iter(I) {}
+};
+
+unsigned PHILinearize::phiInfoElementGetDest(PHIInfoElementT *Info) {
+ return Info->DestReg;
+}
+
+void PHILinearize::phiInfoElementSetDef(PHIInfoElementT *Info,
+ unsigned NewDef) {
+ Info->DestReg = NewDef;
+}
+
+PHILinearize::PHISourcesT &
+PHILinearize::phiInfoElementGetSources(PHIInfoElementT *Info) {
+ return Info->Sources;
+}
+
+void PHILinearize::phiInfoElementAddSource(PHIInfoElementT *Info,
+ unsigned SourceReg,
+ MachineBasicBlock *SourceMBB) {
+ // The assertion ensures that we do not add two different source
+ // registers for the same SourceMBB: a PHI cannot take different
+ // registers from identical predecessors, but it can take the same
+ // register from multiple predecessors.
+#if !defined(NDEBUG)
+ for (auto SI : phiInfoElementGetSources(Info)) {
+ assert((SI.second != SourceMBB || SourceReg == SI.first));
+ }
+#endif
+
+ phiInfoElementGetSources(Info).insert(PHISourceT(SourceReg, SourceMBB));
+}
+
+void PHILinearize::phiInfoElementRemoveSource(PHIInfoElementT *Info,
+ unsigned SourceReg,
+ MachineBasicBlock *SourceMBB) {
+ auto &Sources = phiInfoElementGetSources(Info);
+ SmallVector<PHISourceT, 4> EliminatedSources;
+ for (auto SI : Sources) {
+ if (SI.first == SourceReg &&
+ (SI.second == nullptr || SI.second == SourceMBB)) {
+ EliminatedSources.push_back(PHISourceT(SI.first, SI.second));
+ }
+ }
+
+ for (auto &Source : EliminatedSources) {
+ Sources.erase(Source);
+ }
+}
+
+PHILinearize::PHIInfoElementT *
+PHILinearize::findPHIInfoElement(unsigned DestReg) {
+ for (auto I : PHIInfo) {
+ if (phiInfoElementGetDest(I) == DestReg) {
+ return I;
+ }
+ }
+ return nullptr;
+}
+
+PHILinearize::PHIInfoElementT *
+PHILinearize::findPHIInfoElementFromSource(unsigned SourceReg,
+ MachineBasicBlock *SourceMBB) {
+ for (auto I : PHIInfo) {
+ for (auto SI : phiInfoElementGetSources(I)) {
+ if (SI.first == SourceReg &&
+ (SI.second == nullptr || SI.second == SourceMBB)) {
+ return I;
+ }
+ }
+ }
+ return nullptr;
+}
+
+bool PHILinearize::findSourcesFromMBB(MachineBasicBlock *SourceMBB,
+ SmallVector<unsigned, 4> &Sources) {
+ bool FoundSource = false;
+ for (auto I : PHIInfo) {
+ for (auto SI : phiInfoElementGetSources(I)) {
+ if (SI.second == SourceMBB) {
+ FoundSource = true;
+ Sources.push_back(SI.first);
+ }
+ }
+ }
+ return FoundSource;
+}
+
+void PHILinearize::addDest(unsigned DestReg, const DebugLoc &DL) {
+ assert(findPHIInfoElement(DestReg) == nullptr && "Dest already exists");
+ PHISourcesT EmptySet;
+ PHIInfoElementT *NewElement = new PHIInfoElementT();
+ NewElement->DestReg = DestReg;
+ NewElement->DL = DL;
+ NewElement->Sources = EmptySet;
+ PHIInfo.insert(NewElement);
+}
+
+void PHILinearize::replaceDef(unsigned OldDestReg, unsigned NewDestReg) {
+ phiInfoElementSetDef(findPHIInfoElement(OldDestReg), NewDestReg);
+}
+
+void PHILinearize::deleteDef(unsigned DestReg) {
+ PHIInfoElementT *InfoElement = findPHIInfoElement(DestReg);
+ PHIInfo.erase(InfoElement);
+ delete InfoElement;
+}
+
+void PHILinearize::addSource(unsigned DestReg, unsigned SourceReg,
+ MachineBasicBlock *SourceMBB) {
+ phiInfoElementAddSource(findPHIInfoElement(DestReg), SourceReg, SourceMBB);
+}
+
+void PHILinearize::removeSource(unsigned DestReg, unsigned SourceReg,
+ MachineBasicBlock *SourceMBB) {
+ phiInfoElementRemoveSource(findPHIInfoElement(DestReg), SourceReg, SourceMBB);
+}
+
+bool PHILinearize::findDest(unsigned SourceReg, MachineBasicBlock *SourceMBB,
+ unsigned &DestReg) {
+ PHIInfoElementT *InfoElement =
+ findPHIInfoElementFromSource(SourceReg, SourceMBB);
+ if (InfoElement != nullptr) {
+ DestReg = phiInfoElementGetDest(InfoElement);
+ return true;
+ }
+ return false;
+}
+
+bool PHILinearize::isSource(unsigned Reg, MachineBasicBlock *SourceMBB) {
+ unsigned DestReg;
+ return findDest(Reg, SourceMBB, DestReg);
+}
+
+unsigned PHILinearize::getNumSources(unsigned DestReg) {
+ return phiInfoElementGetSources(findPHIInfoElement(DestReg)).size();
+}
+
+void PHILinearize::dump(MachineRegisterInfo *MRI) {
+ const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo();
+ dbgs() << "=PHIInfo Start=\n";
+ for (auto PII : this->PHIInfo) {
+ PHIInfoElementT &Element = *PII;
+ dbgs() << "Dest: " << PrintReg(Element.DestReg, TRI)
+ << " Sources: {";
+ for (auto &SI : Element.Sources) {
+ dbgs() << PrintReg(SI.first, TRI) << "(BB#"
+ << SI.second->getNumber() << "),";
+ }
+ dbgs() << "}\n";
+ }
+ dbgs() << "=PHIInfo End=\n";
+}
+
+void PHILinearize::clear() { PHIInfo = PHIInfoT(); }
+
+PHILinearize::dest_iterator PHILinearize::dests_begin() {
+ return PHILinearizeDestIterator(PHIInfo.begin());
+}
+
+PHILinearize::dest_iterator PHILinearize::dests_end() {
+ return PHILinearizeDestIterator(PHIInfo.end());
+}
+
+PHILinearize::source_iterator PHILinearize::sources_begin(unsigned Reg) {
+ auto InfoElement = findPHIInfoElement(Reg);
+ return phiInfoElementGetSources(InfoElement).begin();
+}
+PHILinearize::source_iterator PHILinearize::sources_end(unsigned Reg) {
+ auto InfoElement = findPHIInfoElement(Reg);
+ return phiInfoElementGetSources(InfoElement).end();
+}
+
+class RegionMRT;
+class MBBMRT;
+
+static unsigned getPHINumInputs(MachineInstr &PHI) {
+ assert(PHI.isPHI());
+ return (PHI.getNumOperands() - 1) / 2;
+}
+
+static MachineBasicBlock *getPHIPred(MachineInstr &PHI, unsigned Index) {
+ assert(PHI.isPHI());
+ return PHI.getOperand(Index * 2 + 2).getMBB();
+}
+
+static void setPhiPred(MachineInstr &PHI, unsigned Index,
+ MachineBasicBlock *NewPred) {
+ PHI.getOperand(Index * 2 + 2).setMBB(NewPred);
+}
+
+static unsigned getPHISourceReg(MachineInstr &PHI, unsigned Index) {
+ assert(PHI.isPHI());
+ return PHI.getOperand(Index * 2 + 1).getReg();
+}
+
+static unsigned getPHIDestReg(MachineInstr &PHI) {
+ assert(PHI.isPHI());
+ return PHI.getOperand(0).getReg();
+}
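+
+// The helpers above assume the machine-level PHI operand layout
+//   %dst = PHI %src0, <bb#0>, %src1, <bb#1>, ...
+// where operand 0 is the destination, operand 2*i+1 is the i-th source
+// register, and operand 2*i+2 is the i-th predecessor block; hence the
+// Index * 2 + 1 / Index * 2 + 2 indexing.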
+
+class LinearizedRegion {
+protected:
+ MachineBasicBlock *Entry;
+ // The exit block is part of the region, and is the last
+ // merge block before exiting the region.
+ MachineBasicBlock *Exit;
+ DenseSet<unsigned> LiveOuts;
+ SmallPtrSet<MachineBasicBlock *, 1> MBBs;
+ bool HasLoop;
+ LinearizedRegion *Parent;
+ RegionMRT *RMRT;
+
+ void storeLiveOutReg(MachineBasicBlock *MBB, unsigned Reg,
+ MachineInstr *DefInstr, const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI, PHILinearize &PHIInfo);
+
+ void storeLiveOutRegRegion(RegionMRT *Region, unsigned Reg,
+ MachineInstr *DefInstr,
+ const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI,
+ PHILinearize &PHIInfo);
+
+ void storeMBBLiveOuts(MachineBasicBlock *MBB, const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI, PHILinearize &PHIInfo,
+ RegionMRT *TopRegion);
+
+ void storeLiveOuts(MachineBasicBlock *MBB, const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI, PHILinearize &PHIInfo);
+
+ void storeLiveOuts(RegionMRT *Region, const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI, PHILinearize &PHIInfo,
+ RegionMRT *TopRegion = nullptr);
+
+public:
+ void setRegionMRT(RegionMRT *Region) { RMRT = Region; }
+
+ RegionMRT *getRegionMRT() { return RMRT; }
+
+ void setParent(LinearizedRegion *P) { Parent = P; }
+
+ LinearizedRegion *getParent() { return Parent; }
+
+ void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr);
+
+ void setBBSelectRegIn(unsigned Reg);
+
+ unsigned getBBSelectRegIn();
+
+ void setBBSelectRegOut(unsigned Reg, bool IsLiveOut);
+
+ unsigned getBBSelectRegOut();
+
+ void setHasLoop(bool Value);
+
+ bool getHasLoop();
+
+ void addLiveOut(unsigned VReg);
+
+ void removeLiveOut(unsigned Reg);
+
+ void replaceLiveOut(unsigned OldReg, unsigned NewReg);
+
+ void replaceRegister(unsigned Register, unsigned NewRegister,
+ MachineRegisterInfo *MRI, bool ReplaceInside,
+ bool ReplaceOutside, bool IncludeLoopPHIs);
+
+ void replaceRegisterInsideRegion(unsigned Register, unsigned NewRegister,
+ bool IncludeLoopPHIs,
+ MachineRegisterInfo *MRI);
+
+ void replaceRegisterOutsideRegion(unsigned Register, unsigned NewRegister,
+ bool IncludeLoopPHIs,
+ MachineRegisterInfo *MRI);
+
+ DenseSet<unsigned> *getLiveOuts();
+
+ void setEntry(MachineBasicBlock *NewEntry);
+
+ MachineBasicBlock *getEntry();
+
+ void setExit(MachineBasicBlock *NewExit);
+
+ MachineBasicBlock *getExit();
+
+ void addMBB(MachineBasicBlock *MBB);
+
+ void addMBBs(LinearizedRegion *InnerRegion);
+
+ bool contains(MachineBasicBlock *MBB);
+
+ bool isLiveOut(unsigned Reg);
+
+ bool hasNoDef(unsigned Reg, MachineRegisterInfo *MRI);
+
+ void removeFalseRegisterKills(MachineRegisterInfo *MRI);
+
+ void initLiveOut(RegionMRT *Region, const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI, PHILinearize &PHIInfo);
+
+ LinearizedRegion(MachineBasicBlock *MBB, const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI, PHILinearize &PHIInfo);
+
+ LinearizedRegion();
+
+ ~LinearizedRegion();
+};
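+
+// A LinearizedRegion captures a region during and after linearization:
+// the blocks it covers, its entry and exit blocks, the registers that
+// are live out of it, and a link to its parent so live-out changes can
+// be propagated to every enclosing region.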
+
+class MRT {
+protected:
+ RegionMRT *Parent;
+ unsigned BBSelectRegIn;
+ unsigned BBSelectRegOut;
+
+public:
+ unsigned getBBSelectRegIn() { return BBSelectRegIn; }
+
+ unsigned getBBSelectRegOut() { return BBSelectRegOut; }
+
+ void setBBSelectRegIn(unsigned Reg) { BBSelectRegIn = Reg; }
+
+ void setBBSelectRegOut(unsigned Reg) { BBSelectRegOut = Reg; }
+
+ virtual RegionMRT *getRegionMRT() { return nullptr; }
+
+ virtual MBBMRT *getMBBMRT() { return nullptr; }
+
+ bool isRegion() { return getRegionMRT() != nullptr; }
+
+ bool isMBB() { return getMBBMRT() != nullptr; }
+
+ bool isRoot() { return Parent == nullptr; }
+
+ void setParent(RegionMRT *Region) { Parent = Region; }
+
+ RegionMRT *getParent() { return Parent; }
+
+ static MachineBasicBlock *
+ initializeMRT(MachineFunction &MF, const MachineRegionInfo *RegionInfo,
+ DenseMap<MachineRegion *, RegionMRT *> &RegionMap);
+
+ static RegionMRT *buildMRT(MachineFunction &MF,
+ const MachineRegionInfo *RegionInfo,
+ const SIInstrInfo *TII,
+ MachineRegisterInfo *MRI);
+
+ virtual void dump(const TargetRegisterInfo *TRI, int depth = 0) = 0;
+
+ void dumpDepth(int depth) {
+ for (int i = depth; i > 0; --i) {
+ dbgs() << " ";
+ }
+ }
+
+ virtual ~MRT() {}
+};
+
+class MBBMRT : public MRT {
+ MachineBasicBlock *MBB;
+
+public:
+ virtual MBBMRT *getMBBMRT() { return this; }
+
+ MachineBasicBlock *getMBB() { return MBB; }
+
+ virtual void dump(const TargetRegisterInfo *TRI, int depth = 0) {
+ dumpDepth(depth);
+ dbgs() << "MBB: " << getMBB()->getNumber();
+ dbgs() << " In: " << PrintReg(getBBSelectRegIn(), TRI);
+ dbgs() << ", Out: " << PrintReg(getBBSelectRegOut(), TRI) << "\n";
+ }
+
+ MBBMRT(MachineBasicBlock *BB) : MBB(BB) {
+ setParent(nullptr);
+ setBBSelectRegOut(0);
+ setBBSelectRegIn(0);
+ }
+};
+
+class RegionMRT : public MRT {
+protected:
+ MachineRegion *Region;
+ LinearizedRegion *LRegion;
+ MachineBasicBlock *Succ;
+
+ SetVector<MRT *> Children;
+
+public:
+ virtual RegionMRT *getRegionMRT() { return this; }
+
+ void setLinearizedRegion(LinearizedRegion *LinearizeRegion) {
+ LRegion = LinearizeRegion;
+ }
+
+ LinearizedRegion *getLinearizedRegion() { return LRegion; }
+
+ MachineRegion *getMachineRegion() { return Region; }
+
+ unsigned getInnerOutputRegister() {
+ return (*(Children.begin()))->getBBSelectRegOut();
+ }
+
+ void addChild(MRT *Tree) { Children.insert(Tree); }
+
+ SetVector<MRT *> *getChildren() { return &Children; }
+
+ virtual void dump(const TargetRegisterInfo *TRI, int depth = 0) {
+ dumpDepth(depth);
+ dbgs() << "Region: " << (void *)Region;
+ dbgs() << " In: " << PrintReg(getBBSelectRegIn(), TRI);
+ dbgs() << ", Out: " << PrintReg(getBBSelectRegOut(), TRI) << "\n";
+
+ dumpDepth(depth);
+ if (getSucc())
+ dbgs() << "Succ: " << getSucc()->getNumber() << "\n";
+ else
+ dbgs() << "Succ: none \n";
+ for (auto MRTI : Children) {
+ MRTI->dump(TRI, depth + 1);
+ }
+ }
+
+ MRT *getEntryTree() { return Children.back(); }
+
+ MRT *getExitTree() { return Children.front(); }
+
+ MachineBasicBlock *getEntry() {
+ MRT *Tree = Children.back();
+ return (Tree->isRegion()) ? Tree->getRegionMRT()->getEntry()
+ : Tree->getMBBMRT()->getMBB();
+ }
+
+ MachineBasicBlock *getExit() {
+ MRT *Tree = Children.front();
+ return (Tree->isRegion()) ? Tree->getRegionMRT()->getExit()
+ : Tree->getMBBMRT()->getMBB();
+ }
+
+ void setSucc(MachineBasicBlock *MBB) { Succ = MBB; }
+
+ MachineBasicBlock *getSucc() { return Succ; }
+
+ bool contains(MachineBasicBlock *MBB) {
+ for (auto CI : Children) {
+ if (CI->isMBB()) {
+ if (MBB == CI->getMBBMRT()->getMBB()) {
+ return true;
+ }
+ } else {
+ if (CI->getRegionMRT()->contains(MBB)) {
+ return true;
+ } else if (CI->getRegionMRT()->getLinearizedRegion() != nullptr &&
+ CI->getRegionMRT()->getLinearizedRegion()->contains(MBB)) {
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+
+ void replaceLiveOutReg(unsigned Register, unsigned NewRegister) {
+ LinearizedRegion *LRegion = getLinearizedRegion();
+ LRegion->replaceLiveOut(Register, NewRegister);
+ for (auto &CI : Children) {
+ if (CI->isRegion()) {
+ CI->getRegionMRT()->replaceLiveOutReg(Register, NewRegister);
+ }
+ }
+ }
+
+ RegionMRT(MachineRegion *MachineRegion)
+ : Region(MachineRegion), LRegion(nullptr), Succ(nullptr) {
+ setParent(nullptr);
+ setBBSelectRegOut(0);
+ setBBSelectRegIn(0);
+ }
+
+ virtual ~RegionMRT() {
+ if (LRegion) {
+ delete LRegion;
+ }
+
+ for (auto CI : Children) {
+ delete &(*CI);
+ }
+ }
+};
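+
+// The MRT is a tree mirroring the MachineRegionInfo structure: RegionMRT
+// nodes stand for machine regions and MBBMRT leaves for basic blocks.
+// Every node carries the virtual registers (BBSelectRegIn/BBSelectRegOut)
+// used to select the active block when control enters and leaves the
+// corresponding node.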
+
+static unsigned createBBSelectReg(const SIInstrInfo *TII,
+ MachineRegisterInfo *MRI) {
+ return MRI->createVirtualRegister(TII->getPreferredSelectRegClass(32));
+}
+
+MachineBasicBlock *
+MRT::initializeMRT(MachineFunction &MF, const MachineRegionInfo *RegionInfo,
+ DenseMap<MachineRegion *, RegionMRT *> &RegionMap) {
+ for (auto &MFI : MF) {
+ MachineBasicBlock *ExitMBB = &MFI;
+ if (ExitMBB->succ_size() == 0) {
+ return ExitMBB;
+ }
+ }
+ llvm_unreachable("CFG has no exit block");
+ return nullptr;
+}
+
+RegionMRT *MRT::buildMRT(MachineFunction &MF,
+ const MachineRegionInfo *RegionInfo,
+ const SIInstrInfo *TII, MachineRegisterInfo *MRI) {
+ SmallPtrSet<MachineRegion *, 4> PlacedRegions;
+ DenseMap<MachineRegion *, RegionMRT *> RegionMap;
+ MachineRegion *TopLevelRegion = RegionInfo->getTopLevelRegion();
+ RegionMRT *Result = new RegionMRT(TopLevelRegion);
+ RegionMap[TopLevelRegion] = Result;
+
+ // Insert the exit block first; we need it to be the merge node
+ // for the top-level region.
+ MachineBasicBlock *Exit = initializeMRT(MF, RegionInfo, RegionMap);
+
+ unsigned BBSelectRegIn = createBBSelectReg(TII, MRI);
+ MBBMRT *ExitMRT = new MBBMRT(Exit);
+ RegionMap[RegionInfo->getRegionFor(Exit)]->addChild(ExitMRT);
+ ExitMRT->setBBSelectRegIn(BBSelectRegIn);
+
+ for (auto MBBI : post_order(&(MF.front()))) {
+ MachineBasicBlock *MBB = &(*MBBI);
+
+ // Skip Exit since we already added it
+ if (MBB == Exit) {
+ continue;
+ }
+
+ DEBUG(dbgs() << "Visiting BB#" << MBB->getNumber() << "\n");
+ MBBMRT *NewMBB = new MBBMRT(MBB);
+ MachineRegion *Region = RegionInfo->getRegionFor(MBB);
+
+ // Ensure we have the MRT region
+ if (RegionMap.count(Region) == 0) {
+ RegionMRT *NewMRTRegion = new RegionMRT(Region);
+ RegionMap[Region] = NewMRTRegion;
+
+ // Ensure all parents are in the RegionMap
+ MachineRegion *Parent = Region->getParent();
+ while (RegionMap.count(Parent) == 0) {
+ RegionMRT *NewMRTParent = new RegionMRT(Parent);
+ NewMRTParent->addChild(NewMRTRegion);
+ NewMRTRegion->setParent(NewMRTParent);
+ RegionMap[Parent] = NewMRTParent;
+ NewMRTRegion = NewMRTParent;
+ Parent = Parent->getParent();
+ }
+ RegionMap[Parent]->addChild(NewMRTRegion);
+ NewMRTRegion->setParent(RegionMap[Parent]);
+ }
+
+ // Add MBB to Region MRT
+ RegionMap[Region]->addChild(NewMBB);
+ NewMBB->setParent(RegionMap[Region]);
+ RegionMap[Region]->setSucc(Region->getExit());
+ }
+ return Result;
+}
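+
+// buildMRT adds the exit block first and then visits the remaining
+// blocks in post order, so within each RegionMRT the Children SetVector
+// keeps the exit tree at the front and the entry tree at the back;
+// getEntryTree()/getExitTree() above rely on this ordering.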
+
+void LinearizedRegion::storeLiveOutReg(MachineBasicBlock *MBB, unsigned Reg,
+ MachineInstr *DefInstr,
+ const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI,
+ PHILinearize &PHIInfo) {
+ if (TRI->isVirtualRegister(Reg)) {
+ DEBUG(dbgs() << "Considering Register: " << PrintReg(Reg, TRI) << "\n");
+ // If this is a source register to a PHI we are chaining, it
+ // must be live out.
+ if (PHIInfo.isSource(Reg)) {
+ DEBUG(dbgs() << "Add LiveOut (PHI): " << PrintReg(Reg, TRI) << "\n");
+ addLiveOut(Reg);
+ } else {
+ // If this is live out of the MBB
+ for (auto &UI : MRI->use_operands(Reg)) {
+ if (UI.getParent()->getParent() != MBB) {
+ DEBUG(dbgs() << "Add LiveOut (MBB BB#" << MBB->getNumber()
+ << "): " << PrintReg(Reg, TRI) << "\n");
+ addLiveOut(Reg);
+ } else {
+ // If the use is in the same MBB, we have to make sure that it
+ // comes after the def; otherwise the register is live out
+ // through a loop.
+ MachineInstr *UseInstr = UI.getParent();
+ for (MachineBasicBlock::instr_iterator
+ MII = UseInstr->getIterator(),
+ MIE = UseInstr->getParent()->instr_end();
+ MII != MIE; ++MII) {
+ if ((&(*MII)) == DefInstr) {
+ DEBUG(dbgs() << "Add LiveOut (Loop): " << PrintReg(Reg, TRI)
+ << "\n");
+ addLiveOut(Reg);
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+void LinearizedRegion::storeLiveOutRegRegion(RegionMRT *Region, unsigned Reg,
+ MachineInstr *DefInstr,
+ const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI,
+ PHILinearize &PHIInfo) {
+ if (TRI->isVirtualRegister(Reg)) {
+ DEBUG(dbgs() << "Considering Register: " << PrintReg(Reg, TRI) << "\n");
+ for (auto &UI : MRI->use_operands(Reg)) {
+ if (!Region->contains(UI.getParent()->getParent())) {
+ DEBUG(dbgs() << "Add LiveOut (Region " << (void *)Region
+ << "): " << PrintReg(Reg, TRI) << "\n");
+ addLiveOut(Reg);
+ }
+ }
+ }
+}
+
+void LinearizedRegion::storeLiveOuts(MachineBasicBlock *MBB,
+ const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI,
+ PHILinearize &PHIInfo) {
+ DEBUG(dbgs() << "-Store Live Outs Begin (BB#" << MBB->getNumber() << ")-\n");
+ for (auto &II : *MBB) {
+ for (auto &RI : II.defs()) {
+ storeLiveOutReg(MBB, RI.getReg(), RI.getParent(), MRI, TRI, PHIInfo);
+ }
+ for (auto &IRI : II.implicit_operands()) {
+ if (IRI.isDef()) {
+ storeLiveOutReg(MBB, IRI.getReg(), IRI.getParent(), MRI, TRI, PHIInfo);
+ }
+ }
+ }
+
+ // If a successor has a PHI with a source coming from this MBB, we have
+ // to add that register as a live out.
+ for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+ E = MBB->succ_end();
+ SI != E; ++SI) {
+ for (auto &II : *(*SI)) {
+ if (II.isPHI()) {
+ MachineInstr &PHI = II;
+ int numPreds = getPHINumInputs(PHI);
+ for (int i = 0; i < numPreds; ++i) {
+ if (getPHIPred(PHI, i) == MBB) {
+ unsigned PHIReg = getPHISourceReg(PHI, i);
+ DEBUG(dbgs() << "Add LiveOut (PhiSource BB#" << MBB->getNumber()
+ << " -> BB#" << (*SI)->getNumber()
+ << "): " << PrintReg(PHIReg, TRI) << "\n");
+ addLiveOut(PHIReg);
+ }
+ }
+ }
+ }
+ }
+
+ DEBUG(dbgs() << "-Store Live Outs Endn-\n");
+}
+
+void LinearizedRegion::storeMBBLiveOuts(MachineBasicBlock *MBB,
+ const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI,
+ PHILinearize &PHIInfo,
+ RegionMRT *TopRegion) {
+ for (auto &II : *MBB) {
+ for (auto &RI : II.defs()) {
+ storeLiveOutRegRegion(TopRegion, RI.getReg(), RI.getParent(), MRI, TRI,
+ PHIInfo);
+ }
+ for (auto &IRI : II.implicit_operands()) {
+ if (IRI.isDef()) {
+ storeLiveOutRegRegion(TopRegion, IRI.getReg(), IRI.getParent(), MRI,
+ TRI, PHIInfo);
+ }
+ }
+ }
+}
+
+void LinearizedRegion::storeLiveOuts(RegionMRT *Region,
+ const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI,
+ PHILinearize &PHIInfo,
+ RegionMRT *CurrentTopRegion) {
+ MachineBasicBlock *Exit = Region->getSucc();
+
+ RegionMRT *TopRegion =
+ CurrentTopRegion == nullptr ? Region : CurrentTopRegion;
+
+ // Check if the exit is the end of the function; if so, there are no
+ // live outs.
+ if (Exit == nullptr)
+ return;
+
+ auto Children = Region->getChildren();
+ for (auto CI : *Children) {
+ if (CI->isMBB()) {
+ auto MBB = CI->getMBBMRT()->getMBB();
+ storeMBBLiveOuts(MBB, MRI, TRI, PHIInfo, TopRegion);
+ } else {
+ LinearizedRegion *SubRegion = CI->getRegionMRT()->getLinearizedRegion();
+ // We should be limited to storing only the registers that are live
+ // out of the linearized region.
+ for (auto MBBI : SubRegion->MBBs) {
+ storeMBBLiveOuts(MBBI, MRI, TRI, PHIInfo, TopRegion);
+ }
+ }
+ }
+
+ if (CurrentTopRegion == nullptr) {
+ auto Succ = Region->getSucc();
+ for (auto &II : *Succ) {
+ if (II.isPHI()) {
+ MachineInstr &PHI = II;
+ int numPreds = getPHINumInputs(PHI);
+ for (int i = 0; i < numPreds; ++i) {
+ if (Region->contains(getPHIPred(PHI, i))) {
+ unsigned PHIReg = getPHISourceReg(PHI, i);
+ DEBUG(dbgs() << "Add Region LiveOut (" << (void *)Region
+ << "): " << PrintReg(PHIReg, TRI) << "\n");
+ addLiveOut(PHIReg);
+ }
+ }
+ }
+ }
+ }
+}
+
+void LinearizedRegion::print(raw_ostream &OS, const TargetRegisterInfo *TRI) {
+ OS << "Linearized Region {";
+ bool IsFirst = true;
+ for (const auto &MBB : MBBs) {
+ if (IsFirst) {
+ IsFirst = false;
+ } else {
+ OS << " ,";
+ }
+ OS << MBB->getNumber();
+ }
+ OS << "} (" << Entry->getNumber() << ", "
+ << (Exit == nullptr ? -1 : Exit->getNumber())
+ << "): In:" << PrintReg(getBBSelectRegIn(), TRI)
+ << " Out:" << PrintReg(getBBSelectRegOut(), TRI) << " {";
+ for (auto &LI : LiveOuts) {
+ OS << PrintReg(LI, TRI) << " ";
+ }
+ OS << "} \n";
+}
+
+unsigned LinearizedRegion::getBBSelectRegIn() {
+ return getRegionMRT()->getBBSelectRegIn();
+}
+
+unsigned LinearizedRegion::getBBSelectRegOut() {
+ return getRegionMRT()->getBBSelectRegOut();
+}
+
+void LinearizedRegion::setHasLoop(bool Value) { HasLoop = Value; }
+
+bool LinearizedRegion::getHasLoop() { return HasLoop; }
+
+void LinearizedRegion::addLiveOut(unsigned VReg) { LiveOuts.insert(VReg); }
+
+void LinearizedRegion::removeLiveOut(unsigned Reg) {
+ if (isLiveOut(Reg))
+ LiveOuts.erase(Reg);
+}
+
+void LinearizedRegion::replaceLiveOut(unsigned OldReg, unsigned NewReg) {
+ if (isLiveOut(OldReg)) {
+ removeLiveOut(OldReg);
+ addLiveOut(NewReg);
+ }
+}
+
+void LinearizedRegion::replaceRegister(unsigned Register, unsigned NewRegister,
+ MachineRegisterInfo *MRI,
+ bool ReplaceInside, bool ReplaceOutside,
+ bool IncludeLoopPHI) {
+ assert(Register != NewRegister && "Cannot replace a reg with itself");
+
+ DEBUG(dbgs() << "Pepareing to replace register (region): "
+ << PrintReg(Register, MRI->getTargetRegisterInfo()) << " with "
+ << PrintReg(NewRegister, MRI->getTargetRegisterInfo()) << "\n");
+
+ // If we are replacing outside, we also need to update the LiveOuts
+ if (ReplaceOutside &&
+ (isLiveOut(Register) || this->getParent()->isLiveOut(Register))) {
+ LinearizedRegion *Current = this;
+ while (Current != nullptr && Current->getEntry() != nullptr) {
+ DEBUG(dbgs() << "Region before register replace\n");
+ DEBUG(Current->print(dbgs(), MRI->getTargetRegisterInfo()));
+ Current->replaceLiveOut(Register, NewRegister);
+ DEBUG(dbgs() << "Region after register replace\n");
+ DEBUG(Current->print(dbgs(), MRI->getTargetRegisterInfo()));
+ Current = Current->getParent();
+ }
+ }
+
+ for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(Register),
+ E = MRI->reg_end();
+ I != E;) {
+ MachineOperand &O = *I;
+ ++I;
+
+ // We don't rewrite defs.
+ if (O.isDef())
+ continue;
+
+ bool IsInside = contains(O.getParent()->getParent());
+ bool IsLoopPHI = IsInside && (O.getParent()->isPHI() &&
+ O.getParent()->getParent() == getEntry());
+ bool ShouldReplace = (IsInside && ReplaceInside) ||
+ (!IsInside && ReplaceOutside) ||
+ (IncludeLoopPHI && IsLoopPHI);
+ if (ShouldReplace) {
+ if (TargetRegisterInfo::isPhysicalRegister(NewRegister)) {
+ DEBUG(dbgs() << "Trying to substitute physical register: "
+ << PrintReg(NewRegister, MRI->getTargetRegisterInfo())
+ << "\n");
+ llvm_unreachable("Cannot substitute physical registers");
+ } else {
+ DEBUG(dbgs() << "Replacing register (region): "
+ << PrintReg(Register, MRI->getTargetRegisterInfo())
+ << " with "
+ << PrintReg(NewRegister, MRI->getTargetRegisterInfo())
+ << "\n");
+ O.setReg(NewRegister);
+ }
+ }
+ }
+}
+
+void LinearizedRegion::replaceRegisterInsideRegion(unsigned Register,
+ unsigned NewRegister,
+ bool IncludeLoopPHIs,
+ MachineRegisterInfo *MRI) {
+ replaceRegister(Register, NewRegister, MRI, true, false, IncludeLoopPHIs);
+}
+
+void LinearizedRegion::replaceRegisterOutsideRegion(unsigned Register,
+ unsigned NewRegister,
+ bool IncludeLoopPHIs,
+ MachineRegisterInfo *MRI) {
+ replaceRegister(Register, NewRegister, MRI, false, true, IncludeLoopPHIs);
+}
+
+DenseSet<unsigned> *LinearizedRegion::getLiveOuts() { return &LiveOuts; }
+
+void LinearizedRegion::setEntry(MachineBasicBlock *NewEntry) {
+ Entry = NewEntry;
+}
+
+MachineBasicBlock *LinearizedRegion::getEntry() { return Entry; }
+
+void LinearizedRegion::setExit(MachineBasicBlock *NewExit) { Exit = NewExit; }
+
+MachineBasicBlock *LinearizedRegion::getExit() { return Exit; }
+
+void LinearizedRegion::addMBB(MachineBasicBlock *MBB) { MBBs.insert(MBB); }
+
+void LinearizedRegion::addMBBs(LinearizedRegion *InnerRegion) {
+ for (const auto &MBB : InnerRegion->MBBs) {
+ addMBB(MBB);
+ }
+}
+
+bool LinearizedRegion::contains(MachineBasicBlock *MBB) {
+ return MBBs.count(MBB) == 1;
+}
+
+bool LinearizedRegion::isLiveOut(unsigned Reg) {
+ return LiveOuts.count(Reg) == 1;
+}
+
+bool LinearizedRegion::hasNoDef(unsigned Reg, MachineRegisterInfo *MRI) {
+ return MRI->def_begin(Reg) == MRI->def_end();
+}
+
+// After the code has been structurized, uses that were flagged as kills
+// before are not necessarily register kills anymore.
+void LinearizedRegion::removeFalseRegisterKills(MachineRegisterInfo *MRI) {
+ const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo();
+ for (auto MBBI : MBBs) {
+ MachineBasicBlock *MBB = MBBI;
+ for (auto &II : *MBB) {
+ for (auto &RI : II.uses()) {
+ if (RI.isReg()) {
+ unsigned Reg = RI.getReg();
+ if (TRI->isVirtualRegister(Reg)) {
+ if (hasNoDef(Reg, MRI))
+ continue;
+ if (!MRI->hasOneDef(Reg)) {
+ DEBUG(this->getEntry()->getParent()->dump());
+ DEBUG(dbgs() << PrintReg(Reg, TRI) << "\n");
+ }
+
+ if (MRI->def_begin(Reg) == MRI->def_end()) {
+ DEBUG(dbgs() << "Register "
+ << PrintReg(Reg, MRI->getTargetRegisterInfo())
+ << " has NO defs\n");
+ } else if (!MRI->hasOneDef(Reg)) {
+ DEBUG(dbgs() << "Register "
+ << PrintReg(Reg, MRI->getTargetRegisterInfo())
+ << " has multiple defs\n");
+ }
+
+ assert(MRI->hasOneDef(Reg) && "Register has multiple definitions");
+ MachineOperand *Def = &(*(MRI->def_begin(Reg)));
+ MachineOperand *UseOperand = &(RI);
+ bool UseIsOutsideDefMBB = Def->getParent()->getParent() != MBB;
+ if (UseIsOutsideDefMBB && UseOperand->isKill()) {
+ DEBUG(dbgs() << "Removing kill flag on register: "
+ << PrintReg(Reg, TRI) << "\n");
+ UseOperand->setIsKill(false);
+ }
+ }
+ }
+ }
+ }
+ }
+}
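+
+// After linearization, blocks that used to lie on mutually exclusive
+// paths execute on a single path, so a kill flag on a use outside the
+// defining block may no longer mark the last use of the value; clearing
+// those flags conservatively keeps the liveness information correct.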
+
+void LinearizedRegion::initLiveOut(RegionMRT *Region,
+ const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI,
+ PHILinearize &PHIInfo) {
+ storeLiveOuts(Region, MRI, TRI, PHIInfo);
+}
+
+LinearizedRegion::LinearizedRegion(MachineBasicBlock *MBB,
+ const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI,
+ PHILinearize &PHIInfo) {
+ setEntry(MBB);
+ setExit(MBB);
+ storeLiveOuts(MBB, MRI, TRI, PHIInfo);
+ MBBs.insert(MBB);
+ Parent = nullptr;
+}
+
+LinearizedRegion::LinearizedRegion() {
+ setEntry(nullptr);
+ setExit(nullptr);
+ Parent = nullptr;
+}
+
+LinearizedRegion::~LinearizedRegion() {}
+
+class AMDGPUMachineCFGStructurizer : public MachineFunctionPass {
+private:
+ const MachineRegionInfo *Regions;
+ const SIInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ MachineRegisterInfo *MRI;
+ unsigned BBSelectRegister;
+ PHILinearize PHIInfo;
+ DenseMap<MachineBasicBlock *, MachineBasicBlock *> FallthroughMap;
+
+ void getPHIRegionIndices(RegionMRT *Region, MachineInstr &PHI,
+ SmallVector<unsigned, 2> &RegionIndices);
+ void getPHIRegionIndices(LinearizedRegion *Region, MachineInstr &PHI,
+ SmallVector<unsigned, 2> &RegionIndices);
+ void getPHINonRegionIndices(LinearizedRegion *Region, MachineInstr &PHI,
+ SmallVector<unsigned, 2> &PHINonRegionIndices);
+
+ void storePHILinearizationInfoDest(
+ unsigned LDestReg, MachineInstr &PHI,
+ SmallVector<unsigned, 2> *RegionIndices = nullptr);
+
+ unsigned storePHILinearizationInfo(MachineInstr &PHI,
+ SmallVector<unsigned, 2> *RegionIndices);
+
+ void extractKilledPHIs(MachineBasicBlock *MBB);
+
+ bool shrinkPHI(MachineInstr &PHI, SmallVector<unsigned, 2> &PHIIndices,
+ unsigned *ReplaceReg);
+
+ bool shrinkPHI(MachineInstr &PHI, unsigned CombinedSourceReg,
+ MachineBasicBlock *SourceMBB,
+ SmallVector<unsigned, 2> &PHIIndices, unsigned *ReplaceReg);
+
+ void replacePHI(MachineInstr &PHI, unsigned CombinedSourceReg,
+ MachineBasicBlock *LastMerge,
+ SmallVector<unsigned, 2> &PHIRegionIndices);
+ void replaceEntryPHI(MachineInstr &PHI, unsigned CombinedSourceReg,
+ MachineBasicBlock *IfMBB,
+ SmallVector<unsigned, 2> &PHIRegionIndices);
+ void replaceLiveOutRegs(MachineInstr &PHI,
+ SmallVector<unsigned, 2> &PHIRegionIndices,
+ unsigned CombinedSourceReg,
+ LinearizedRegion *LRegion);
+ void rewriteRegionExitPHI(RegionMRT *Region, MachineBasicBlock *LastMerge,
+ MachineInstr &PHI, LinearizedRegion *LRegion);
+
+ void rewriteRegionExitPHIs(RegionMRT *Region, MachineBasicBlock *LastMerge,
+ LinearizedRegion *LRegion);
+ void rewriteRegionEntryPHI(LinearizedRegion *Region, MachineBasicBlock *IfMBB,
+ MachineInstr &PHI);
+ void rewriteRegionEntryPHIs(LinearizedRegion *Region,
+ MachineBasicBlock *IfMBB);
+
+ bool regionIsSimpleIf(RegionMRT *Region);
+
+ void transformSimpleIfRegion(RegionMRT *Region);
+
+ void eliminateDeadBranchOperands(MachineBasicBlock::instr_iterator &II);
+
+ void insertUnconditionalBranch(MachineBasicBlock *MBB,
+ MachineBasicBlock *Dest,
+ const DebugLoc &DL = DebugLoc());
+
+ MachineBasicBlock *createLinearizedExitBlock(RegionMRT *Region);
+
+ void insertMergePHI(MachineBasicBlock *IfBB, MachineBasicBlock *CodeBB,
+ MachineBasicBlock *MergeBB, unsigned DestRegister,
+ unsigned IfSourceRegister, unsigned CodeSourceRegister,
+ bool IsUndefIfSource = false);
+
+ MachineBasicBlock *createIfBlock(MachineBasicBlock *MergeBB,
+ MachineBasicBlock *CodeBBStart,
+ MachineBasicBlock *CodeBBEnd,
+ MachineBasicBlock *SelectBB, unsigned IfReg,
+ bool InheritPreds);
+
+ void prunePHIInfo(MachineBasicBlock *MBB);
+ void createEntryPHI(LinearizedRegion *CurrentRegion, unsigned DestReg);
+
+ void createEntryPHIs(LinearizedRegion *CurrentRegion);
+ void resolvePHIInfos(MachineBasicBlock *FunctionEntry);
+
+ void replaceRegisterWith(unsigned Register, unsigned NewRegister);
+
+ MachineBasicBlock *createIfRegion(MachineBasicBlock *MergeBB,
+ MachineBasicBlock *CodeBB,
+ LinearizedRegion *LRegion,
+ unsigned BBSelectRegIn,
+ unsigned BBSelectRegOut);
+
+ MachineBasicBlock *
+ createIfRegion(MachineBasicBlock *MergeMBB, LinearizedRegion *InnerRegion,
+ LinearizedRegion *CurrentRegion, MachineBasicBlock *SelectBB,
+ unsigned BBSelectRegIn, unsigned BBSelectRegOut);
+ void ensureCondIsNotKilled(SmallVector<MachineOperand, 1> Cond);
+
+ void rewriteCodeBBTerminator(MachineBasicBlock *CodeBB,
+ MachineBasicBlock *MergeBB,
+ unsigned BBSelectReg);
+
+ MachineInstr *getDefInstr(unsigned Reg);
+ void insertChainedPHI(MachineBasicBlock *IfBB, MachineBasicBlock *CodeBB,
+ MachineBasicBlock *MergeBB,
+ LinearizedRegion *InnerRegion, unsigned DestReg,
+ unsigned SourceReg);
+ bool containsDef(MachineBasicBlock *MBB, LinearizedRegion *InnerRegion,
+ unsigned Register);
+ void rewriteLiveOutRegs(MachineBasicBlock *IfBB, MachineBasicBlock *CodeBB,
+ MachineBasicBlock *MergeBB,
+ LinearizedRegion *InnerRegion,
+ LinearizedRegion *LRegion);
+
+ void splitLoopPHI(MachineInstr &PHI, MachineBasicBlock *Entry,
+ MachineBasicBlock *EntrySucc, LinearizedRegion *LRegion);
+ void splitLoopPHIs(MachineBasicBlock *Entry, MachineBasicBlock *EntrySucc,
+ LinearizedRegion *LRegion);
+
+ MachineBasicBlock *splitExit(LinearizedRegion *LRegion);
+
+ MachineBasicBlock *splitEntry(LinearizedRegion *LRegion);
+
+ LinearizedRegion *initLinearizedRegion(RegionMRT *Region);
+
+ bool structurizeComplexRegion(RegionMRT *Region);
+
+ bool structurizeRegion(RegionMRT *Region);
+
+ bool structurizeRegions(RegionMRT *Region, bool isTopRegion);
+
+public:
+ static char ID;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineRegionInfoPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ AMDGPUMachineCFGStructurizer() : MachineFunctionPass(ID) {
+ initializeAMDGPUMachineCFGStructurizerPass(*PassRegistry::getPassRegistry());
+ }
+
+ void initFallthroughMap(MachineFunction &MF);
+
+ void createLinearizedRegion(RegionMRT *Region, unsigned SelectOut);
+
+ unsigned initializeSelectRegisters(MRT *MRT, unsigned ExistingExitReg,
+ MachineRegisterInfo *MRI,
+ const SIInstrInfo *TII);
+
+ RegionMRT *RMRT;
+ void setRegionMRT(RegionMRT *RegionTree) { RMRT = RegionTree; }
+
+ RegionMRT *getRegionMRT() { return RMRT; }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+}
+
+char AMDGPUMachineCFGStructurizer::ID = 0;
+
+bool AMDGPUMachineCFGStructurizer::regionIsSimpleIf(RegionMRT *Region) {
+ MachineBasicBlock *Entry = Region->getEntry();
+ MachineBasicBlock *Succ = Region->getSucc();
+ bool FoundBypass = false;
+ bool FoundIf = false;
+
+ if (Entry->succ_size() != 2) {
+ return false;
+ }
+
+ for (MachineBasicBlock::const_succ_iterator SI = Entry->succ_begin(),
+ E = Entry->succ_end();
+ SI != E; ++SI) {
+ MachineBasicBlock *Current = *SI;
+
+ if (Current == Succ) {
+ FoundBypass = true;
+ } else if ((Current->succ_size() == 1) &&
+ *(Current->succ_begin()) == Succ) {
+ FoundIf = true;
+ }
+ }
+
+ return FoundIf && FoundBypass;
+}
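+
+// The "simple if" shape accepted above is:
+//
+//      Entry
+//      /   \
+//    If     |   (bypass edge straight to the successor)
+//      \   /
+//      Succ
+//
+// i.e. the entry has exactly two successors: the region successor itself
+// and a single block whose only successor is the region successor.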
+
+void AMDGPUMachineCFGStructurizer::transformSimpleIfRegion(RegionMRT *Region) {
+ MachineBasicBlock *Entry = Region->getEntry();
+ MachineBasicBlock *Exit = Region->getExit();
+ TII->convertNonUniformIfRegion(Entry, Exit);
+}
+
+static void fixMBBTerminator(MachineBasicBlock *MBB) {
+ if (MBB->succ_size() == 1) {
+ auto *Succ = *(MBB->succ_begin());
+ for (auto &TI : MBB->terminators()) {
+ for (auto &UI : TI.uses()) {
+ if (UI.isMBB() && UI.getMBB() != Succ) {
+ UI.setMBB(Succ);
+ }
+ }
+ }
+ }
+}
+
+static void fixRegionTerminator(RegionMRT *Region) {
+ MachineBasicBlock *InternalSucc = nullptr;
+ MachineBasicBlock *ExternalSucc = nullptr;
+ LinearizedRegion *LRegion = Region->getLinearizedRegion();
+ auto Exit = LRegion->getExit();
+
+ SmallPtrSet<MachineBasicBlock *, 2> Successors;
+ for (MachineBasicBlock::const_succ_iterator SI = Exit->succ_begin(),
+ SE = Exit->succ_end();
+ SI != SE; ++SI) {
+ MachineBasicBlock *Succ = *SI;
+ if (LRegion->contains(Succ)) {
+ // Do not allow re-assignment.
+ assert(InternalSucc == nullptr);
+ InternalSucc = Succ;
+ } else {
+ // Do not allow re-assignment.
+ assert(ExternalSucc == nullptr);
+ ExternalSucc = Succ;
+ }
+ }
+
+ for (auto &TI : Exit->terminators()) {
+ for (auto &UI : TI.uses()) {
+ if (UI.isMBB()) {
+ auto Target = UI.getMBB();
+ if (Target != InternalSucc && Target != ExternalSucc) {
+ UI.setMBB(ExternalSucc);
+ }
+ }
+ }
+ }
+}
+
+// If a region is just a sequence of regions (and the exit block in the
+// case of the top-level region), we can simply skip linearizing it,
+// because it is already linear.
+static bool regionIsSequence(RegionMRT *Region) {
+ auto Children = Region->getChildren();
+ for (auto CI : *Children) {
+ if (!CI->isRegion()) {
+ if (CI->getMBBMRT()->getMBB()->succ_size() > 1) {
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+static void fixupRegionExits(RegionMRT *Region) {
+ auto Children = Region->getChildren();
+ for (auto CI : *Children) {
+ if (!CI->isRegion()) {
+ fixMBBTerminator(CI->getMBBMRT()->getMBB());
+ } else {
+ fixRegionTerminator(CI->getRegionMRT());
+ }
+ }
+}
+
+void AMDGPUMachineCFGStructurizer::getPHIRegionIndices(
+ RegionMRT *Region, MachineInstr &PHI,
+ SmallVector<unsigned, 2> &PHIRegionIndices) {
+ unsigned NumInputs = getPHINumInputs(PHI);
+ for (unsigned i = 0; i < NumInputs; ++i) {
+ MachineBasicBlock *Pred = getPHIPred(PHI, i);
+ if (Region->contains(Pred)) {
+ PHIRegionIndices.push_back(i);
+ }
+ }
+}
+
+void AMDGPUMachineCFGStructurizer::getPHIRegionIndices(
+ LinearizedRegion *Region, MachineInstr &PHI,
+ SmallVector<unsigned, 2> &PHIRegionIndices) {
+ unsigned NumInputs = getPHINumInputs(PHI);
+ for (unsigned i = 0; i < NumInputs; ++i) {
+ MachineBasicBlock *Pred = getPHIPred(PHI, i);
+ if (Region->contains(Pred)) {
+ PHIRegionIndices.push_back(i);
+ }
+ }
+}
+
+void AMDGPUMachineCFGStructurizer::getPHINonRegionIndices(
+ LinearizedRegion *Region, MachineInstr &PHI,
+ SmallVector<unsigned, 2> &PHINonRegionIndices) {
+ unsigned NumInputs = getPHINumInputs(PHI);
+ for (unsigned i = 0; i < NumInputs; ++i) {
+ MachineBasicBlock *Pred = getPHIPred(PHI, i);
+ if (!Region->contains(Pred)) {
+ PHINonRegionIndices.push_back(i);
+ }
+ }
+}
+
+void AMDGPUMachineCFGStructurizer::storePHILinearizationInfoDest(
+ unsigned LDestReg, MachineInstr &PHI,
+ SmallVector<unsigned, 2> *RegionIndices) {
+ if (RegionIndices) {
+ for (auto i : *RegionIndices) {
+ PHIInfo.addSource(LDestReg, getPHISourceReg(PHI, i), getPHIPred(PHI, i));
+ }
+ } else {
+ unsigned NumInputs = getPHINumInputs(PHI);
+ for (unsigned i = 0; i < NumInputs; ++i) {
+ PHIInfo.addSource(LDestReg, getPHISourceReg(PHI, i), getPHIPred(PHI, i));
+ }
+ }
+}
+
+unsigned AMDGPUMachineCFGStructurizer::storePHILinearizationInfo(
+ MachineInstr &PHI, SmallVector<unsigned, 2> *RegionIndices) {
+ unsigned DestReg = getPHIDestReg(PHI);
+ unsigned LinearizeDestReg =
+ MRI->createVirtualRegister(MRI->getRegClass(DestReg));
+ PHIInfo.addDest(LinearizeDestReg, PHI.getDebugLoc());
+ storePHILinearizationInfoDest(LinearizeDestReg, PHI, RegionIndices);
+ return LinearizeDestReg;
+}
+
+void AMDGPUMachineCFGStructurizer::extractKilledPHIs(MachineBasicBlock *MBB) {
+ // We need to create a new chain for the killed phi, but there is no
+ // need to do the renaming outside or inside the block.
+ SmallPtrSet<MachineInstr *, 2> PHIs;
+ for (MachineBasicBlock::instr_iterator I = MBB->instr_begin(),
+ E = MBB->instr_end();
+ I != E; ++I) {
+ MachineInstr &Instr = *I;
+ if (Instr.isPHI()) {
+ unsigned PHIDestReg = getPHIDestReg(Instr);
+ DEBUG(dbgs() << "Extractking killed phi:\n");
+ DEBUG(Instr.dump());
+ PHIs.insert(&Instr);
+ PHIInfo.addDest(PHIDestReg, Instr.getDebugLoc());
+ storePHILinearizationInfoDest(PHIDestReg, Instr);
+ }
+ }
+
+ for (auto PI : PHIs) {
+ PI->eraseFromParent();
+ }
+}
+
+static bool isPHIRegionIndex(const SmallVector<unsigned, 2> &PHIRegionIndices,
+ unsigned Index) {
+ for (auto i : PHIRegionIndices) {
+ if (i == Index)
+ return true;
+ }
+ return false;
+}
+
+bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI,
+ SmallVector<unsigned, 2> &PHIIndices,
+ unsigned *ReplaceReg) {
+ return shrinkPHI(PHI, 0, nullptr, PHIIndices, ReplaceReg);
+}
+
+bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI,
+ unsigned CombinedSourceReg,
+ MachineBasicBlock *SourceMBB,
+ SmallVector<unsigned, 2> &PHIIndices,
+ unsigned *ReplaceReg) {
+ DEBUG(dbgs() << "Shrink PHI: ");
+ DEBUG(PHI.dump());
+ DEBUG(dbgs() << " to " << PrintReg(getPHIDestReg(PHI), TRI)
+ << "<def> = PHI(");
+
+ bool Replaced = false;
+ unsigned NumInputs = getPHINumInputs(PHI);
+ int SingleExternalEntryIndex = -1;
+ for (unsigned i = 0; i < NumInputs; ++i) {
+ if (!isPHIRegionIndex(PHIIndices, i)) {
+ if (SingleExternalEntryIndex == -1) {
+ // Single entry
+ SingleExternalEntryIndex = i;
+ } else {
+ // Multiple entries
+ SingleExternalEntryIndex = -2;
+ }
+ }
+ }
+
+ if (SingleExternalEntryIndex > -1) {
+ *ReplaceReg = getPHISourceReg(PHI, SingleExternalEntryIndex);
+ // We should not rewrite the code; we should only pick up the single
+ // value that represents the shrunk PHI.
+ Replaced = true;
+ } else {
+ MachineBasicBlock *MBB = PHI.getParent();
+ MachineInstrBuilder MIB =
+ BuildMI(*MBB, PHI, PHI.getDebugLoc(), TII->get(TargetOpcode::PHI),
+ getPHIDestReg(PHI));
+ if (SourceMBB) {
+ MIB.addReg(CombinedSourceReg);
+ MIB.addMBB(SourceMBB);
+ DEBUG(dbgs() << PrintReg(CombinedSourceReg, TRI) << ", BB#"
+ << SourceMBB->getNumber());
+ }
+
+ for (unsigned i = 0; i < NumInputs; ++i) {
+ if (isPHIRegionIndex(PHIIndices, i)) {
+ continue;
+ }
+ unsigned SourceReg = getPHISourceReg(PHI, i);
+ MachineBasicBlock *SourcePred = getPHIPred(PHI, i);
+ MIB.addReg(SourceReg);
+ MIB.addMBB(SourcePred);
+ DEBUG(dbgs() << PrintReg(SourceReg, TRI) << ", BB#"
+ << SourcePred->getNumber());
+ }
+ DEBUG(dbgs() << ")\n");
+ }
+ PHI.eraseFromParent();
+ return Replaced;
+}
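+
+// For illustration (register numbers hypothetical): shrinking
+//   %5<def> = PHI(%1, BB#2, %2, BB#3, %3, BB#7)
+// with PHIIndices = {0, 1} leaves %3 as the single external input, so the
+// PHI is erased and *ReplaceReg is set to %3. With more than one external
+// input, a smaller PHI is rebuilt from CombinedSourceReg/SourceMBB plus
+// the remaining external edges.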
+
+void AMDGPUMachineCFGStructurizer::replacePHI(
+ MachineInstr &PHI, unsigned CombinedSourceReg, MachineBasicBlock *LastMerge,
+ SmallVector<unsigned, 2> &PHIRegionIndices) {
+ DEBUG(dbgs() << "Replace PHI: ");
+ DEBUG(PHI.dump());
+ DEBUG(dbgs() << " with " << PrintReg(getPHIDestReg(PHI), TRI)
+ << "<def> = PHI(");
+
+ bool HasExternalEdge = false;
+ unsigned NumInputs = getPHINumInputs(PHI);
+ for (unsigned i = 0; i < NumInputs; ++i) {
+ if (!isPHIRegionIndex(PHIRegionIndices, i)) {
+ HasExternalEdge = true;
+ }
+ }
+
+ if (HasExternalEdge) {
+ MachineBasicBlock *MBB = PHI.getParent();
+ MachineInstrBuilder MIB =
+ BuildMI(*MBB, PHI, PHI.getDebugLoc(), TII->get(TargetOpcode::PHI),
+ getPHIDestReg(PHI));
+ MIB.addReg(CombinedSourceReg);
+ MIB.addMBB(LastMerge);
+ DEBUG(dbgs() << PrintReg(CombinedSourceReg, TRI) << ", BB#"
+ << LastMerge->getNumber());
+ for (unsigned i = 0; i < NumInputs; ++i) {
+ if (isPHIRegionIndex(PHIRegionIndices, i)) {
+ continue;
+ }
+ unsigned SourceReg = getPHISourceReg(PHI, i);
+ MachineBasicBlock *SourcePred = getPHIPred(PHI, i);
+ MIB.addReg(SourceReg);
+ MIB.addMBB(SourcePred);
+ DEBUG(dbgs() << PrintReg(SourceReg, TRI) << ", BB#"
+ << SourcePred->getNumber());
+ }
+ DEBUG(dbgs() << ")\n");
+ } else {
+ replaceRegisterWith(getPHIDestReg(PHI), CombinedSourceReg);
+ }
+ PHI.eraseFromParent();
+}
+
+void AMDGPUMachineCFGStructurizer::replaceEntryPHI(
+ MachineInstr &PHI, unsigned CombinedSourceReg, MachineBasicBlock *IfMBB,
+ SmallVector<unsigned, 2> &PHIRegionIndices) {
+ DEBUG(dbgs() << "Replace entry PHI: ");
+ DEBUG(PHI.dump());
+ DEBUG(dbgs() << " with ");
+
+ unsigned NumInputs = getPHINumInputs(PHI);
+ unsigned NumNonRegionInputs = NumInputs;
+ for (unsigned i = 0; i < NumInputs; ++i) {
+ if (isPHIRegionIndex(PHIRegionIndices, i)) {
+ NumNonRegionInputs--;
+ }
+ }
+
+ if (NumNonRegionInputs == 0) {
+ auto DestReg = getPHIDestReg(PHI);
+ replaceRegisterWith(DestReg, CombinedSourceReg);
+ DEBUG(dbgs() << " register " << PrintReg(CombinedSourceReg, TRI) << "\n");
+ PHI.eraseFromParent();
+ } else {
+ DEBUG(dbgs() << PrintReg(getPHIDestReg(PHI), TRI) << "<def> = PHI(");
+ MachineBasicBlock *MBB = PHI.getParent();
+ MachineInstrBuilder MIB =
+ BuildMI(*MBB, PHI, PHI.getDebugLoc(), TII->get(TargetOpcode::PHI),
+ getPHIDestReg(PHI));
+ MIB.addReg(CombinedSourceReg);
+ MIB.addMBB(IfMBB);
+ DEBUG(dbgs() << PrintReg(CombinedSourceReg, TRI) << ", BB#"
+ << IfMBB->getNumber());
+ unsigned NumInputs = getPHINumInputs(PHI);
+ for (unsigned i = 0; i < NumInputs; ++i) {
+ if (isPHIRegionIndex(PHIRegionIndices, i)) {
+ continue;
+ }
+ unsigned SourceReg = getPHISourceReg(PHI, i);
+ MachineBasicBlock *SourcePred = getPHIPred(PHI, i);
+ MIB.addReg(SourceReg);
+ MIB.addMBB(SourcePred);
+ DEBUG(dbgs() << PrintReg(SourceReg, TRI) << ", BB#"
+ << SourcePred->getNumber());
+ }
+ DEBUG(dbgs() << ")\n");
+ PHI.eraseFromParent();
+ }
+}
+
+void AMDGPUMachineCFGStructurizer::replaceLiveOutRegs(
+ MachineInstr &PHI, SmallVector<unsigned, 2> &PHIRegionIndices,
+ unsigned CombinedSourceReg, LinearizedRegion *LRegion) {
+ bool WasLiveOut = false;
+ for (auto PII : PHIRegionIndices) {
+ unsigned Reg = getPHISourceReg(PHI, PII);
+ if (LRegion->isLiveOut(Reg)) {
+ bool IsDead = true;
+
+ // Check if register is live out of the basic block
+ MachineBasicBlock *DefMBB = getDefInstr(Reg)->getParent();
+ for (auto UI = MRI->use_begin(Reg), E = MRI->use_end(); UI != E; ++UI) {
+ if ((*UI).getParent()->getParent() != DefMBB) {
+ IsDead = false;
+ }
+ }
+
+ DEBUG(dbgs() << "Register " << PrintReg(Reg, TRI) << " is "
+ << (IsDead ? "dead" : "alive") << " after PHI replace\n");
+ if (IsDead) {
+ LRegion->removeLiveOut(Reg);
+ }
+ WasLiveOut = true;
+ }
+ }
+
+ if (WasLiveOut)
+ LRegion->addLiveOut(CombinedSourceReg);
+}
+
+void AMDGPUMachineCFGStructurizer::rewriteRegionExitPHI(RegionMRT *Region,
+ MachineBasicBlock *LastMerge,
+ MachineInstr &PHI,
+ LinearizedRegion *LRegion) {
+ SmallVector<unsigned, 2> PHIRegionIndices;
+ getPHIRegionIndices(Region, PHI, PHIRegionIndices);
+ unsigned LinearizedSourceReg =
+ storePHILinearizationInfo(PHI, &PHIRegionIndices);
+
+ replacePHI(PHI, LinearizedSourceReg, LastMerge, PHIRegionIndices);
+ replaceLiveOutRegs(PHI, PHIRegionIndices, LinearizedSourceReg, LRegion);
+}
+
+void AMDGPUMachineCFGStructurizer::rewriteRegionEntryPHI(LinearizedRegion *Region,
+ MachineBasicBlock *IfMBB,
+ MachineInstr &PHI) {
+ SmallVector<unsigned, 2> PHINonRegionIndices;
+ getPHINonRegionIndices(Region, PHI, PHINonRegionIndices);
+ unsigned LinearizedSourceReg =
+ storePHILinearizationInfo(PHI, &PHINonRegionIndices);
+ replaceEntryPHI(PHI, LinearizedSourceReg, IfMBB, PHINonRegionIndices);
+}
+
+static void collectPHIs(MachineBasicBlock *MBB,
+ SmallVector<MachineInstr *, 2> &PHIs) {
+ for (auto &BBI : *MBB) {
+ if (BBI.isPHI()) {
+ PHIs.push_back(&BBI);
+ }
+ }
+}
+
+void AMDGPUMachineCFGStructurizer::rewriteRegionExitPHIs(RegionMRT *Region,
+ MachineBasicBlock *LastMerge,
+ LinearizedRegion *LRegion) {
+ SmallVector<MachineInstr *, 2> PHIs;
+ auto Exit = Region->getSucc();
+ if (Exit == nullptr)
+ return;
+
+ collectPHIs(Exit, PHIs);
+
+ for (auto PHII : PHIs) {
+ rewriteRegionExitPHI(Region, LastMerge, *PHII, LRegion);
+ }
+}
+
+void AMDGPUMachineCFGStructurizer::rewriteRegionEntryPHIs(LinearizedRegion *Region,
+ MachineBasicBlock *IfMBB) {
+ SmallVector<MachineInstr *, 2> PHIs;
+ auto Entry = Region->getEntry();
+
+ collectPHIs(Entry, PHIs);
+
+ for (auto PHII : PHIs) {
+ rewriteRegionEntryPHI(Region, IfMBB, *PHII);
+ }
+}
+
+void AMDGPUMachineCFGStructurizer::insertUnconditionalBranch(MachineBasicBlock *MBB,
+ MachineBasicBlock *Dest,
+ const DebugLoc &DL) {
+ DEBUG(dbgs() << "Inserting unconditional branch: " << MBB->getNumber()
+ << " -> " << Dest->getNumber() << "\n");
+ MachineBasicBlock::instr_iterator Terminator = MBB->getFirstInstrTerminator();
+ bool HasTerminator = Terminator != MBB->instr_end();
+ if (HasTerminator) {
+ TII->ReplaceTailWithBranchTo(Terminator, Dest);
+ }
+ if (++MachineFunction::iterator(MBB) != MachineFunction::iterator(Dest)) {
+ TII->insertUnconditionalBranch(*MBB, Dest, DL);
+ }
+}
+
+static MachineBasicBlock *getSingleExitNode(MachineFunction &MF) {
+ MachineBasicBlock *Result = nullptr;
+ for (auto &MFI : MF) {
+ if (MFI.succ_size() == 0) {
+ if (Result == nullptr) {
+ Result = &MFI;
+ } else {
+ return nullptr;
+ }
+ }
+ }
+
+ return Result;
+}
+
+static bool hasOneExitNode(MachineFunction &MF) {
+ return getSingleExitNode(MF) != nullptr;
+}
+
+MachineBasicBlock *
+AMDGPUMachineCFGStructurizer::createLinearizedExitBlock(RegionMRT *Region) {
+ auto Exit = Region->getSucc();
+
+ // If the exit is the end of the function, we just use the existing
+ // exit block.
+ MachineFunction *MF = Region->getEntry()->getParent();
+ if (Exit == nullptr && hasOneExitNode(*MF)) {
+ return &(*(--(Region->getEntry()->getParent()->end())));
+ }
+
+ MachineBasicBlock *LastMerge = MF->CreateMachineBasicBlock();
+ if (Exit == nullptr) {
+ MachineFunction::iterator ExitIter = MF->end();
+ MF->insert(ExitIter, LastMerge);
+ } else {
+ MachineFunction::iterator ExitIter = Exit->getIterator();
+ MF->insert(ExitIter, LastMerge);
+ LastMerge->addSuccessor(Exit);
+ insertUnconditionalBranch(LastMerge, Exit);
+ DEBUG(dbgs() << "Created exit block: " << LastMerge->getNumber() << "\n");
+ }
+ return LastMerge;
+}
+
+void AMDGPUMachineCFGStructurizer::insertMergePHI(MachineBasicBlock *IfBB,
+ MachineBasicBlock *CodeBB,
+ MachineBasicBlock *MergeBB,
+ unsigned DestRegister,
+ unsigned IfSourceRegister,
+ unsigned CodeSourceRegister,
+ bool IsUndefIfSource) {
+ // If this is the function exit block, we don't need a phi.
+ if (MergeBB->succ_begin() == MergeBB->succ_end()) {
+ return;
+ }
+ DEBUG(dbgs() << "Merge PHI (BB#" << MergeBB->getNumber()
+ << "): " << PrintReg(DestRegister, TRI) << "<def> = PHI("
+ << PrintReg(IfSourceRegister, TRI) << ", BB#"
+               << IfBB->getNumber() << ", "
+               << PrintReg(CodeSourceRegister, TRI)
+ << ", BB#" << CodeBB->getNumber() << ")\n");
+ const DebugLoc &DL = MergeBB->findDebugLoc(MergeBB->begin());
+ MachineInstrBuilder MIB = BuildMI(*MergeBB, MergeBB->instr_begin(), DL,
+ TII->get(TargetOpcode::PHI), DestRegister);
+  // FIXME: Marking the if-source undef is disabled for now.
+  if (IsUndefIfSource && false) {
+ MIB.addReg(IfSourceRegister, RegState::Undef);
+ } else {
+ MIB.addReg(IfSourceRegister);
+ }
+ MIB.addMBB(IfBB);
+ MIB.addReg(CodeSourceRegister);
+ MIB.addMBB(CodeBB);
+}
+
+static void removeExternalCFGSuccessors(MachineBasicBlock *MBB) {
+  // Copy the successor list first; removeSuccessor invalidates the
+  // successor iterators we would otherwise be walking.
+  SmallVector<MachineBasicBlock *, 4> Succs(MBB->succ_begin(),
+                                            MBB->succ_end());
+  for (MachineBasicBlock *Succ : Succs) {
+    if (Succ != MBB)
+      MBB->removeSuccessor(Succ);
+  }
+}
+
+static void removeExternalCFGEdges(MachineBasicBlock *StartMBB,
+ MachineBasicBlock *EndMBB) {
+
+  // We have to check against the StartMBB successor because a
+ // structurized region with a loop will have the entry block split,
+ // and the backedge will go to the entry successor.
+ DenseSet<std::pair<MachineBasicBlock *, MachineBasicBlock *>> Succs;
+ unsigned SuccSize = StartMBB->succ_size();
+ if (SuccSize > 0) {
+ MachineBasicBlock *StartMBBSucc = *(StartMBB->succ_begin());
+ for (MachineBasicBlock::succ_iterator PI = EndMBB->succ_begin(),
+ E = EndMBB->succ_end();
+ PI != E; ++PI) {
+ // Either we have a back-edge to the entry block, or a back-edge to the
+      // successor of the entry block since the block may be split.
+ if ((*PI) != StartMBB &&
+ !((*PI) == StartMBBSucc && StartMBB != EndMBB && SuccSize == 1)) {
+ Succs.insert(
+ std::pair<MachineBasicBlock *, MachineBasicBlock *>(EndMBB, *PI));
+ }
+ }
+ }
+
+ for (MachineBasicBlock::pred_iterator PI = StartMBB->pred_begin(),
+ E = StartMBB->pred_end();
+ PI != E; ++PI) {
+ if ((*PI) != EndMBB) {
+ Succs.insert(
+ std::pair<MachineBasicBlock *, MachineBasicBlock *>(*PI, StartMBB));
+ }
+ }
+
+ for (auto SI : Succs) {
+ std::pair<MachineBasicBlock *, MachineBasicBlock *> Edge = SI;
+ DEBUG(dbgs() << "Removing edge: BB#" << Edge.first->getNumber() << " -> BB#"
+ << Edge.second->getNumber() << "\n");
+ Edge.first->removeSuccessor(Edge.second);
+ }
+}
+
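+// Create an if block in front of the linearized code region spanning
+// CodeBBStart..CodeBBEnd. The if block compares the bb-select register
+// IfReg against the number of SelectBB and branches either into the
+// code region or directly to MergeBB.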
+MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfBlock(
+ MachineBasicBlock *MergeBB, MachineBasicBlock *CodeBBStart,
+ MachineBasicBlock *CodeBBEnd, MachineBasicBlock *SelectBB, unsigned IfReg,
+ bool InheritPreds) {
+ MachineFunction *MF = MergeBB->getParent();
+ MachineBasicBlock *IfBB = MF->CreateMachineBasicBlock();
+
+ if (InheritPreds) {
+ for (MachineBasicBlock::pred_iterator PI = CodeBBStart->pred_begin(),
+ E = CodeBBStart->pred_end();
+ PI != E; ++PI) {
+ if ((*PI) != CodeBBEnd) {
+ MachineBasicBlock *Pred = (*PI);
+ Pred->addSuccessor(IfBB);
+ }
+ }
+ }
+
+ removeExternalCFGEdges(CodeBBStart, CodeBBEnd);
+
+ auto CodeBBStartI = CodeBBStart->getIterator();
+ auto CodeBBEndI = CodeBBEnd->getIterator();
+ auto MergeIter = MergeBB->getIterator();
+ MF->insert(MergeIter, IfBB);
+ MF->splice(MergeIter, CodeBBStartI, ++CodeBBEndI);
+ IfBB->addSuccessor(MergeBB);
+ IfBB->addSuccessor(CodeBBStart);
+
+ DEBUG(dbgs() << "Created If block: " << IfBB->getNumber() << "\n");
+  // Ensure that MergeBB is a successor of CodeBBEnd.
+ if (!CodeBBEnd->isSuccessor(MergeBB))
+ CodeBBEnd->addSuccessor(MergeBB);
+
+ DEBUG(dbgs() << "Moved MBB#" << CodeBBStart->getNumber() << " through MBB#"
+ << CodeBBEnd->getNumber() << "\n");
+
+  // If we have a single predecessor, we can find a reasonable debug location.
+ MachineBasicBlock *SinglePred =
+ CodeBBStart->pred_size() == 1 ? *(CodeBBStart->pred_begin()) : nullptr;
+ const DebugLoc &DL = SinglePred
+ ? SinglePred->findDebugLoc(SinglePred->getFirstTerminator())
+ : DebugLoc();
+
+ unsigned Reg =
+ TII->insertEQ(IfBB, IfBB->begin(), DL, IfReg,
+ SelectBB->getNumber() /* CodeBBStart->getNumber() */);
+ if (&(*(IfBB->getParent()->begin())) == IfBB) {
+ TII->materializeImmediate(*IfBB, IfBB->begin(), DL, IfReg,
+ CodeBBStart->getNumber());
+ }
+ MachineOperand RegOp = MachineOperand::CreateReg(Reg, false, false, true);
+ ArrayRef<MachineOperand> Cond(RegOp);
+ TII->insertBranch(*IfBB, MergeBB, CodeBBStart, Cond, DL);
+
+ return IfBB;
+}
+
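+// Clear the kill flags on all uses of the condition register; the
+// register gains an additional use when the branch condition is reused
+// for the vector select in rewriteCodeBBTerminator.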
+void AMDGPUMachineCFGStructurizer::ensureCondIsNotKilled(
+ SmallVector<MachineOperand, 1> Cond) {
+ if (Cond.size() != 1)
+ return;
+ if (!Cond[0].isReg())
+ return;
+
+ unsigned CondReg = Cond[0].getReg();
+ for (auto UI = MRI->use_begin(CondReg), E = MRI->use_end(); UI != E; ++UI) {
+ (*UI).setIsKill(false);
+ }
+}
+
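+// Rewrite the terminator of CodeBB so that, instead of branching, the
+// block writes the number of the intended successor into BBSelectReg
+// (via a vector select if there are two candidates) and then branches
+// unconditionally to MergeBB.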
+void AMDGPUMachineCFGStructurizer::rewriteCodeBBTerminator(MachineBasicBlock *CodeBB,
+ MachineBasicBlock *MergeBB,
+ unsigned BBSelectReg) {
+ MachineBasicBlock *TrueBB = nullptr;
+ MachineBasicBlock *FalseBB = nullptr;
+ SmallVector<MachineOperand, 1> Cond;
+ MachineBasicBlock *FallthroughBB = FallthroughMap[CodeBB];
+ TII->analyzeBranch(*CodeBB, TrueBB, FalseBB, Cond);
+
+ const DebugLoc &DL = CodeBB->findDebugLoc(CodeBB->getFirstTerminator());
+
+ if (FalseBB == nullptr && TrueBB == nullptr && FallthroughBB == nullptr) {
+ // This is an exit block, hence no successors. We will assign the
+ // bb select register to the entry block.
+ TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL,
+ BBSelectReg,
+ CodeBB->getParent()->begin()->getNumber());
+ insertUnconditionalBranch(CodeBB, MergeBB, DL);
+ return;
+ }
+
+ if (FalseBB == nullptr && TrueBB == nullptr) {
+ TrueBB = FallthroughBB;
+ } else if (TrueBB != nullptr) {
+ FalseBB =
+ (FallthroughBB && (FallthroughBB != TrueBB)) ? FallthroughBB : FalseBB;
+ }
+
+ if ((TrueBB != nullptr && FalseBB == nullptr) || (TrueBB == FalseBB)) {
+ TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL,
+ BBSelectReg, TrueBB->getNumber());
+ } else {
+ const TargetRegisterClass *RegClass = MRI->getRegClass(BBSelectReg);
+ unsigned TrueBBReg = MRI->createVirtualRegister(RegClass);
+ unsigned FalseBBReg = MRI->createVirtualRegister(RegClass);
+ TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL,
+ TrueBBReg, TrueBB->getNumber());
+ TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL,
+ FalseBBReg, FalseBB->getNumber());
+ ensureCondIsNotKilled(Cond);
+ TII->insertVectorSelect(*CodeBB, CodeBB->getFirstTerminator(), DL,
+ BBSelectReg, Cond, TrueBBReg, FalseBBReg);
+ }
+
+ insertUnconditionalBranch(CodeBB, MergeBB, DL);
+}
+
+MachineInstr *AMDGPUMachineCFGStructurizer::getDefInstr(unsigned Reg) {
+ if (MRI->def_begin(Reg) == MRI->def_end()) {
+ DEBUG(dbgs() << "Register " << PrintReg(Reg, MRI->getTargetRegisterInfo())
+ << " has NO defs\n");
+ } else if (!MRI->hasOneDef(Reg)) {
+ DEBUG(dbgs() << "Register " << PrintReg(Reg, MRI->getTargetRegisterInfo())
+ << " has multiple defs\n");
+ DEBUG(dbgs() << "DEFS BEGIN:\n");
+ for (auto DI = MRI->def_begin(Reg), DE = MRI->def_end(); DI != DE; ++DI) {
+ DEBUG(DI->getParent()->dump());
+ }
+ DEBUG(dbgs() << "DEFS END\n");
+ }
+
+ assert(MRI->hasOneDef(Reg) && "Register has multiple definitions");
+ return (*(MRI->def_begin(Reg))).getParent();
+}
+
+void AMDGPUMachineCFGStructurizer::insertChainedPHI(MachineBasicBlock *IfBB,
+ MachineBasicBlock *CodeBB,
+ MachineBasicBlock *MergeBB,
+ LinearizedRegion *InnerRegion,
+ unsigned DestReg,
+ unsigned SourceReg) {
+ // In this function we know we are part of a chain already, so we need
+ // to add the registers to the existing chain, and rename the register
+ // inside the region.
+ bool IsSingleBB = InnerRegion->getEntry() == InnerRegion->getExit();
+ MachineInstr *DefInstr = getDefInstr(SourceReg);
+ if (DefInstr->isPHI() && DefInstr->getParent() == CodeBB && IsSingleBB) {
+ // Handle the case where the def is a PHI-def inside a basic
+ // block, then we only need to do renaming. Special care needs to
+ // be taken if the PHI-def is part of an existing chain, or if a
+ // new one needs to be created.
+ InnerRegion->replaceRegisterInsideRegion(SourceReg, DestReg, true, MRI);
+
+ // We collect all PHI Information, and if we are at the region entry,
+ // all PHIs will be removed, and then re-introduced if needed.
+ storePHILinearizationInfoDest(DestReg, *DefInstr);
+ // We have picked up all the information we need now and can remove
+ // the PHI
+ PHIInfo.removeSource(DestReg, SourceReg, CodeBB);
+ DefInstr->eraseFromParent();
+ } else {
+    // This is not a phi-def, or it is a phi-def from a linearized region.
+ if (IsSingleBB && DefInstr->getParent() == InnerRegion->getEntry()) {
+ // If this is a single BB and the definition is in this block we
+ // need to replace any uses outside the region.
+ InnerRegion->replaceRegisterOutsideRegion(SourceReg, DestReg, false, MRI);
+ }
+ const TargetRegisterClass *RegClass = MRI->getRegClass(DestReg);
+ unsigned NextDestReg = MRI->createVirtualRegister(RegClass);
+ bool IsLastDef = PHIInfo.getNumSources(DestReg) == 1;
+ DEBUG(dbgs() << "Insert Chained PHI\n");
+ insertMergePHI(IfBB, InnerRegion->getExit(), MergeBB, DestReg, NextDestReg,
+ SourceReg, IsLastDef);
+
+ PHIInfo.removeSource(DestReg, SourceReg, CodeBB);
+ if (IsLastDef) {
+ const DebugLoc &DL = IfBB->findDebugLoc(IfBB->getFirstTerminator());
+ TII->materializeImmediate(*IfBB, IfBB->getFirstTerminator(), DL,
+ NextDestReg, 0);
+ PHIInfo.deleteDef(DestReg);
+ } else {
+ PHIInfo.replaceDef(DestReg, NextDestReg);
+ }
+ }
+}
+
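+// Return true if Register is defined in MBB or inside InnerRegion.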
+bool AMDGPUMachineCFGStructurizer::containsDef(MachineBasicBlock *MBB,
+ LinearizedRegion *InnerRegion,
+ unsigned Register) {
+ return getDefInstr(Register)->getParent() == MBB ||
+ InnerRegion->contains(getDefInstr(Register)->getParent());
+}
+
+void AMDGPUMachineCFGStructurizer::rewriteLiveOutRegs(MachineBasicBlock *IfBB,
+ MachineBasicBlock *CodeBB,
+ MachineBasicBlock *MergeBB,
+ LinearizedRegion *InnerRegion,
+ LinearizedRegion *LRegion) {
+ DenseSet<unsigned> *LiveOuts = InnerRegion->getLiveOuts();
+ SmallVector<unsigned, 4> OldLiveOuts;
+ bool IsSingleBB = InnerRegion->getEntry() == InnerRegion->getExit();
+ for (auto OLI : *LiveOuts) {
+ OldLiveOuts.push_back(OLI);
+ }
+
+ for (auto LI : OldLiveOuts) {
+ DEBUG(dbgs() << "LiveOut: " << PrintReg(LI, TRI));
+ if (!containsDef(CodeBB, InnerRegion, LI) ||
+ (!IsSingleBB && (getDefInstr(LI)->getParent() == LRegion->getExit()))) {
+      // If the register simply lives through the CodeBB, we don't have
+ // to rewrite anything since the register is not defined in this
+ // part of the code.
+ DEBUG(dbgs() << "- through");
+ continue;
+ }
+ DEBUG(dbgs() << "\n");
+ unsigned Reg = LI;
+ if (/*!PHIInfo.isSource(Reg) &&*/ Reg != InnerRegion->getBBSelectRegOut()) {
+ // If the register is live out, we do want to create a phi,
+      // unless it is from the Exit block, because in that case there
+ // is already a PHI, and no need to create a new one.
+
+ // If the register is just a live out def and not part of a phi
+ // chain, we need to create a PHI node to handle the if region,
+ // and replace all uses outside of the region with the new dest
+ // register, unless it is the outgoing BB select register. We have
+      // already created phi nodes for these.
+ const TargetRegisterClass *RegClass = MRI->getRegClass(Reg);
+ unsigned PHIDestReg = MRI->createVirtualRegister(RegClass);
+ unsigned IfSourceReg = MRI->createVirtualRegister(RegClass);
+      // Create an initializer; this value is never used, but is needed
+ // to satisfy SSA.
+ DEBUG(dbgs() << "Initializer for reg: " << PrintReg(Reg) << "\n");
+ TII->materializeImmediate(*IfBB, IfBB->getFirstTerminator(), DebugLoc(),
+ IfSourceReg, 0);
+
+ InnerRegion->replaceRegisterOutsideRegion(Reg, PHIDestReg, true, MRI);
+ DEBUG(dbgs() << "Insert Non-Chained Live out PHI\n");
+ insertMergePHI(IfBB, InnerRegion->getExit(), MergeBB, PHIDestReg,
+ IfSourceReg, Reg, true);
+ }
+ }
+
+ // Handle the chained definitions in PHIInfo, checking if this basic block
+ // is a source block for a definition.
+ SmallVector<unsigned, 4> Sources;
+ if (PHIInfo.findSourcesFromMBB(CodeBB, Sources)) {
+ DEBUG(dbgs() << "Inserting PHI Live Out from BB#" << CodeBB->getNumber()
+ << "\n");
+ for (auto SI : Sources) {
+ unsigned DestReg;
+ PHIInfo.findDest(SI, CodeBB, DestReg);
+ insertChainedPHI(IfBB, CodeBB, MergeBB, InnerRegion, DestReg, SI);
+ }
+ DEBUG(dbgs() << "Insertion done.\n");
+ }
+
+ DEBUG(PHIInfo.dump(MRI));
+}
+
+void AMDGPUMachineCFGStructurizer::prunePHIInfo(MachineBasicBlock *MBB) {
+ DEBUG(dbgs() << "Before PHI Prune\n");
+ DEBUG(PHIInfo.dump(MRI));
+ SmallVector<std::tuple<unsigned, unsigned, MachineBasicBlock *>, 4>
+      EliminatedSources;
+ for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE;
+ ++DRI) {
+
+ unsigned DestReg = *DRI;
+ auto SE = PHIInfo.sources_end(DestReg);
+
+ bool MBBContainsPHISource = false;
+ // Check if there is a PHI source in this MBB
+ for (auto SRI = PHIInfo.sources_begin(DestReg); SRI != SE; ++SRI) {
+ unsigned SourceReg = (*SRI).first;
+ MachineOperand *Def = &(*(MRI->def_begin(SourceReg)));
+ if (Def->getParent()->getParent() == MBB) {
+ MBBContainsPHISource = true;
+ }
+ }
+
+ // If so, all other sources are useless since we know this block
+ // is always executed when the region is executed.
+ if (MBBContainsPHISource) {
+ for (auto SRI = PHIInfo.sources_begin(DestReg); SRI != SE; ++SRI) {
+ PHILinearize::PHISourceT Source = *SRI;
+ unsigned SourceReg = Source.first;
+ MachineBasicBlock *SourceMBB = Source.second;
+ MachineOperand *Def = &(*(MRI->def_begin(SourceReg)));
+ if (Def->getParent()->getParent() != MBB) {
+          EliminatedSources.push_back(
+ std::make_tuple(DestReg, SourceReg, SourceMBB));
+ }
+ }
+ }
+ }
+
+ // Remove the PHI sources that are in the given MBB
+  for (auto &SourceInfo : EliminatedSources) {
+ PHIInfo.removeSource(std::get<0>(SourceInfo), std::get<1>(SourceInfo),
+ std::get<2>(SourceInfo));
+ }
+ DEBUG(dbgs() << "After PHI Prune\n");
+ DEBUG(PHIInfo.dump(MRI));
+}
+
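+// Create the entry PHI for DestReg. Sources defined inside the region
+// are folded into a chain of backedge PHIs placed at their defining
+// blocks; the chained value then enters the entry PHI through the
+// region exit.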
+void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegion,
+ unsigned DestReg) {
+ MachineBasicBlock *Entry = CurrentRegion->getEntry();
+ MachineBasicBlock *Exit = CurrentRegion->getExit();
+
+ DEBUG(dbgs() << "RegionExit: " << Exit->getNumber()
+ << " Pred: " << (*(Entry->pred_begin()))->getNumber() << "\n");
+
+ int NumSources = 0;
+ auto SE = PHIInfo.sources_end(DestReg);
+
+ for (auto SRI = PHIInfo.sources_begin(DestReg); SRI != SE; ++SRI) {
+ NumSources++;
+ }
+
+ if (NumSources == 1) {
+ auto SRI = PHIInfo.sources_begin(DestReg);
+ unsigned SourceReg = (*SRI).first;
+ replaceRegisterWith(DestReg, SourceReg);
+ } else {
+ const DebugLoc &DL = Entry->findDebugLoc(Entry->begin());
+ MachineInstrBuilder MIB = BuildMI(*Entry, Entry->instr_begin(), DL,
+ TII->get(TargetOpcode::PHI), DestReg);
+ DEBUG(dbgs() << "Entry PHI " << PrintReg(DestReg, TRI) << "<def> = PHI(");
+
+ unsigned CurrentBackedgeReg = 0;
+
+ for (auto SRI = PHIInfo.sources_begin(DestReg); SRI != SE; ++SRI) {
+ unsigned SourceReg = (*SRI).first;
+
+ if (CurrentRegion->contains((*SRI).second)) {
+ if (CurrentBackedgeReg == 0) {
+ CurrentBackedgeReg = SourceReg;
+ } else {
+ MachineInstr *PHIDefInstr = getDefInstr(SourceReg);
+ MachineBasicBlock *PHIDefMBB = PHIDefInstr->getParent();
+ const TargetRegisterClass *RegClass =
+ MRI->getRegClass(CurrentBackedgeReg);
+ unsigned NewBackedgeReg = MRI->createVirtualRegister(RegClass);
+ MachineInstrBuilder BackedgePHI =
+ BuildMI(*PHIDefMBB, PHIDefMBB->instr_begin(), DL,
+ TII->get(TargetOpcode::PHI), NewBackedgeReg);
+ BackedgePHI.addReg(CurrentBackedgeReg);
+ BackedgePHI.addMBB(getPHIPred(*PHIDefInstr, 0));
+ BackedgePHI.addReg(getPHISourceReg(*PHIDefInstr, 1));
+ BackedgePHI.addMBB((*SRI).second);
+          DEBUG(dbgs() << "Inserting backedge PHI: "
+                       << PrintReg(NewBackedgeReg, TRI) << "<def> = PHI("
+                       << PrintReg(CurrentBackedgeReg, TRI) << ", BB#"
+                       << getPHIPred(*PHIDefInstr, 0)->getNumber() << ", "
+                       << PrintReg(getPHISourceReg(*PHIDefInstr, 1), TRI)
+                       << ", BB#" << (*SRI).second->getNumber());
+          CurrentBackedgeReg = NewBackedgeReg;
+ }
+ } else {
+ MIB.addReg(SourceReg);
+ MIB.addMBB((*SRI).second);
+ DEBUG(dbgs() << PrintReg(SourceReg, TRI) << ", BB#"
+ << (*SRI).second->getNumber() << ", ");
+ }
+ }
+
+ // Add the final backedge register source to the entry phi
+ if (CurrentBackedgeReg != 0) {
+ MIB.addReg(CurrentBackedgeReg);
+ MIB.addMBB(Exit);
+ DEBUG(dbgs() << PrintReg(CurrentBackedgeReg, TRI) << ", BB#"
+ << Exit->getNumber() << ")\n");
+ } else {
+ DEBUG(dbgs() << ")\n");
+ }
+ }
+}
+
+void AMDGPUMachineCFGStructurizer::createEntryPHIs(LinearizedRegion *CurrentRegion) {
+ DEBUG(PHIInfo.dump(MRI));
+
+ for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE;
+ ++DRI) {
+
+ unsigned DestReg = *DRI;
+ createEntryPHI(CurrentRegion, DestReg);
+ }
+ PHIInfo.clear();
+}
+
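+// Replace all defs and uses of Register with NewRegister and update the
+// PHI bookkeeping and region live-out information accordingly.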
+void AMDGPUMachineCFGStructurizer::replaceRegisterWith(unsigned Register,
+ unsigned NewRegister) {
+ assert(Register != NewRegister && "Cannot replace a reg with itself");
+
+ for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(Register),
+ E = MRI->reg_end();
+ I != E;) {
+ MachineOperand &O = *I;
+ ++I;
+ if (TargetRegisterInfo::isPhysicalRegister(NewRegister)) {
+ DEBUG(dbgs() << "Trying to substitute physical register: "
+ << PrintReg(NewRegister, MRI->getTargetRegisterInfo())
+ << "\n");
+ llvm_unreachable("Cannot substitute physical registers");
+      // We don't handle physical registers, but if we need to in the
+      // future, this is how it would be done:
+ // O.substPhysReg(NewRegister, *TRI);
+ } else {
+ DEBUG(dbgs() << "Replacing register: "
+ << PrintReg(Register, MRI->getTargetRegisterInfo())
+ << " with "
+ << PrintReg(NewRegister, MRI->getTargetRegisterInfo())
+ << "\n");
+ O.setReg(NewRegister);
+ }
+ }
+ PHIInfo.deleteDef(Register);
+
+ getRegionMRT()->replaceLiveOutReg(Register, NewRegister);
+
+ DEBUG(PHIInfo.dump(MRI));
+}
+
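+// At this point every PHI destination is expected to have exactly one
+// remaining source, so each PHI can be resolved by renaming.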
+void AMDGPUMachineCFGStructurizer::resolvePHIInfos(MachineBasicBlock *FunctionEntry) {
+ DEBUG(dbgs() << "Resolve PHI Infos\n");
+ DEBUG(PHIInfo.dump(MRI));
+ for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE;
+ ++DRI) {
+ unsigned DestReg = *DRI;
+ DEBUG(dbgs() << "DestReg: " << PrintReg(DestReg, TRI) << "\n");
+ auto SRI = PHIInfo.sources_begin(DestReg);
+ unsigned SourceReg = (*SRI).first;
+ DEBUG(dbgs() << "DestReg: " << PrintReg(DestReg, TRI)
+ << " SourceReg: " << PrintReg(SourceReg, TRI) << "\n");
+
+    ++SRI;
+    assert(PHIInfo.sources_end(DestReg) == SRI &&
+           "More than one phi source in entry node");
+ replaceRegisterWith(DestReg, SourceReg);
+ }
+}
+
+static bool isFunctionEntryBlock(MachineBasicBlock *MBB) {
+ return ((&(*(MBB->getParent()->begin()))) == MBB);
+}
+
+MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion(
+ MachineBasicBlock *MergeBB, MachineBasicBlock *CodeBB,
+ LinearizedRegion *CurrentRegion, unsigned BBSelectRegIn,
+ unsigned BBSelectRegOut) {
+ if (isFunctionEntryBlock(CodeBB) && !CurrentRegion->getHasLoop()) {
+    // Handle the non-loop function entry block. Entry blocks that are
+    // part of a loop are handled by the general case below.
+ rewriteCodeBBTerminator(CodeBB, MergeBB, BBSelectRegOut);
+ resolvePHIInfos(CodeBB);
+ removeExternalCFGSuccessors(CodeBB);
+ CodeBB->addSuccessor(MergeBB);
+ CurrentRegion->addMBB(CodeBB);
+ return nullptr;
+ }
+ if (CurrentRegion->getEntry() == CodeBB && !CurrentRegion->getHasLoop()) {
+ // Handle non-loop region entry block.
+ MachineFunction *MF = MergeBB->getParent();
+ auto MergeIter = MergeBB->getIterator();
+ auto CodeBBStartIter = CodeBB->getIterator();
+ auto CodeBBEndIter = ++(CodeBB->getIterator());
+ if (CodeBBEndIter != MergeIter) {
+ MF->splice(MergeIter, CodeBBStartIter, CodeBBEndIter);
+ }
+ rewriteCodeBBTerminator(CodeBB, MergeBB, BBSelectRegOut);
+ prunePHIInfo(CodeBB);
+ createEntryPHIs(CurrentRegion);
+ removeExternalCFGSuccessors(CodeBB);
+ CodeBB->addSuccessor(MergeBB);
+ CurrentRegion->addMBB(CodeBB);
+ return nullptr;
+ } else {
+ // Handle internal block.
+ const TargetRegisterClass *RegClass = MRI->getRegClass(BBSelectRegIn);
+ unsigned CodeBBSelectReg = MRI->createVirtualRegister(RegClass);
+ rewriteCodeBBTerminator(CodeBB, MergeBB, CodeBBSelectReg);
+ bool IsRegionEntryBB = CurrentRegion->getEntry() == CodeBB;
+ MachineBasicBlock *IfBB = createIfBlock(MergeBB, CodeBB, CodeBB, CodeBB,
+ BBSelectRegIn, IsRegionEntryBB);
+ CurrentRegion->addMBB(IfBB);
+ // If this is the entry block we need to make the If block the new
+ // linearized region entry.
+ if (IsRegionEntryBB) {
+ CurrentRegion->setEntry(IfBB);
+
+ if (CurrentRegion->getHasLoop()) {
+ MachineBasicBlock *RegionExit = CurrentRegion->getExit();
+ MachineBasicBlock *ETrueBB = nullptr;
+ MachineBasicBlock *EFalseBB = nullptr;
+ SmallVector<MachineOperand, 1> ECond;
+
+ const DebugLoc &DL = DebugLoc();
+ TII->analyzeBranch(*RegionExit, ETrueBB, EFalseBB, ECond);
+ TII->removeBranch(*RegionExit);
+
+ // We need to create a backedge if there is a loop
+ unsigned Reg = TII->insertNE(
+ RegionExit, RegionExit->instr_end(), DL,
+ CurrentRegion->getRegionMRT()->getInnerOutputRegister(),
+ CurrentRegion->getRegionMRT()->getEntry()->getNumber());
+ MachineOperand RegOp =
+ MachineOperand::CreateReg(Reg, false, false, true);
+ ArrayRef<MachineOperand> Cond(RegOp);
+ DEBUG(dbgs() << "RegionExitReg: ");
+ DEBUG(Cond[0].print(dbgs(), TRI));
+ DEBUG(dbgs() << "\n");
+ TII->insertBranch(*RegionExit, CurrentRegion->getEntry(), RegionExit,
+ Cond, DebugLoc());
+ RegionExit->addSuccessor(CurrentRegion->getEntry());
+ }
+ }
+ CurrentRegion->addMBB(CodeBB);
+ LinearizedRegion InnerRegion(CodeBB, MRI, TRI, PHIInfo);
+
+ InnerRegion.setParent(CurrentRegion);
+ DEBUG(dbgs() << "Insert BB Select PHI (BB)\n");
+ insertMergePHI(IfBB, CodeBB, MergeBB, BBSelectRegOut, BBSelectRegIn,
+ CodeBBSelectReg);
+ InnerRegion.addMBB(MergeBB);
+
+ DEBUG(InnerRegion.print(dbgs(), TRI));
+ rewriteLiveOutRegs(IfBB, CodeBB, MergeBB, &InnerRegion, CurrentRegion);
+ extractKilledPHIs(CodeBB);
+ if (IsRegionEntryBB) {
+ createEntryPHIs(CurrentRegion);
+ }
+ return IfBB;
+ }
+}
+
+MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion(
+ MachineBasicBlock *MergeBB, LinearizedRegion *InnerRegion,
+ LinearizedRegion *CurrentRegion, MachineBasicBlock *SelectBB,
+ unsigned BBSelectRegIn, unsigned BBSelectRegOut) {
+ unsigned CodeBBSelectReg =
+ InnerRegion->getRegionMRT()->getInnerOutputRegister();
+ MachineBasicBlock *CodeEntryBB = InnerRegion->getEntry();
+ MachineBasicBlock *CodeExitBB = InnerRegion->getExit();
+ MachineBasicBlock *IfBB = createIfBlock(MergeBB, CodeEntryBB, CodeExitBB,
+ SelectBB, BBSelectRegIn, true);
+ CurrentRegion->addMBB(IfBB);
+ bool isEntry = CurrentRegion->getEntry() == InnerRegion->getEntry();
+ if (isEntry) {
+
+ if (CurrentRegion->getHasLoop()) {
+ MachineBasicBlock *RegionExit = CurrentRegion->getExit();
+ MachineBasicBlock *ETrueBB = nullptr;
+ MachineBasicBlock *EFalseBB = nullptr;
+ SmallVector<MachineOperand, 1> ECond;
+
+ const DebugLoc &DL = DebugLoc();
+ TII->analyzeBranch(*RegionExit, ETrueBB, EFalseBB, ECond);
+ TII->removeBranch(*RegionExit);
+
+ // We need to create a backedge if there is a loop
+ unsigned Reg =
+ TII->insertNE(RegionExit, RegionExit->instr_end(), DL,
+ CurrentRegion->getRegionMRT()->getInnerOutputRegister(),
+ CurrentRegion->getRegionMRT()->getEntry()->getNumber());
+ MachineOperand RegOp = MachineOperand::CreateReg(Reg, false, false, true);
+ ArrayRef<MachineOperand> Cond(RegOp);
+ DEBUG(dbgs() << "RegionExitReg: ");
+ DEBUG(Cond[0].print(dbgs(), TRI));
+ DEBUG(dbgs() << "\n");
+ TII->insertBranch(*RegionExit, CurrentRegion->getEntry(), RegionExit,
+ Cond, DebugLoc());
+ RegionExit->addSuccessor(IfBB);
+ }
+ }
+ CurrentRegion->addMBBs(InnerRegion);
+ DEBUG(dbgs() << "Insert BB Select PHI (region)\n");
+ insertMergePHI(IfBB, CodeExitBB, MergeBB, BBSelectRegOut, BBSelectRegIn,
+ CodeBBSelectReg);
+
+ rewriteLiveOutRegs(IfBB, /* CodeEntryBB */ CodeExitBB, MergeBB, InnerRegion,
+ CurrentRegion);
+
+ rewriteRegionEntryPHIs(InnerRegion, IfBB);
+
+  if (isEntry) {
+    CurrentRegion->setEntry(IfBB);
+    createEntryPHIs(CurrentRegion);
+  }
+
+ return IfBB;
+}
+
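+// Split a loop-header PHI: the out-of-loop sources remain in a
+// (shrunken) PHI in Entry, while the single in-region backedge source
+// moves into a new PHI in EntrySucc that merges the two values.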
+void AMDGPUMachineCFGStructurizer::splitLoopPHI(MachineInstr &PHI,
+ MachineBasicBlock *Entry,
+ MachineBasicBlock *EntrySucc,
+ LinearizedRegion *LRegion) {
+ SmallVector<unsigned, 2> PHIRegionIndices;
+ getPHIRegionIndices(LRegion, PHI, PHIRegionIndices);
+
+ assert(PHIRegionIndices.size() == 1);
+
+ unsigned RegionIndex = PHIRegionIndices[0];
+ unsigned RegionSourceReg = getPHISourceReg(PHI, RegionIndex);
+ MachineBasicBlock *RegionSourceMBB = getPHIPred(PHI, RegionIndex);
+ unsigned PHIDest = getPHIDestReg(PHI);
+ unsigned PHISource = PHIDest;
+ unsigned ReplaceReg;
+
+ if (shrinkPHI(PHI, PHIRegionIndices, &ReplaceReg)) {
+ PHISource = ReplaceReg;
+ }
+
+ const TargetRegisterClass *RegClass = MRI->getRegClass(PHIDest);
+ unsigned NewDestReg = MRI->createVirtualRegister(RegClass);
+ LRegion->replaceRegisterInsideRegion(PHIDest, NewDestReg, false, MRI);
+ MachineInstrBuilder MIB =
+ BuildMI(*EntrySucc, EntrySucc->instr_begin(), PHI.getDebugLoc(),
+ TII->get(TargetOpcode::PHI), NewDestReg);
+ DEBUG(dbgs() << "Split Entry PHI " << PrintReg(NewDestReg, TRI)
+ << "<def> = PHI(");
+ MIB.addReg(PHISource);
+ MIB.addMBB(Entry);
+ DEBUG(dbgs() << PrintReg(PHISource, TRI) << ", BB#" << Entry->getNumber());
+ MIB.addReg(RegionSourceReg);
+ MIB.addMBB(RegionSourceMBB);
+ DEBUG(dbgs() << " ," << PrintReg(RegionSourceReg, TRI) << ", BB#"
+ << RegionSourceMBB->getNumber() << ")\n");
+}
+
+void AMDGPUMachineCFGStructurizer::splitLoopPHIs(MachineBasicBlock *Entry,
+ MachineBasicBlock *EntrySucc,
+ LinearizedRegion *LRegion) {
+ SmallVector<MachineInstr *, 2> PHIs;
+ collectPHIs(Entry, PHIs);
+
+ for (auto PHII : PHIs) {
+ splitLoopPHI(*PHII, Entry, EntrySucc, LRegion);
+ }
+}
+
+// Split the exit block so that we can insert an end-control-flow
+// instruction.
+MachineBasicBlock *
+AMDGPUMachineCFGStructurizer::splitExit(LinearizedRegion *LRegion) {
+ auto MRTRegion = LRegion->getRegionMRT();
+ auto Exit = LRegion->getExit();
+ auto MF = Exit->getParent();
+ auto Succ = MRTRegion->getSucc();
+
+ auto NewExit = MF->CreateMachineBasicBlock();
+ auto AfterExitIter = Exit->getIterator();
+ AfterExitIter++;
+ MF->insert(AfterExitIter, NewExit);
+ Exit->removeSuccessor(Succ);
+ Exit->addSuccessor(NewExit);
+ NewExit->addSuccessor(Succ);
+ insertUnconditionalBranch(NewExit, Succ);
+ LRegion->addMBB(NewExit);
+ LRegion->setExit(NewExit);
+
+ DEBUG(dbgs() << "Created new exit block: " << NewExit->getNumber() << "\n");
+
+ // Replace any PHI Predecessors in the successor with NewExit
+ for (auto &II : *Succ) {
+ MachineInstr &Instr = II;
+
+ // If we are past the PHI instructions we are done
+ if (!Instr.isPHI())
+ break;
+
+ int numPreds = getPHINumInputs(Instr);
+ for (int i = 0; i < numPreds; ++i) {
+ auto Pred = getPHIPred(Instr, i);
+ if (Pred == Exit) {
+ setPhiPred(Instr, i, NewExit);
+ }
+ }
+ }
+
+ return NewExit;
+}
+
+
+static MachineBasicBlock *split(MachineBasicBlock::iterator I) {
+ // Create the fall-through block.
+ MachineBasicBlock *MBB = (*I).getParent();
+ MachineFunction *MF = MBB->getParent();
+ MachineBasicBlock *SuccMBB = MF->CreateMachineBasicBlock();
+ auto MBBIter = ++(MBB->getIterator());
+ MF->insert(MBBIter, SuccMBB);
+ SuccMBB->transferSuccessorsAndUpdatePHIs(MBB);
+ MBB->addSuccessor(SuccMBB);
+
+ // Splice the code over.
+ SuccMBB->splice(SuccMBB->end(), MBB, I, MBB->end());
+
+ return SuccMBB;
+}
+
+// Split the entry block, separating the PHI nodes from the rest of the
+// code. This is needed to insert an initializer for the bb select
+// register in loop regions.
+
+MachineBasicBlock *
+AMDGPUMachineCFGStructurizer::splitEntry(LinearizedRegion *LRegion) {
+ MachineBasicBlock *Entry = LRegion->getEntry();
+ MachineBasicBlock *EntrySucc = split(Entry->getFirstNonPHI());
+ MachineBasicBlock *Exit = LRegion->getExit();
+
+ DEBUG(dbgs() << "Split BB#" << Entry->getNumber() << " to BB#"
+ << Entry->getNumber() << " -> BB#" << EntrySucc->getNumber()
+ << "\n");
+ LRegion->addMBB(EntrySucc);
+
+  // Make the backedge go to EntrySucc.
+ if (Exit->isSuccessor(Entry)) {
+ Exit->removeSuccessor(Entry);
+ }
+ Exit->addSuccessor(EntrySucc);
+ MachineInstr &Branch = *(Exit->instr_rbegin());
+ for (auto &UI : Branch.uses()) {
+ if (UI.isMBB() && UI.getMBB() == Entry) {
+ UI.setMBB(EntrySucc);
+ }
+ }
+
+ splitLoopPHIs(Entry, EntrySucc, LRegion);
+
+ return EntrySucc;
+}
+
+LinearizedRegion *
+AMDGPUMachineCFGStructurizer::initLinearizedRegion(RegionMRT *Region) {
+ LinearizedRegion *LRegion = Region->getLinearizedRegion();
+ LRegion->initLiveOut(Region, MRI, TRI, PHIInfo);
+ LRegion->setEntry(Region->getEntry());
+ return LRegion;
+}
+
+static void removeOldExitPreds(RegionMRT *Region) {
+ MachineBasicBlock *Exit = Region->getSucc();
+ if (Exit == nullptr) {
+ return;
+ }
+ for (MachineBasicBlock::pred_iterator PI = Exit->pred_begin(),
+ E = Exit->pred_end();
+ PI != E; ++PI) {
+ if (Region->contains(*PI)) {
+ (*PI)->removeSuccessor(Exit);
+ }
+ }
+}
+
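+// Return true if MBB has a successor that is already in the visited
+// set, i.e. the edge is a backedge.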
+static bool mbbHasBackEdge(MachineBasicBlock *MBB,
+ SmallPtrSet<MachineBasicBlock *, 8> &MBBs) {
+ for (auto SI = MBB->succ_begin(), SE = MBB->succ_end(); SI != SE; ++SI) {
+ if (MBBs.count(*SI) != 0) {
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool containsNewBackedge(MRT *Tree,
+ SmallPtrSet<MachineBasicBlock *, 8> &MBBs) {
+ // Need to traverse this in reverse since it is in post order.
+ if (Tree == nullptr)
+ return false;
+
+ if (Tree->isMBB()) {
+ MachineBasicBlock *MBB = Tree->getMBBMRT()->getMBB();
+ MBBs.insert(MBB);
+ if (mbbHasBackEdge(MBB, MBBs)) {
+ return true;
+ }
+ } else {
+ RegionMRT *Region = Tree->getRegionMRT();
+ SetVector<MRT *> *Children = Region->getChildren();
+ for (auto CI = Children->rbegin(), CE = Children->rend(); CI != CE; ++CI) {
+ if (containsNewBackedge(*CI, MBBs))
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool containsNewBackedge(RegionMRT *Region) {
+ SmallPtrSet<MachineBasicBlock *, 8> MBBs;
+ return containsNewBackedge(Region, MBBs);
+}
+
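+// Linearize the children of Region in order. Each child, whether an
+// inner region or a plain block, is wrapped in an if block that either
+// executes the child or skips to the next merge point, with the
+// bb-select registers deciding at run time which path is taken.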
+bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) {
+ auto *LRegion = initLinearizedRegion(Region);
+ LRegion->setHasLoop(containsNewBackedge(Region));
+ MachineBasicBlock *LastMerge = createLinearizedExitBlock(Region);
+ MachineBasicBlock *CurrentMerge = LastMerge;
+ LRegion->addMBB(LastMerge);
+ LRegion->setExit(LastMerge);
+
+ rewriteRegionExitPHIs(Region, LastMerge, LRegion);
+ removeOldExitPreds(Region);
+
+ DEBUG(PHIInfo.dump(MRI));
+
+ SetVector<MRT *> *Children = Region->getChildren();
+ DEBUG(dbgs() << "===========If Region Start===============\n");
+ if (LRegion->getHasLoop()) {
+ DEBUG(dbgs() << "Has Backedge: Yes\n");
+ } else {
+ DEBUG(dbgs() << "Has Backedge: No\n");
+ }
+
+ unsigned BBSelectRegIn;
+ unsigned BBSelectRegOut;
+ for (auto CI = Children->begin(), CE = Children->end(); CI != CE; ++CI) {
+ DEBUG(dbgs() << "CurrentRegion: \n");
+ DEBUG(LRegion->print(dbgs(), TRI));
+
+ MRT *Child = (*CI);
+
+ if (Child->isRegion()) {
+
+ LinearizedRegion *InnerLRegion =
+ Child->getRegionMRT()->getLinearizedRegion();
+      // This child is an inner region; we need to splice its blocks
+      // into the current linearized region.
+
+ DEBUG(dbgs() << "Linearizing region: ");
+ DEBUG(InnerLRegion->print(dbgs(), TRI));
+ DEBUG(dbgs() << "\n");
+
+ MachineBasicBlock *InnerEntry = InnerLRegion->getEntry();
+ if ((&(*(InnerEntry->getParent()->begin()))) == InnerEntry) {
+ // Entry has already been linearized, no need to do this region.
+ unsigned OuterSelect = InnerLRegion->getBBSelectRegOut();
+ unsigned InnerSelectReg =
+ InnerLRegion->getRegionMRT()->getInnerOutputRegister();
+        replaceRegisterWith(InnerSelectReg, OuterSelect);
+        resolvePHIInfos(InnerEntry);
+ if (!InnerLRegion->getExit()->isSuccessor(CurrentMerge))
+ InnerLRegion->getExit()->addSuccessor(CurrentMerge);
+ continue;
+ }
+
+ BBSelectRegOut = Child->getBBSelectRegOut();
+ BBSelectRegIn = Child->getBBSelectRegIn();
+
+ DEBUG(dbgs() << "BBSelectRegIn: " << PrintReg(BBSelectRegIn, TRI)
+ << "\n");
+ DEBUG(dbgs() << "BBSelectRegOut: " << PrintReg(BBSelectRegOut, TRI)
+ << "\n");
+
+ MachineBasicBlock *IfEnd = CurrentMerge;
+ CurrentMerge = createIfRegion(CurrentMerge, InnerLRegion, LRegion,
+ Child->getRegionMRT()->getEntry(),
+ BBSelectRegIn, BBSelectRegOut);
+ TII->convertNonUniformIfRegion(CurrentMerge, IfEnd);
+ } else {
+ MachineBasicBlock *MBB = Child->getMBBMRT()->getMBB();
+ DEBUG(dbgs() << "Linearizing block: " << MBB->getNumber() << "\n");
+
+ if (MBB == getSingleExitNode(*(MBB->getParent()))) {
+ // If this is the exit block then we need to skip to the next.
+ // The "in" register will be transferred to "out" in the next
+ // iteration.
+ continue;
+ }
+
+ BBSelectRegOut = Child->getBBSelectRegOut();
+ BBSelectRegIn = Child->getBBSelectRegIn();
+
+ DEBUG(dbgs() << "BBSelectRegIn: " << PrintReg(BBSelectRegIn, TRI)
+ << "\n");
+ DEBUG(dbgs() << "BBSelectRegOut: " << PrintReg(BBSelectRegOut, TRI)
+ << "\n");
+
+ MachineBasicBlock *IfEnd = CurrentMerge;
+      // This is a basic block that is not part of an inner region; we
+ // need to put it in the current linearized region.
+ CurrentMerge = createIfRegion(CurrentMerge, MBB, LRegion, BBSelectRegIn,
+ BBSelectRegOut);
+ if (CurrentMerge) {
+ TII->convertNonUniformIfRegion(CurrentMerge, IfEnd);
+ }
+
+ DEBUG(PHIInfo.dump(MRI));
+ }
+ }
+
+ LRegion->removeFalseRegisterKills(MRI);
+
+ if (LRegion->getHasLoop()) {
+ MachineBasicBlock *NewSucc = splitEntry(LRegion);
+ if (isFunctionEntryBlock(LRegion->getEntry())) {
+ resolvePHIInfos(LRegion->getEntry());
+ }
+ const DebugLoc &DL = NewSucc->findDebugLoc(NewSucc->getFirstNonPHI());
+ unsigned InReg = LRegion->getBBSelectRegIn();
+ unsigned InnerSelectReg =
+ MRI->createVirtualRegister(MRI->getRegClass(InReg));
+ unsigned NewInReg = MRI->createVirtualRegister(MRI->getRegClass(InReg));
+ TII->materializeImmediate(*(LRegion->getEntry()),
+ LRegion->getEntry()->getFirstTerminator(), DL,
+ NewInReg, Region->getEntry()->getNumber());
+ // Need to be careful about updating the registers inside the region.
+ LRegion->replaceRegisterInsideRegion(InReg, InnerSelectReg, false, MRI);
+ DEBUG(dbgs() << "Loop BBSelect Merge PHI:\n");
+ insertMergePHI(LRegion->getEntry(), LRegion->getExit(), NewSucc,
+ InnerSelectReg, NewInReg,
+ LRegion->getRegionMRT()->getInnerOutputRegister());
+ splitExit(LRegion);
+ TII->convertNonUniformLoopRegion(NewSucc, LastMerge);
+ }
+
+ if (Region->isRoot()) {
+ TII->insertReturn(*LastMerge);
+ }
+
+ DEBUG(Region->getEntry()->getParent()->dump());
+ DEBUG(LRegion->print(dbgs(), TRI));
+ DEBUG(PHIInfo.dump(MRI));
+
+ DEBUG(dbgs() << "===========If Region End===============\n");
+
+ Region->setLinearizedRegion(LRegion);
+ return true;
+}
+
+bool AMDGPUMachineCFGStructurizer::structurizeRegion(RegionMRT *Region) {
+  // FIXME: The simple-if fast path is disabled for now.
+  if (false && regionIsSimpleIf(Region)) {
+    transformSimpleIfRegion(Region);
+    return true;
+  }
+  if (regionIsSequence(Region)) {
+    fixupRegionExits(Region);
+    return false;
+  }
+  return structurizeComplexRegion(Region);
+}
+
+bool AMDGPUMachineCFGStructurizer::structurizeRegions(RegionMRT *Region,
+ bool isTopRegion) {
+ bool Changed = false;
+
+ auto Children = Region->getChildren();
+ for (auto CI : *Children) {
+ if (CI->isRegion()) {
+ Changed |= structurizeRegions(CI->getRegionMRT(), false);
+ }
+ }
+
+  Changed |= structurizeRegion(Region);
+ return Changed;
+}
+
+void AMDGPUMachineCFGStructurizer::initFallthroughMap(MachineFunction &MF) {
+ DEBUG(dbgs() << "Fallthrough Map:\n");
+ for (auto &MBBI : MF) {
+ MachineBasicBlock *MBB = MBBI.getFallThrough();
+ if (MBB != nullptr) {
+ DEBUG(dbgs() << "Fallthrough: " << MBBI.getNumber() << " -> "
+ << MBB->getNumber() << "\n");
+ }
+ FallthroughMap[&MBBI] = MBB;
+ }
+}
+
+void AMDGPUMachineCFGStructurizer::createLinearizedRegion(RegionMRT *Region,
+ unsigned SelectOut) {
+ LinearizedRegion *LRegion = new LinearizedRegion();
+ if (SelectOut) {
+ LRegion->addLiveOut(SelectOut);
+ DEBUG(dbgs() << "Add LiveOut (BBSelect): " << PrintReg(SelectOut, TRI)
+ << "\n");
+ }
+ LRegion->setRegionMRT(Region);
+ Region->setLinearizedRegion(LRegion);
+ LRegion->setParent(Region->getParent()
+ ? Region->getParent()->getLinearizedRegion()
+ : nullptr);
+}
+
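+// Recursively assign bb-select registers over the region tree: every
+// node is given SelectOut as its output register and a freshly created
+// virtual register as its input, chaining the selects from child to
+// child.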
+unsigned
+AMDGPUMachineCFGStructurizer::initializeSelectRegisters(MRT *MRT, unsigned SelectOut,
+ MachineRegisterInfo *MRI,
+ const SIInstrInfo *TII) {
+ if (MRT->isRegion()) {
+ RegionMRT *Region = MRT->getRegionMRT();
+ Region->setBBSelectRegOut(SelectOut);
+ unsigned InnerSelectOut = createBBSelectReg(TII, MRI);
+
+    // FIXME: Move linearized region creation to the original spot.
+ createLinearizedRegion(Region, SelectOut);
+
+ for (auto CI = Region->getChildren()->begin(),
+ CE = Region->getChildren()->end();
+ CI != CE; ++CI) {
+ InnerSelectOut =
+ initializeSelectRegisters((*CI), InnerSelectOut, MRI, TII);
+ }
+ MRT->setBBSelectRegIn(InnerSelectOut);
+ return InnerSelectOut;
+ } else {
+ MRT->setBBSelectRegOut(SelectOut);
+ unsigned NewSelectIn = createBBSelectReg(TII, MRI);
+ MRT->setBBSelectRegIn(NewSelectIn);
+ return NewSelectIn;
+ }
+}
+
+static void checkRegOnlyPHIInputs(MachineFunction &MF) {
+ for (auto &MBBI : MF) {
+ for (MachineBasicBlock::instr_iterator I = MBBI.instr_begin(),
+ E = MBBI.instr_end();
+ I != E; ++I) {
+ MachineInstr &Instr = *I;
+ if (Instr.isPHI()) {
+ int numPreds = getPHINumInputs(Instr);
+ for (int i = 0; i < numPreds; ++i) {
+ assert(Instr.getOperand(i * 2 + 1).isReg() &&
+ "PHI Operand not a register");
+ }
+ }
+ }
+ }
+}
+
+
+INITIALIZE_PASS_BEGIN(AMDGPUMachineCFGStructurizer, "amdgpu-machine-cfg-structurizer",
+ "AMDGPU Machine CFG Structurizer", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineRegionInfoPass)
+INITIALIZE_PASS_END(AMDGPUMachineCFGStructurizer, "amdgpu-machine-cfg-structurizer",
+ "AMDGPU Machine CFG Structurizer", false, false)
+
+char AMDGPUMachineCFGStructurizerID = AMDGPUMachineCFGStructurizer::ID;
+
+
+bool AMDGPUMachineCFGStructurizer::runOnMachineFunction(MachineFunction &MF) {
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ TRI = ST.getRegisterInfo();
+ MRI = &(MF.getRegInfo());
+ initFallthroughMap(MF);
+
+ checkRegOnlyPHIInputs(MF);
+ DEBUG(dbgs() << "----STRUCTURIZER START----\n");
+ DEBUG(MF.dump());
+
+ Regions = &(getAnalysis<MachineRegionInfoPass>().getRegionInfo());
+ DEBUG(Regions->dump());
+
+ RegionMRT *RTree = MRT::buildMRT(MF, Regions, TII, MRI);
+ setRegionMRT(RTree);
+ initializeSelectRegisters(RTree, 0, MRI, TII);
+ DEBUG(RTree->dump(TRI));
+ bool result = structurizeRegions(RTree, true);
+ delete RTree;
+ DEBUG(dbgs() << "----STRUCTURIZER END----\n");
+ initFallthroughMap(MF);
+ return result;
+}
+
+FunctionPass *llvm::createAMDGPUMachineCFGStructurizerPass() {
+ return new AMDGPUMachineCFGStructurizer();
+}
diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 36dcc699d4ea..e40f39557747 100644
--- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -397,14 +397,17 @@ static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
// instructions.
static bool canVectorizeInst(Instruction *Inst, User *User) {
switch (Inst->getOpcode()) {
- case Instruction::Load:
+ case Instruction::Load: {
+ LoadInst *LI = cast<LoadInst>(Inst);
+ return !LI->isVolatile();
+ }
case Instruction::BitCast:
case Instruction::AddrSpaceCast:
return true;
case Instruction::Store: {
// Must be the stored pointer operand, not a stored value.
StoreInst *SI = cast<StoreInst>(Inst);
- return SI->getPointerOperand() == User;
+ return (SI->getPointerOperand() == User) && !SI->isVolatile();
}
default:
return false;
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 972c28579f7a..6e301b4ad527 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -125,6 +125,9 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
HasSDWA(false),
HasDPP(false),
FlatAddressSpace(false),
+ FlatInstOffsets(false),
+ FlatGlobalInsts(false),
+ FlatScratchInsts(false),
R600ALUInst(false),
CaymanISA(false),
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index a5cda817ac11..bed7d326b3dd 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -145,6 +145,9 @@ protected:
bool HasSDWA;
bool HasDPP;
bool FlatAddressSpace;
+ bool FlatInstOffsets;
+ bool FlatGlobalInsts;
+ bool FlatScratchInsts;
bool R600ALUInst;
bool CaymanISA;
bool CFALUBug;
@@ -380,6 +383,18 @@ public:
return FlatAddressSpace;
}
+ bool hasFlatInstOffsets() const {
+ return FlatInstOffsets;
+ }
+
+ bool hasFlatGlobalInsts() const {
+ return FlatGlobalInsts;
+ }
+
+ bool hasFlatScratchInsts() const {
+ return FlatScratchInsts;
+ }
+
bool isMesaKernel(const MachineFunction &MF) const {
return isMesa3DOS() && !AMDGPU::isShader(MF.getFunction()->getCallingConv());
}
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index cd5bad04d0b3..386a88b0520f 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -118,6 +118,13 @@ static cl::opt<bool> EnableSIInsertWaitcntsPass(
cl::desc("Use new waitcnt insertion pass"),
cl::init(false));
+// Option to run late CFG structurizer
+static cl::opt<bool> LateCFGStructurize(
+ "amdgpu-late-structurize",
+ cl::desc("Enable late CFG structurization"),
+ cl::init(false),
+ cl::Hidden);
+
extern "C" void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -702,11 +709,15 @@ bool GCNPassConfig::addPreISel() {
// Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
// regions formed by them.
addPass(&AMDGPUUnifyDivergentExitNodesID);
- addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
+ if (!LateCFGStructurize) {
+ addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
+ }
addPass(createSinkingPass());
addPass(createSITypeRewriter());
addPass(createAMDGPUAnnotateUniformValues());
- addPass(createSIAnnotateControlFlowPass());
+ if (!LateCFGStructurize) {
+ addPass(createSIAnnotateControlFlowPass());
+ }
return false;
}
@@ -770,6 +781,9 @@ bool GCNPassConfig::addGlobalInstructionSelect() {
#endif
void GCNPassConfig::addPreRegAlloc() {
+ if (LateCFGStructurize) {
+ addPass(createAMDGPUMachineCFGStructurizerPass());
+ }
addPass(createSIWholeQuadModePass());
}
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index c9482c37ec80..beafebc1284a 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -363,13 +363,22 @@ int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
unsigned Index) {
switch (Opcode) {
case Instruction::ExtractElement:
- case Instruction::InsertElement:
+ case Instruction::InsertElement: {
+ unsigned EltSize
+ = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
+ if (EltSize < 32) {
+ if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
+ return 0;
+ return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
+ }
+
// Extracts are just reads of a subregister, so are free. Inserts are
// considered free because we don't want to have any cost for scalarizing
// operations, and we don't have to copy into a different register class.
// Dynamic indexing isn't free and is best avoided.
return Index == ~0u ? 2 : 0;
+ }
default:
return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
}
@@ -479,3 +488,26 @@ bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const {
return false;
}
+
+unsigned AMDGPUTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
+ Type *SubTp) {
+ if (ST->hasVOP3PInsts()) {
+ VectorType *VT = cast<VectorType>(Tp);
+ if (VT->getNumElements() == 2 &&
+ DL.getTypeSizeInBits(VT->getElementType()) == 16) {
+ // With op_sel VOP3P instructions freely can access the low half or high
+ // half of a register, so any swizzle is free.
+
+ switch (Kind) {
+ case TTI::SK_Broadcast:
+ case TTI::SK_Reverse:
+ case TTI::SK_PermuteSingleSrc:
+ return 0;
+ default:
+ break;
+ }
+ }
+ }
+
+ return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+}
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 71d6306bc1a5..e0024e21e82b 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -114,6 +114,9 @@ public:
}
unsigned getVectorSplitCost() { return 0; }
+
+ unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
+ Type *SubTp);
};
} // end namespace llvm
diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt
index 7c0ef4aeac3c..cafce0164fa9 100644
--- a/lib/Target/AMDGPU/CMakeLists.txt
+++ b/lib/Target/AMDGPU/CMakeLists.txt
@@ -48,6 +48,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUISelDAGToDAG.cpp
AMDGPULowerIntrinsics.cpp
AMDGPUMCInstLower.cpp
+ AMDGPUMachineCFGStructurizer.cpp
AMDGPUMachineFunction.cpp
AMDGPUUnifyMetadata.cpp
AMDGPUOpenCLImageTypeLoweringPass.cpp
diff --git a/lib/Target/AMDGPU/FLATInstructions.td b/lib/Target/AMDGPU/FLATInstructions.td
index b0ac0e689a0b..8ba9efd42c70 100644
--- a/lib/Target/AMDGPU/FLATInstructions.td
+++ b/lib/Target/AMDGPU/FLATInstructions.td
@@ -7,7 +7,7 @@
//
//===----------------------------------------------------------------------===//
-def FLATAtomic : ComplexPattern<i64, 3, "SelectFlat">;
+def FLATAtomic : ComplexPattern<i64, 2, "SelectFlat">;
//===----------------------------------------------------------------------===//
// FLAT classes
@@ -62,7 +62,9 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
bits<8> vdst;
bits<1> slc;
bits<1> glc;
- bits<1> tfe;
+
+ // We don't use tfe right now, and it was removed in gfx9.
+ bits<1> tfe = 0;
// 15-0 is reserved.
let Inst{16} = !if(ps.has_glc, glc, ps.glcValue);
@@ -79,8 +81,8 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
class FLAT_Load_Pseudo <string opName, RegisterClass regClass> : FLAT_Pseudo<
opName,
(outs regClass:$vdst),
- (ins VReg_64:$vaddr, GLC:$glc, slc:$slc, tfe:$tfe),
- " $vdst, $vaddr$glc$slc$tfe"> {
+ (ins VReg_64:$vaddr, GLC:$glc, slc:$slc),
+ " $vdst, $vaddr$glc$slc"> {
let has_data = 0;
let mayLoad = 1;
}
@@ -88,8 +90,8 @@ class FLAT_Load_Pseudo <string opName, RegisterClass regClass> : FLAT_Pseudo<
class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass> : FLAT_Pseudo<
opName,
(outs),
- (ins VReg_64:$vaddr, vdataClass:$vdata, GLC:$glc, slc:$slc, tfe:$tfe),
- " $vaddr, $vdata$glc$slc$tfe"> {
+ (ins VReg_64:$vaddr, vdataClass:$vdata, GLC:$glc, slc:$slc),
+ " $vaddr, $vdata$glc$slc"> {
let mayLoad = 0;
let mayStore = 1;
let has_vdst = 0;
@@ -105,8 +107,8 @@ multiclass FLAT_Atomic_Pseudo<
def "" : FLAT_Pseudo <opName,
(outs),
- (ins VReg_64:$vaddr, data_rc:$vdata, slc:$slc, tfe:$tfe),
- " $vaddr, $vdata$slc$tfe",
+ (ins VReg_64:$vaddr, data_rc:$vdata, slc:$slc),
+ " $vaddr, $vdata$slc",
[]>,
AtomicNoRet <NAME, 0> {
let mayLoad = 1;
@@ -119,10 +121,10 @@ multiclass FLAT_Atomic_Pseudo<
def _RTN : FLAT_Pseudo <opName,
(outs vdst_rc:$vdst),
- (ins VReg_64:$vaddr, data_rc:$vdata, slc:$slc, tfe:$tfe),
- " $vdst, $vaddr, $vdata glc$slc$tfe",
+ (ins VReg_64:$vaddr, data_rc:$vdata, slc:$slc),
+ " $vdst, $vaddr, $vdata glc$slc",
[(set vt:$vdst,
- (atomic (FLATAtomic i64:$vaddr, i1:$slc, i1:$tfe), data_vt:$vdata))]>,
+ (atomic (FLATAtomic i64:$vaddr, i1:$slc), data_vt:$vdata))]>,
AtomicNoRet <NAME, 1> {
let mayLoad = 1;
let mayStore = 1;
@@ -311,30 +313,30 @@ def flat_truncstorei16 : flat_st <truncstorei16>;
// Patterns for global loads with no offset.
class FlatLoadPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat <
(vt (node i64:$addr)),
- (inst $addr, 0, 0, 0)
+ (inst $addr, 0, 0)
>;
class FlatLoadAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat <
(vt (node i64:$addr)),
- (inst $addr, 1, 0, 0)
+ (inst $addr, 1, 0)
>;
class FlatStorePat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat <
(node vt:$data, i64:$addr),
- (inst $addr, $data, 0, 0, 0)
+ (inst $addr, $data, 0, 0)
>;
class FlatStoreAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat <
// atomic store follows atomic binop convention so the address comes
// first.
(node i64:$addr, vt:$data),
- (inst $addr, $data, 1, 0, 0)
+ (inst $addr, $data, 1, 0)
>;
class FlatAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
ValueType data_vt = vt> : Pat <
(vt (node i64:$addr, data_vt:$data)),
- (inst $addr, $data, 0, 0)
+ (inst $addr, $data, 0)
>;
let Predicates = [isCIVI] in {
diff --git a/lib/Target/AMDGPU/GCNRegPressure.cpp b/lib/Target/AMDGPU/GCNRegPressure.cpp
index bf16a8216001..8066428fe44a 100644
--- a/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -27,7 +27,7 @@ void llvm::printLivesAt(SlotIndex SI,
unsigned Num = 0;
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
const unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
- if (MRI.reg_nodbg_empty(Reg))
+ if (!LIS.hasInterval(Reg))
continue;
const auto &LI = LIS.getInterval(Reg);
if (LI.hasSubRanges()) {
@@ -131,13 +131,13 @@ bool GCNRegPressure::less(const SISubtarget &ST,
const GCNRegPressure& O,
unsigned MaxOccupancy) const {
const auto SGPROcc = std::min(MaxOccupancy,
- ST.getOccupancyWithNumSGPRs(getSGRPNum()));
+ ST.getOccupancyWithNumSGPRs(getSGPRNum()));
const auto VGPROcc = std::min(MaxOccupancy,
- ST.getOccupancyWithNumVGPRs(getVGRPNum()));
+ ST.getOccupancyWithNumVGPRs(getVGPRNum()));
const auto OtherSGPROcc = std::min(MaxOccupancy,
- ST.getOccupancyWithNumSGPRs(O.getSGRPNum()));
+ ST.getOccupancyWithNumSGPRs(O.getSGPRNum()));
const auto OtherVGPROcc = std::min(MaxOccupancy,
- ST.getOccupancyWithNumVGPRs(O.getVGRPNum()));
+ ST.getOccupancyWithNumVGPRs(O.getVGPRNum()));
const auto Occ = std::min(SGPROcc, VGPROcc);
const auto OtherOcc = std::min(OtherSGPROcc, OtherVGPROcc);
@@ -167,17 +167,17 @@ bool GCNRegPressure::less(const SISubtarget &ST,
return VW < OtherVW;
}
}
- return SGPRImportant ? (getSGRPNum() < O.getSGRPNum()):
- (getVGRPNum() < O.getVGRPNum());
+ return SGPRImportant ? (getSGPRNum() < O.getSGPRNum()):
+ (getVGPRNum() < O.getVGPRNum());
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD
void GCNRegPressure::print(raw_ostream &OS, const SISubtarget *ST) const {
- OS << "VGPRs: " << getVGRPNum();
- if (ST) OS << "(O" << ST->getOccupancyWithNumVGPRs(getVGRPNum()) << ')';
- OS << ", SGPRs: " << getSGRPNum();
- if (ST) OS << "(O" << ST->getOccupancyWithNumSGPRs(getSGRPNum()) << ')';
+ OS << "VGPRs: " << getVGPRNum();
+ if (ST) OS << "(O" << ST->getOccupancyWithNumVGPRs(getVGPRNum()) << ')';
+ OS << ", SGPRs: " << getSGPRNum();
+ if (ST) OS << "(O" << ST->getOccupancyWithNumSGPRs(getSGPRNum()) << ')';
OS << ", LVGPR WT: " << getVGPRTuplesWeight()
<< ", LSGPR WT: " << getSGPRTuplesWeight();
if (ST) OS << " -> Occ: " << getOccupancy(*ST);
@@ -192,7 +192,6 @@ LaneBitmask llvm::getLiveLaneMask(unsigned Reg,
SlotIndex SI,
const LiveIntervals &LIS,
const MachineRegisterInfo &MRI) {
- assert(!MRI.reg_nodbg_empty(Reg));
LaneBitmask LiveMask;
const auto &LI = LIS.getInterval(Reg);
if (LI.hasSubRanges()) {
@@ -214,7 +213,7 @@ GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI,
GCNRPTracker::LiveRegSet LiveRegs;
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
auto Reg = TargetRegisterInfo::index2VirtReg(I);
- if (MRI.reg_nodbg_empty(Reg))
+ if (!LIS.hasInterval(Reg))
continue;
auto LiveMask = getLiveLaneMask(Reg, SI, LIS, MRI);
if (LiveMask.any())
@@ -223,13 +222,7 @@ GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI,
return LiveRegs;
}
-void GCNUpwardRPTracker::reset(const MachineInstr &MI) {
- MRI = &MI.getParent()->getParent()->getRegInfo();
- LiveRegs = getLiveRegsAfter(MI, LIS);
- MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs);
-}
-
-LaneBitmask GCNUpwardRPTracker::getDefRegMask(const MachineOperand &MO) const {
+LaneBitmask GCNRPTracker::getDefRegMask(const MachineOperand &MO) const {
assert(MO.isDef() && MO.isReg() &&
TargetRegisterInfo::isVirtualRegister(MO.getReg()));
@@ -241,7 +234,7 @@ LaneBitmask GCNUpwardRPTracker::getDefRegMask(const MachineOperand &MO) const {
MRI->getTargetRegisterInfo()->getSubRegIndexLaneMask(MO.getSubReg());
}
-LaneBitmask GCNUpwardRPTracker::getUsedRegMask(const MachineOperand &MO) const {
+LaneBitmask GCNRPTracker::getUsedRegMask(const MachineOperand &MO) const {
assert(MO.isUse() && MO.isReg() &&
TargetRegisterInfo::isVirtualRegister(MO.getReg()));
@@ -259,6 +252,18 @@ LaneBitmask GCNUpwardRPTracker::getUsedRegMask(const MachineOperand &MO) const {
return getLiveLaneMask(MO.getReg(), SI, LIS, *MRI);
}
+void GCNUpwardRPTracker::reset(const MachineInstr &MI,
+ const LiveRegSet *LiveRegsCopy) {
+ MRI = &MI.getParent()->getParent()->getRegInfo();
+ if (LiveRegsCopy) {
+ if (&LiveRegs != LiveRegsCopy)
+ LiveRegs = *LiveRegsCopy;
+ } else {
+ LiveRegs = getLiveRegsAfter(MI, LIS);
+ }
+ MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs);
+}
+
void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
assert(MRI && "call reset first");
@@ -297,6 +302,100 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
MaxPressure = max(MaxPressure, CurPressure);
}
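+// Reset the tracker to the point just before MI. Returns false when the
+// block contains no further non-debug instructions.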
+bool GCNDownwardRPTracker::reset(const MachineInstr &MI,
+ const LiveRegSet *LiveRegsCopy) {
+ MRI = &MI.getParent()->getParent()->getRegInfo();
+ LastTrackedMI = nullptr;
+ MBBEnd = MI.getParent()->end();
+ NextMI = &MI;
+ NextMI = skipDebugInstructionsForward(NextMI, MBBEnd);
+ if (NextMI == MBBEnd)
+ return false;
+ if (LiveRegsCopy) {
+ if (&LiveRegs != LiveRegsCopy)
+ LiveRegs = *LiveRegsCopy;
+ } else {
+ LiveRegs = getLiveRegsBefore(*NextMI, LIS);
+ }
+ MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs);
+ return true;
+}
+
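+// Advance the tracked state to the point just before NextMI, removing
+// registers and lane masks that are no longer live there.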
+bool GCNDownwardRPTracker::advanceBeforeNext() {
+ assert(MRI && "call reset first");
+
+ NextMI = skipDebugInstructionsForward(NextMI, MBBEnd);
+ if (NextMI == MBBEnd)
+ return false;
+
+ SlotIndex SI = LIS.getInstructionIndex(*NextMI).getBaseIndex();
+ assert(SI.isValid());
+
+ // Remove dead registers or mask bits.
+ for (auto &It : LiveRegs) {
+ const LiveInterval &LI = LIS.getInterval(It.first);
+ if (LI.hasSubRanges()) {
+ for (const auto &S : LI.subranges()) {
+ if (!S.liveAt(SI)) {
+ auto PrevMask = It.second;
+ It.second &= ~S.LaneMask;
+ CurPressure.inc(It.first, PrevMask, It.second, *MRI);
+ }
+ }
+ } else if (!LI.liveAt(SI)) {
+ auto PrevMask = It.second;
+ It.second = LaneBitmask::getNone();
+ CurPressure.inc(It.first, PrevMask, It.second, *MRI);
+ }
+ if (It.second.none())
+ LiveRegs.erase(It.first);
+ }
+
+ MaxPressure = max(MaxPressure, CurPressure);
+
+ return true;
+}
+
+void GCNDownwardRPTracker::advanceToNext() {
+ LastTrackedMI = &*NextMI++;
+
+ // Add new registers or mask bits.
+ for (const auto &MO : LastTrackedMI->defs()) {
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+ auto &LiveMask = LiveRegs[Reg];
+ auto PrevMask = LiveMask;
+ LiveMask |= getDefRegMask(MO);
+ CurPressure.inc(Reg, PrevMask, LiveMask, *MRI);
+ }
+
+ MaxPressure = max(MaxPressure, CurPressure);
+}
+
+bool GCNDownwardRPTracker::advance() {
+  // If we have just called reset, the live set is already up to date.
+ if ((NextMI == MBBEnd) || (LastTrackedMI && !advanceBeforeNext()))
+ return false;
+ advanceToNext();
+ return true;
+}
+
+bool GCNDownwardRPTracker::advance(MachineBasicBlock::const_iterator End) {
+ while (NextMI != End)
+ if (!advance()) return false;
+ return true;
+}
+
+bool GCNDownwardRPTracker::advance(MachineBasicBlock::const_iterator Begin,
+ MachineBasicBlock::const_iterator End,
+ const LiveRegSet *LiveRegsCopy) {
+ reset(*Begin, LiveRegsCopy);
+ return advance(End);
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD
static void reportMismatch(const GCNRPTracker::LiveRegSet &LISLR,
@@ -352,4 +451,16 @@ bool GCNUpwardRPTracker::isValid() const {
return true;
}
+void GCNRPTracker::printLiveRegs(raw_ostream &OS, const LiveRegSet& LiveRegs,
+ const MachineRegisterInfo &MRI) {
+ const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
+ for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
+ auto It = LiveRegs.find(Reg);
+ if (It != LiveRegs.end() && It->second.any())
+ OS << ' ' << PrintVRegOrUnit(Reg, TRI) << ':'
+ << PrintLaneMask(It->second);
+ }
+ OS << '\n';
+}
#endif
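
For reference, the upward tracker keeps its existing bottom-up protocol; a minimal sketch of walking one block with it (illustrative only, assumes MBB is non-empty and LIS is in scope):

  GCNUpwardRPTracker RPT(LIS);
  RPT.reset(MBB.back());            // live set just below the last instruction
  for (auto I = MBB.rbegin(), E = MBB.rend(); I != E; ++I)
    RPT.recede(*I);                 // move to the state just above *I
  GCNRegPressure MaxRP = RPT.moveMaxPressure();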
diff --git a/lib/Target/AMDGPU/GCNRegPressure.h b/lib/Target/AMDGPU/GCNRegPressure.h
index 82e76a7bfddc..9875ca6a6d16 100644
--- a/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/lib/Target/AMDGPU/GCNRegPressure.h
@@ -33,19 +33,19 @@ struct GCNRegPressure {
clear();
}
- bool empty() const { return getSGRPNum() == 0 && getVGRPNum() == 0; }
+ bool empty() const { return getSGPRNum() == 0 && getVGPRNum() == 0; }
void clear() { std::fill(&Value[0], &Value[TOTAL_KINDS], 0); }
- unsigned getSGRPNum() const { return Value[SGPR32]; }
- unsigned getVGRPNum() const { return Value[VGPR32]; }
+ unsigned getSGPRNum() const { return Value[SGPR32]; }
+ unsigned getVGPRNum() const { return Value[VGPR32]; }
unsigned getVGPRTuplesWeight() const { return Value[VGPR_TUPLE]; }
unsigned getSGPRTuplesWeight() const { return Value[SGPR_TUPLE]; }
unsigned getOccupancy(const SISubtarget &ST) const {
- return std::min(ST.getOccupancyWithNumSGPRs(getSGRPNum()),
- ST.getOccupancyWithNumVGPRs(getVGRPNum()));
+ return std::min(ST.getOccupancyWithNumSGPRs(getSGPRNum()),
+ ST.getOccupancyWithNumVGPRs(getVGPRNum()));
}
void inc(unsigned Reg,
@@ -92,16 +92,21 @@ public:
typedef DenseMap<unsigned, LaneBitmask> LiveRegSet;
protected:
+ const LiveIntervals &LIS;
LiveRegSet LiveRegs;
GCNRegPressure CurPressure, MaxPressure;
const MachineInstr *LastTrackedMI = nullptr;
mutable const MachineRegisterInfo *MRI = nullptr;
- GCNRPTracker() {}
+ GCNRPTracker(const LiveIntervals &LIS_) : LIS(LIS_) {}
+ LaneBitmask getDefRegMask(const MachineOperand &MO) const;
+ LaneBitmask getUsedRegMask(const MachineOperand &MO) const;
public:
// live regs for the current state
const decltype(LiveRegs) &getLiveRegs() const { return LiveRegs; }
const MachineInstr *getLastTrackedMI() const { return LastTrackedMI; }
+ void clearMaxPressure() { MaxPressure.clear(); }
+
// returns MaxPressure, resetting it
decltype(MaxPressure) moveMaxPressure() {
auto Res = MaxPressure;
@@ -111,17 +116,16 @@ public:
decltype(LiveRegs) moveLiveRegs() {
return std::move(LiveRegs);
}
+ static void printLiveRegs(raw_ostream &OS, const LiveRegSet& LiveRegs,
+ const MachineRegisterInfo &MRI);
};
class GCNUpwardRPTracker : public GCNRPTracker {
- const LiveIntervals &LIS;
- LaneBitmask getDefRegMask(const MachineOperand &MO) const;
- LaneBitmask getUsedRegMask(const MachineOperand &MO) const;
public:
- GCNUpwardRPTracker(const LiveIntervals &LIS_) : LIS(LIS_) {}
+ GCNUpwardRPTracker(const LiveIntervals &LIS_) : GCNRPTracker(LIS_) {}
// Reset tracker to the point just below MI,
// filling the live regs at this point using LIS.
- void reset(const MachineInstr &MI);
+ void reset(const MachineInstr &MI, const LiveRegSet *LiveRegs = nullptr);
// move to the state just above the MI
void recede(const MachineInstr &MI);
@@ -131,6 +135,41 @@ public:
bool isValid() const;
};
+class GCNDownwardRPTracker : public GCNRPTracker {
+ // Next instruction to process; set by reset and advanceBeforeNext.
+ MachineBasicBlock::const_iterator NextMI;
+
+ MachineBasicBlock::const_iterator MBBEnd;
+
+public:
+ GCNDownwardRPTracker(const LiveIntervals &LIS_) : GCNRPTracker(LIS_) {}
+
+ MachineBasicBlock::const_iterator getNext() const { return NextMI; }
+
+ // Reset tracker to the point before MI, filling the live regs at this
+ // point using LIS. Returns false if the block contains nothing but
+ // debug values.
+ bool reset(const MachineInstr &MI, const LiveRegSet *LiveRegs = nullptr);
+
+ // Move to the state just before the next MI. Returns false if the end
+ // of the block is reached.
+ bool advanceBeforeNext();
+
+ // Move to the state at MI; advanceBeforeNext has to be called first.
+ void advanceToNext();
+
+ // Move to the state at the next MI. Returns false if the end of the
+ // block is reached.
+ bool advance();
+
+ // Advance instructions until just before End.
+ bool advance(MachineBasicBlock::const_iterator End);
+
+ // Reset to Begin and advance to End.
+ bool advance(MachineBasicBlock::const_iterator Begin,
+ MachineBasicBlock::const_iterator End,
+ const LiveRegSet *LiveRegsCopy = nullptr);
+};
+
LaneBitmask getLiveLaneMask(unsigned Reg,
SlotIndex SI,
const LiveIntervals &LIS,
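
A usage sketch for the new downward tracker (computeRangePressure is a hypothetical helper, but the calls mirror the API declared above; getRealRegPressure() in GCNSchedStrategy.cpp does essentially this):

  static GCNRegPressure
  computeRangePressure(const LiveIntervals &LIS,
                       MachineBasicBlock::const_iterator Begin,
                       MachineBasicBlock::const_iterator End) {
    GCNDownwardRPTracker RPT(LIS);
    if (!RPT.reset(*Begin))          // false: nothing but debug values
      return GCNRegPressure();
    RPT.advance(End);                // walk [Begin, End), tracking live lanes
    return RPT.moveMaxPressure();    // read and reset the running maximum
  }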
diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 630442625aa3..8ec46665daf5 100644
--- a/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -316,46 +316,57 @@ GCNScheduleDAGMILive::GCNScheduleDAGMILive(MachineSchedContext *C,
MFI(*MF.getInfo<SIMachineFunctionInfo>()),
StartingOccupancy(ST.getOccupancyWithLocalMemSize(MFI.getLDSSize(),
*MF.getFunction())),
- MinOccupancy(StartingOccupancy), Stage(0) {
+ MinOccupancy(StartingOccupancy), Stage(0), RegionIdx(0) {
DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
}
void GCNScheduleDAGMILive::schedule() {
+ if (Stage == 0) {
+ // Just record regions on the first pass.
+ Regions.push_back(std::make_pair(RegionBegin, RegionEnd));
+ return;
+ }
+
std::vector<MachineInstr*> Unsched;
Unsched.reserve(NumRegionInstrs);
for (auto &I : *this)
Unsched.push_back(&I);
- std::pair<unsigned, unsigned> PressureBefore;
+ GCNRegPressure PressureBefore;
if (LIS) {
- DEBUG(dbgs() << "Pressure before scheduling:\n");
- discoverLiveIns();
- PressureBefore = getRealRegPressure();
+ PressureBefore = Pressure[RegionIdx];
+
+ DEBUG(dbgs() << "Pressure before scheduling:\nRegion live-ins:";
+ GCNRPTracker::printLiveRegs(dbgs(), LiveIns[RegionIdx], MRI);
+ dbgs() << "Region live-in pressure: ";
+ llvm::getRegPressure(MRI, LiveIns[RegionIdx]).print(dbgs());
+ dbgs() << "Region register pressure: ";
+ PressureBefore.print(dbgs()));
}
ScheduleDAGMILive::schedule();
- if (Stage == 0)
- Regions.push_back(std::make_pair(RegionBegin, RegionEnd));
+ Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
if (!LIS)
return;
// Check the results of scheduling.
GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
- DEBUG(dbgs() << "Pressure after scheduling:\n");
auto PressureAfter = getRealRegPressure();
- LiveIns.clear();
- if (PressureAfter.first <= S.SGPRCriticalLimit &&
- PressureAfter.second <= S.VGPRCriticalLimit) {
+ DEBUG(dbgs() << "Pressure after scheduling: "; PressureAfter.print(dbgs()));
+
+ if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
+ PressureAfter.getVGPRNum() <= S.VGPRCriticalLimit) {
+ Pressure[RegionIdx] = PressureAfter;
DEBUG(dbgs() << "Pressure in desired limits, done.\n");
return;
}
- unsigned WavesAfter = getMaxWaves(PressureAfter.first,
- PressureAfter.second, MF);
- unsigned WavesBefore = getMaxWaves(PressureBefore.first,
- PressureBefore.second, MF);
+ unsigned WavesAfter = getMaxWaves(PressureAfter.getSGPRNum(),
+ PressureAfter.getVGPRNum(), MF);
+ unsigned WavesBefore = getMaxWaves(PressureBefore.getSGPRNum(),
+ PressureBefore.getVGPRNum(), MF);
DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore <<
", after " << WavesAfter << ".\n");
@@ -368,8 +379,10 @@ void GCNScheduleDAGMILive::schedule() {
<< MinOccupancy << ".\n");
}
- if (WavesAfter >= WavesBefore)
+ if (WavesAfter >= WavesBefore) {
+ Pressure[RegionIdx] = PressureAfter;
return;
+ }
DEBUG(dbgs() << "Attempting to revert scheduling.\n");
RegionEnd = RegionBegin;
@@ -398,166 +411,139 @@ void GCNScheduleDAGMILive::schedule() {
DEBUG(dbgs() << "Scheduling " << *MI);
}
RegionBegin = Unsched.front()->getIterator();
- if (Stage == 0)
- Regions.back() = std::make_pair(RegionBegin, RegionEnd);
+ Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
placeDebugValues();
}
-static inline void setMask(const MachineRegisterInfo &MRI,
- const SIRegisterInfo *SRI, unsigned Reg,
- LaneBitmask &PrevMask, LaneBitmask NewMask,
- unsigned &SGPRs, unsigned &VGPRs) {
- int NewRegs = countPopulation(NewMask.getAsInteger()) -
- countPopulation(PrevMask.getAsInteger());
- if (SRI->isSGPRReg(MRI, Reg))
- SGPRs += NewRegs;
- if (SRI->isVGPR(MRI, Reg))
- VGPRs += NewRegs;
- assert ((int)SGPRs >= 0 && (int)VGPRs >= 0);
- PrevMask = NewMask;
+GCNRegPressure GCNScheduleDAGMILive::getRealRegPressure() const {
+ GCNDownwardRPTracker RPTracker(*LIS);
+ RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]);
+ return RPTracker.moveMaxPressure();
}
-void GCNScheduleDAGMILive::discoverLiveIns() {
- unsigned SGPRs = 0;
- unsigned VGPRs = 0;
+void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) {
+ GCNDownwardRPTracker RPTracker(*LIS);
+
+ // If the block has only one successor then the live-ins of that successor
+ // are the live-outs of the current block. We can reuse the calculated live
+ // set if the successor will be sent to scheduling after the current block.
+ const MachineBasicBlock *OnlySucc = nullptr;
+ if (MBB->succ_size() == 1 && !(*MBB->succ_begin())->empty()) {
+ SlotIndexes *Ind = LIS->getSlotIndexes();
+ if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(*MBB->succ_begin()))
+ OnlySucc = *MBB->succ_begin();
+ }
- auto &MI = *begin()->getParent()->getFirstNonDebugInstr();
- const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
- SlotIndex SI = LIS->getInstructionIndex(MI).getBaseIndex();
- assert (SI.isValid());
-
- DEBUG(dbgs() << "Region live-ins:");
- for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
- unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
- if (MRI.reg_nodbg_empty(Reg))
- continue;
- const LiveInterval &LI = LIS->getInterval(Reg);
- LaneBitmask LaneMask = LaneBitmask::getNone();
- if (LI.hasSubRanges()) {
- for (const auto &S : LI.subranges())
- if (S.liveAt(SI))
- LaneMask |= S.LaneMask;
- } else if (LI.liveAt(SI)) {
- LaneMask = MRI.getMaxLaneMaskForVReg(Reg);
- }
+ // Scheduler sends regions from the end of the block upwards.
+ size_t CurRegion = RegionIdx;
+ for (size_t E = Regions.size(); CurRegion != E; ++CurRegion)
+ if (Regions[CurRegion].first->getParent() != MBB)
+ break;
+ --CurRegion;
+
+ auto I = MBB->begin();
+ auto LiveInIt = MBBLiveIns.find(MBB);
+ if (LiveInIt != MBBLiveIns.end()) {
+ auto LiveIn = std::move(LiveInIt->second);
+ RPTracker.reset(*MBB->begin(), &LiveIn);
+ MBBLiveIns.erase(LiveInIt);
+ } else {
+ I = Regions[CurRegion].first;
+ RPTracker.reset(*I);
+ }
- if (LaneMask.any()) {
- setMask(MRI, SRI, Reg, LiveIns[Reg], LaneMask, SGPRs, VGPRs);
+ for (;;) {
+ I = RPTracker.getNext();
- DEBUG(dbgs() << ' ' << PrintVRegOrUnit(Reg, SRI) << ':'
- << PrintLaneMask(LiveIns[Reg]));
+ if (Regions[CurRegion].first == I) {
+ LiveIns[CurRegion] = RPTracker.getLiveRegs();
+ RPTracker.clearMaxPressure();
}
- }
- LiveInPressure = std::make_pair(SGPRs, VGPRs);
+ if (Regions[CurRegion].second == I) {
+ Pressure[CurRegion] = RPTracker.moveMaxPressure();
+ if (CurRegion-- == RegionIdx)
+ break;
+ }
+ RPTracker.advanceToNext();
+ RPTracker.advanceBeforeNext();
+ }
- DEBUG(dbgs() << "\nLive-in pressure:\nSGPR = " << SGPRs
- << "\nVGPR = " << VGPRs << '\n');
+ if (OnlySucc) {
+ if (I != MBB->end()) {
+ RPTracker.advanceToNext();
+ RPTracker.advance(MBB->end());
+ }
+ RPTracker.reset(*OnlySucc->begin(), &RPTracker.getLiveRegs());
+ RPTracker.advanceBeforeNext();
+ MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();
+ }
}
-std::pair<unsigned, unsigned>
-GCNScheduleDAGMILive::getRealRegPressure() const {
- unsigned SGPRs, MaxSGPRs, VGPRs, MaxVGPRs;
- SGPRs = MaxSGPRs = LiveInPressure.first;
- VGPRs = MaxVGPRs = LiveInPressure.second;
-
- const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
- DenseMap<unsigned, LaneBitmask> LiveRegs(LiveIns);
+void GCNScheduleDAGMILive::finalizeSchedule() {
+ GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
+ DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
- for (const MachineInstr &MI : *this) {
- if (MI.isDebugValue())
- continue;
- SlotIndex SI = LIS->getInstructionIndex(MI).getBaseIndex();
- assert (SI.isValid());
+ LiveIns.resize(Regions.size());
+ Pressure.resize(Regions.size());
- // Remove dead registers or mask bits.
- for (auto &It : LiveRegs) {
- if (It.second.none())
- continue;
- const LiveInterval &LI = LIS->getInterval(It.first);
- if (LI.hasSubRanges()) {
- for (const auto &S : LI.subranges())
- if (!S.liveAt(SI))
- setMask(MRI, SRI, It.first, It.second, It.second & ~S.LaneMask,
- SGPRs, VGPRs);
- } else if (!LI.liveAt(SI)) {
- setMask(MRI, SRI, It.first, It.second, LaneBitmask::getNone(),
- SGPRs, VGPRs);
- }
- }
+ do {
+ Stage++;
+ RegionIdx = 0;
+ MachineBasicBlock *MBB = nullptr;
- // Add new registers or mask bits.
- for (const auto &MO : MI.defs()) {
- if (!MO.isReg())
- continue;
- unsigned Reg = MO.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
- continue;
- unsigned SubRegIdx = MO.getSubReg();
- LaneBitmask LaneMask = SubRegIdx != 0
- ? TRI->getSubRegIndexLaneMask(SubRegIdx)
- : MRI.getMaxLaneMaskForVReg(Reg);
- LaneBitmask &LM = LiveRegs[Reg];
- setMask(MRI, SRI, Reg, LM, LM | LaneMask, SGPRs, VGPRs);
- }
- MaxSGPRs = std::max(MaxSGPRs, SGPRs);
- MaxVGPRs = std::max(MaxVGPRs, VGPRs);
- }
+ if (Stage > 1) {
+ // Retry function scheduling if the resulting occupancy is lower than the
+ // one used for the first-pass scheduling. This gives more freedom to
+ // schedule low register pressure blocks.
+ // Code is partially copied from MachineSchedulerBase::scheduleRegions().
- DEBUG(dbgs() << "Real region's register pressure:\nSGPR = " << MaxSGPRs
- << "\nVGPR = " << MaxVGPRs << '\n');
+ if (!LIS || StartingOccupancy <= MinOccupancy)
+ break;
- return std::make_pair(MaxSGPRs, MaxVGPRs);
-}
+ DEBUG(dbgs()
+ << "Retrying function scheduling with lowest recorded occupancy "
+ << MinOccupancy << ".\n");
-void GCNScheduleDAGMILive::finalizeSchedule() {
- // Retry function scheduling if we found resulting occupancy and it is
- // lower than used for first pass scheduling. This will give more freedom
- // to schedule low register pressure blocks.
- // Code is partially copied from MachineSchedulerBase::scheduleRegions().
+ S.setTargetOccupancy(MinOccupancy);
+ }
- if (!LIS || StartingOccupancy <= MinOccupancy)
- return;
+ for (auto Region : Regions) {
+ RegionBegin = Region.first;
+ RegionEnd = Region.second;
- DEBUG(dbgs() << "Retrying function scheduling with lowest recorded occupancy "
- << MinOccupancy << ".\n");
+ if (RegionBegin->getParent() != MBB) {
+ if (MBB) finishBlock();
+ MBB = RegionBegin->getParent();
+ startBlock(MBB);
+ if (Stage == 1)
+ computeBlockPressure(MBB);
+ }
- Stage++;
- GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
- S.setTargetOccupancy(MinOccupancy);
+ unsigned NumRegionInstrs = std::distance(begin(), end());
+ enterRegion(MBB, begin(), end(), NumRegionInstrs);
- MachineBasicBlock *MBB = nullptr;
- for (auto Region : Regions) {
- RegionBegin = Region.first;
- RegionEnd = Region.second;
+ // Skip empty scheduling regions (0 or 1 schedulable instructions).
+ if (begin() == end() || begin() == std::prev(end())) {
+ exitRegion();
+ continue;
+ }
- if (RegionBegin->getParent() != MBB) {
- if (MBB) finishBlock();
- MBB = RegionBegin->getParent();
- startBlock(MBB);
- }
+ DEBUG(dbgs() << "********** MI Scheduling **********\n");
+ DEBUG(dbgs() << MF.getName()
+ << ":BB#" << MBB->getNumber() << " " << MBB->getName()
+ << "\n From: " << *begin() << " To: ";
+ if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
+ else dbgs() << "End";
+ dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
- unsigned NumRegionInstrs = std::distance(begin(), end());
- enterRegion(MBB, begin(), end(), NumRegionInstrs);
+ schedule();
- // Skip empty scheduling regions (0 or 1 schedulable instructions).
- if (begin() == end() || begin() == std::prev(end())) {
exitRegion();
- continue;
+ ++RegionIdx;
}
- DEBUG(dbgs() << "********** MI Scheduling **********\n");
- DEBUG(dbgs() << MF.getName()
- << ":BB#" << MBB->getNumber() << " " << MBB->getName()
- << "\n From: " << *begin() << " To: ";
- if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
- else dbgs() << "End";
- dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
+ finishBlock();
- schedule();
-
- exitRegion();
- }
- finishBlock();
- LiveIns.shrink_and_clear();
+ } while (Stage < 2);
}
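
Condensed, finalizeSchedule() above amounts to the following two-stage loop (a sketch with simplified control flow; scheduleRegion is a hypothetical stand-in for the enterRegion/schedule/exitRegion sequence):

  for (Stage = 1; Stage <= 2; ++Stage) {
    if (Stage == 2) {
      // A second pass pays off only if the first one found an occupancy
      // below the starting one; reschedule targeting that occupancy.
      if (!LIS || StartingOccupancy <= MinOccupancy)
        break;
      S.setTargetOccupancy(MinOccupancy);
    }
    for (RegionIdx = 0; RegionIdx < Regions.size(); ++RegionIdx)
      scheduleRegion(Regions[RegionIdx]);  // records pressure per region
  }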
diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.h b/lib/Target/AMDGPU/GCNSchedStrategy.h
index 15af232704ff..3ed3cd5b3b1c 100644
--- a/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -14,6 +14,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H
#define LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H
+#include "GCNRegPressure.h"
#include "llvm/CodeGen/MachineScheduler.h"
namespace llvm {
@@ -74,21 +75,28 @@ class GCNScheduleDAGMILive : public ScheduleDAGMILive {
// Scheduling stage number.
unsigned Stage;
+ // Current region index.
+ size_t RegionIdx;
+
// Vector of regions recorded for later rescheduling.
SmallVector<std::pair<MachineBasicBlock::iterator,
MachineBasicBlock::iterator>, 32> Regions;
- // Region live-ins.
- DenseMap<unsigned, LaneBitmask> LiveIns;
+ // Region live-in cache.
+ SmallVector<GCNRPTracker::LiveRegSet, 32> LiveIns;
+
+ // Region pressure cache.
+ SmallVector<GCNRegPressure, 32> Pressure;
+
+ // Temporary basic block live-in cache.
+ DenseMap<const MachineBasicBlock*, GCNRPTracker::LiveRegSet> MBBLiveIns;
- // Number of live-ins to the current region, first SGPR then VGPR.
- std::pair<unsigned, unsigned> LiveInPressure;
+ // Return current region pressure.
+ GCNRegPressure getRealRegPressure() const;
- // Collect current region live-ins.
- void discoverLiveIns();
+ // Compute and cache live-ins and pressure for all regions in block.
+ void computeBlockPressure(const MachineBasicBlock *MBB);
- // Return current region pressure. First value is SGPR number, second is VGPR.
- std::pair<unsigned, unsigned> getRealRegPressure() const;
public:
GCNScheduleDAGMILive(MachineSchedContext *C,
diff --git a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index d8cb98fe1b19..8cb35c506135 100644
--- a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -126,7 +126,7 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) {
Void = Type::getVoidTy(Context);
Boolean = Type::getInt1Ty(Context);
Int64 = Type::getInt64Ty(Context);
- ReturnStruct = StructType::get(Boolean, Int64, (Type *)nullptr);
+ ReturnStruct = StructType::get(Boolean, Int64);
BoolTrue = ConstantInt::getTrue(Context);
BoolFalse = ConstantInt::getFalse(Context);
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index cc93c27731ff..48a14e4dbea2 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -488,6 +488,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::FCANONICALIZE);
setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
setTargetDAGCombine(ISD::ZERO_EXTEND);
+ setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
// All memory operations. Some folding on the pointer operand is done to help
// matching the constant offsets in the addressing modes.
@@ -2003,6 +2004,7 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
break;
}
assert(Found);
+ (void)Found;
// This should be before all vector instructions.
BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
@@ -4604,6 +4606,24 @@ SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
return SDValue();
}
+SDValue SITargetLowering::performExtractVectorEltCombine(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+ SDValue Vec = N->getOperand(0);
+
+ SelectionDAG &DAG = DCI.DAG;
+ if (Vec.getOpcode() == ISD::FNEG && allUsesHaveSourceMods(N)) {
+ SDLoc SL(N);
+ EVT EltVT = N->getValueType(0);
+ SDValue Idx = N->getOperand(1);
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
+ Vec.getOperand(0), Idx);
+ return DAG.getNode(ISD::FNEG, SL, EltVT, Elt);
+ }
+
+ return SDValue();
+}
+
unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
const SDNode *N0,
const SDNode *N1) const {
@@ -4891,6 +4911,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
break;
}
+ case ISD::EXTRACT_VECTOR_ELT:
+ return performExtractVectorEltCombine(N, DCI);
}
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
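
The combine implements the rewrite below; it is profitable because allUsesHaveSourceMods(N) guarantees every user of the extracted element can fold the fneg into a source modifier, so the negation becomes free:

  (extract_vector_elt (fneg v), i)  -->  (fneg (extract_vector_elt v, i))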
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index d177777ad5ee..046e677756d1 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -100,6 +100,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
unsigned getFusedOpcode(const SelectionDAG &DAG,
const SDNode *N0, const SDNode *N1) const;
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index 92e452a3d6a0..065fd09eb356 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -496,6 +496,188 @@ int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
return Opcode;
}
+void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const DebugLoc &DL, unsigned DestReg,
+ int64_t Value) const {
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
+ if (RegClass == &AMDGPU::SReg_32RegClass ||
+ RegClass == &AMDGPU::SGPR_32RegClass ||
+ RegClass == &AMDGPU::SReg_32_XM0RegClass ||
+ RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
+ .addImm(Value);
+ return;
+ }
+
+ if (RegClass == &AMDGPU::SReg_64RegClass ||
+ RegClass == &AMDGPU::SGPR_64RegClass ||
+ RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
+ .addImm(Value);
+ return;
+ }
+
+ if (RegClass == &AMDGPU::VGPR_32RegClass) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
+ .addImm(Value);
+ return;
+ }
+ if (RegClass == &AMDGPU::VReg_64RegClass) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
+ .addImm(Value);
+ return;
+ }
+
+ unsigned EltSize = 4;
+ unsigned Opcode = AMDGPU::V_MOV_B32_e32;
+ if (RI.isSGPRClass(RegClass)) {
+ if (RI.getRegSizeInBits(*RegClass) > 32) {
+ Opcode = AMDGPU::S_MOV_B64;
+ EltSize = 8;
+ } else {
+ Opcode = AMDGPU::S_MOV_B32;
+ EltSize = 4;
+ }
+ }
+
+ ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
+ for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
+ int64_t IdxValue = Idx == 0 ? Value : 0;
+
+ MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
+ get(Opcode), RI.getSubReg(DestReg, Idx));
+ Builder.addImm(IdxValue);
+ }
+}
+
+const TargetRegisterClass *
+SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
+ return &AMDGPU::VGPR_32RegClass;
+}
+
+void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DstReg,
+ ArrayRef<MachineOperand> Cond,
+ unsigned TrueReg,
+ unsigned FalseReg) const {
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
+ "Not a VGPR32 reg");
+
+ if (Cond.size() == 1) {
+ BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addReg(FalseReg)
+ .addReg(TrueReg)
+ .add(Cond[0]);
+ } else if (Cond.size() == 2) {
+ assert(Cond[0].isImm() && "Cond[0] is not an immediate");
+ switch (Cond[0].getImm()) {
+ case SIInstrInfo::SCC_TRUE: {
+ unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
+ .addImm(-1)
+ .addImm(0);
+ BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addReg(FalseReg)
+ .addReg(TrueReg)
+ .addReg(SReg);
+ break;
+ }
+ case SIInstrInfo::SCC_FALSE: {
+ unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
+ .addImm(0)
+ .addImm(-1);
+ BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addReg(FalseReg)
+ .addReg(TrueReg)
+ .addReg(SReg);
+ break;
+ }
+ case SIInstrInfo::VCCNZ: {
+ MachineOperand RegOp = Cond[1];
+ RegOp.setImplicit(false);
+ BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addReg(FalseReg)
+ .addReg(TrueReg)
+ .add(RegOp);
+ break;
+ }
+ case SIInstrInfo::VCCZ: {
+ MachineOperand RegOp = Cond[1];
+ RegOp.setImplicit(false);
+ BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addReg(TrueReg)
+ .addReg(FalseReg)
+ .add(RegOp);
+ break;
+ }
+ case SIInstrInfo::EXECNZ: {
+ unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
+ .addImm(0);
+ BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
+ .addImm(-1)
+ .addImm(0);
+ BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addReg(FalseReg)
+ .addReg(TrueReg)
+ .addReg(SReg);
+ break;
+ }
+ case SIInstrInfo::EXECZ: {
+ unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
+ .addImm(0);
+ BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
+ .addImm(0)
+ .addImm(-1);
+ BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addReg(FalseReg)
+ .addReg(TrueReg)
+ .addReg(SReg);
+ llvm_unreachable("Unhandled branch predicate EXECZ");
+ break;
+ }
+ default:
+ llvm_unreachable("invalid branch predicate");
+ }
+ } else {
+ llvm_unreachable("Can only handle Cond size 1 or 2");
+ }
+}
+
+unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL,
+ unsigned SrcReg, int Value) const {
+ MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+ unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
+ .addImm(Value)
+ .addReg(SrcReg);
+
+ return Reg;
+}
+
+unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL,
+ unsigned SrcReg, int Value) const {
+ MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+ unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
+ .addImm(Value)
+ .addReg(SrcReg);
+
+ return Reg;
+}
+
unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
if (RI.getRegSizeInBits(*DstRC) == 32) {
@@ -834,6 +1016,20 @@ void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
insertWaitStates(MBB, MI, 1);
}
+void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
+ auto MF = MBB.getParent();
+ SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
+
+ assert(Info->isEntryFunction());
+
+ if (MBB.succ_empty()) {
+ bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
+ if (HasNoTerminator)
+ BuildMI(MBB, MBB.end(), DebugLoc(),
+ get(Info->returnsVoid() ? AMDGPU::S_ENDPGM : AMDGPU::SI_RETURN_TO_EPILOG));
+ }
+}
+
unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const {
switch (MI.getOpcode()) {
default: return 1; // FIXME: Do wait states equal cycles?
@@ -1241,14 +1437,20 @@ bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
return false;
}
- BranchPredicate Pred = getBranchPredicate(I->getOpcode());
- if (Pred == INVALID_BR)
- return true;
+ MachineBasicBlock *CondBB = nullptr;
- MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
- Cond.push_back(MachineOperand::CreateImm(Pred));
- Cond.push_back(I->getOperand(1)); // Save the branch register.
+ if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
+ CondBB = I->getOperand(1).getMBB();
+ Cond.push_back(I->getOperand(0));
+ } else {
+ BranchPredicate Pred = getBranchPredicate(I->getOpcode());
+ if (Pred == INVALID_BR)
+ return true;
+ CondBB = I->getOperand(0).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(Pred));
+ Cond.push_back(I->getOperand(1)); // Save the branch register.
+ }
++I;
if (I == MBB.end()) {
@@ -1351,6 +1553,13 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
return 1;
}
+ if (Cond.size() == 1 && Cond[0].isReg()) {
+ BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO))
+ .add(Cond[0])
+ .addMBB(TBB);
+ return 1;
+ }
+
assert(TBB && Cond[0].isImm());
unsigned Opcode
@@ -1390,9 +1599,16 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
bool SIInstrInfo::reverseBranchCondition(
SmallVectorImpl<MachineOperand> &Cond) const {
- assert(Cond.size() == 2);
- Cond[0].setImm(-Cond[0].getImm());
- return false;
+ if (Cond.size() != 2) {
+ return true;
+ }
+
+ if (Cond[0].isImm()) {
+ Cond[0].setImm(-Cond[0].getImm());
+ return false;
+ }
+
+ return true;
}
bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
@@ -3920,6 +4136,82 @@ bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
return false;
}
+bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const {
+ return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO;
+}
+
+void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry,
+ MachineBasicBlock *IfEnd) const {
+ MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator();
+ assert(TI != IfEntry->end());
+
+ MachineInstr *Branch = &(*TI);
+ MachineFunction *MF = IfEntry->getParent();
+ MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo();
+
+ if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
+ unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ MachineInstr *SIIF =
+ BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg)
+ .add(Branch->getOperand(0))
+ .add(Branch->getOperand(1));
+ MachineInstr *SIEND =
+ BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF))
+ .addReg(DstReg);
+
+ IfEntry->erase(TI);
+ IfEntry->insert(IfEntry->end(), SIIF);
+ IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND);
+ }
+}
+
+void SIInstrInfo::convertNonUniformLoopRegion(
+ MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const {
+ MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator();
+ // We expect 2 terminators, one conditional and one unconditional.
+ assert(TI != LoopEnd->end());
+
+ MachineInstr *Branch = &(*TI);
+ MachineFunction *MF = LoopEnd->getParent();
+ MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo();
+
+ if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
+
+ unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned BackEdgeReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ MachineInstrBuilder HeaderPHIBuilder =
+ BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg);
+ for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(),
+ E = LoopEntry->pred_end();
+ PI != E; ++PI) {
+ if (*PI == LoopEnd) {
+ HeaderPHIBuilder.addReg(BackEdgeReg);
+ } else {
+ MachineBasicBlock *PMBB = *PI;
+ unsigned ZeroReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(),
+ ZeroReg, 0);
+ HeaderPHIBuilder.addReg(ZeroReg);
+ }
+ HeaderPHIBuilder.addMBB(*PI);
+ }
+ MachineInstr *HeaderPhi = HeaderPHIBuilder;
+ MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(),
+ get(AMDGPU::SI_IF_BREAK), BackEdgeReg)
+ .addReg(DstReg)
+ .add(Branch->getOperand(0));
+ MachineInstr *SILOOP =
+ BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP))
+ .addReg(BackEdgeReg)
+ .addMBB(LoopEntry);
+
+ LoopEntry->insert(LoopEntry->begin(), HeaderPhi);
+ LoopEnd->erase(TI);
+ LoopEnd->insert(LoopEnd->end(), SIIFBREAK);
+ LoopEnd->insert(LoopEnd->end(), SILOOP);
+ }
+}
+
ArrayRef<std::pair<int, const char *>>
SIInstrInfo::getSerializableTargetIndices() const {
static const std::pair<int, const char *> TargetIndices[] = {
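
A sketch of the new materializeImmediate() from a caller's point of view (hypothetical call site; the register class is chosen to show the sub-register split):

  // No single-instruction move exists for a 128-bit SGPR tuple, so
  // materializeImmediate() emits one S_MOV_B64 per 64-bit sub-register,
  // placing Value in the first part and zero in the rest, i.e. the
  // immediate zero-extended across the tuple.
  unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
  TII->materializeImmediate(MBB, MBB.getFirstTerminator(), DebugLoc(), Reg, 1);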
diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h
index 03a5ef74b179..f6e5e8883f63 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@@ -143,6 +143,23 @@ public:
RegScavenger *RS, unsigned TmpReg,
unsigned Offset, unsigned Size) const;
+ void materializeImmediate(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const DebugLoc &DL,
+ unsigned DestReg,
+ int64_t Value) const;
+
+ const TargetRegisterClass *getPreferredSelectRegClass(
+ unsigned Size) const;
+
+ unsigned insertNE(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I, const DebugLoc &DL,
+ unsigned SrcReg, int Value) const;
+
+ unsigned insertEQ(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I, const DebugLoc &DL,
+ unsigned SrcReg, int Value) const;
+
void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI, unsigned SrcReg,
bool isKill, int FrameIndex,
@@ -193,7 +210,7 @@ public:
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
- bool AllowModify) const override;
+ bool AllowModify = false) const override;
unsigned removeBranch(MachineBasicBlock &MBB,
int *BytesRemoved = nullptr) const override;
@@ -218,6 +235,11 @@ public:
unsigned DstReg, ArrayRef<MachineOperand> Cond,
unsigned TrueReg, unsigned FalseReg) const override;
+ void insertVectorSelect(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, const DebugLoc &DL,
+ unsigned DstReg, ArrayRef<MachineOperand> Cond,
+ unsigned TrueReg, unsigned FalseReg) const;
+
bool
areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
AliasAnalysis *AA = nullptr) const override;
@@ -705,6 +727,7 @@ public:
void insertNoop(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI) const override;
+ void insertReturn(MachineBasicBlock &MBB) const;
/// \brief Return the number of wait states that result from executing this
/// instruction.
unsigned getNumWaitStates(const MachineInstr &MI) const;
@@ -750,6 +773,14 @@ public:
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const;
+ bool isNonUniformBranchInstr(MachineInstr &Instr) const;
+
+ void convertNonUniformIfRegion(MachineBasicBlock *IfEntry,
+ MachineBasicBlock *IfEnd) const;
+
+ void convertNonUniformLoopRegion(MachineBasicBlock *LoopEntry,
+ MachineBasicBlock *LoopEnd) const;
+
ArrayRef<std::pair<int, const char *>>
getSerializableTargetIndices() const override;
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index 7ccb54f54e34..3b4bdc864253 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -174,6 +174,13 @@ def SI_MASK_BRANCH : VPseudoInstSI <
let isTerminator = 1 in {
+ def SI_NON_UNIFORM_BRCOND_PSEUDO : CFPseudoInstSI <
+ (outs),
+ (ins SReg_64:$vcc, brtarget:$target),
+ [(brcond i1:$vcc, bb:$target)]> {
+ let Size = 12;
+}
+
def SI_IF: CFPseudoInstSI <
(outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target),
[(set i64:$dst, (AMDGPUif i1:$vcc, bb:$target))], 1, 1> {
diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td
index 2281f338ab45..4a11d9471f1d 100644
--- a/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/lib/Target/AMDGPU/VOP2Instructions.td
@@ -164,8 +164,11 @@ multiclass VOP2eInst <string opName,
class VOP_MADAK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm);
field dag Ins32 = (ins VCSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm);
- field string Asm32 = "$vdst, $src0, $src1, $imm";
field bit HasExt = 0;
+
+ // Hack to stop printing _e64
+ let DstRC = RegisterOperand<VGPR_32>;
+ field string Asm32 = " $vdst, $src0, $src1, $imm";
}
def VOP_MADAK_F16 : VOP_MADAK <f16>;
@@ -174,8 +177,11 @@ def VOP_MADAK_F32 : VOP_MADAK <f32>;
class VOP_MADMK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm);
field dag Ins32 = (ins VCSrc_f32:$src0, ImmOpType:$imm, VGPR_32:$src1);
- field string Asm32 = "$vdst, $src0, $imm, $src1";
field bit HasExt = 0;
+
+ // Hack to stop printing _e64
+ let DstRC = RegisterOperand<VGPR_32>;
+ field string Asm32 = " $vdst, $src0, $imm, $src1";
}
def VOP_MADMK_F16 : VOP_MADMK <f16>;
@@ -298,7 +304,7 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32]> {
let SubtargetPredicate = isGCN in {
defm V_CNDMASK_B32 : VOP2eInst <"v_cndmask_b32", VOP2e_I32_I32_I32_I1>;
-def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32>;
+def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32, [], "">;
let isCommutable = 1 in {
defm V_ADD_F32 : VOP2Inst <"v_add_f32", VOP_F32_F32_F32, fadd>;
@@ -328,7 +334,7 @@ let Constraints = "$vdst = $src2", DisableEncoding="$src2",
defm V_MAC_F32 : VOP2Inst <"v_mac_f32", VOP_MAC_F32>;
}
-def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32>;
+def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32, [], "">;
// No patterns so that the scalar instructions are always selected.
// The scalar versions will be replaced with vector when needed later.
@@ -383,7 +389,7 @@ defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32>;
let SubtargetPredicate = isVI in {
-def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16>;
+def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16, [], "">;
defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16>;
defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16>;
defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16>;
@@ -394,7 +400,7 @@ defm V_ADD_F16 : VOP2Inst <"v_add_f16", VOP_F16_F16_F16, fadd>;
defm V_SUB_F16 : VOP2Inst <"v_sub_f16", VOP_F16_F16_F16, fsub>;
defm V_SUBREV_F16 : VOP2Inst <"v_subrev_f16", VOP_F16_F16_F16, null_frag, "v_sub_f16">;
defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16, fmul>;
-def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16>;
+def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16, [], "">;
defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16>;
defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16>;
defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16, null_frag, "v_sub_u16">;
@@ -651,6 +657,17 @@ multiclass VOP2_Real_e64_vi <bits<10> op> {
VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
}
+multiclass VOP2_Real_e64only_vi <bits<10> op> {
+ def _e64_vi :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
+ VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
+ // Hack to stop printing _e64
+ VOP3_Pseudo ps = !cast<VOP3_Pseudo>(NAME#"_e64");
+ let OutOperandList = (outs VGPR_32:$vdst);
+ let AsmString = ps.Mnemonic # " " # ps.AsmOperands;
+ }
+}
+
multiclass Base_VOP2be_Real_e32e64_vi <bits<6> op> : VOP2_Real_e32_vi<op> {
def _e64_vi :
VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
@@ -718,17 +735,17 @@ defm V_SUBBREV_U32 : VOP2be_Real_e32e64_vi <0x1e>;
defm V_READLANE_B32 : VOP32_Real_vi <0x289>;
defm V_WRITELANE_B32 : VOP32_Real_vi <0x28a>;
-defm V_BFM_B32 : VOP2_Real_e64_vi <0x293>;
-defm V_BCNT_U32_B32 : VOP2_Real_e64_vi <0x28b>;
-defm V_MBCNT_LO_U32_B32 : VOP2_Real_e64_vi <0x28c>;
-defm V_MBCNT_HI_U32_B32 : VOP2_Real_e64_vi <0x28d>;
-defm V_LDEXP_F32 : VOP2_Real_e64_vi <0x288>;
-defm V_CVT_PKACCUM_U8_F32 : VOP2_Real_e64_vi <0x1f0>;
-defm V_CVT_PKNORM_I16_F32 : VOP2_Real_e64_vi <0x294>;
-defm V_CVT_PKNORM_U16_F32 : VOP2_Real_e64_vi <0x295>;
-defm V_CVT_PKRTZ_F16_F32 : VOP2_Real_e64_vi <0x296>;
-defm V_CVT_PK_U16_U32 : VOP2_Real_e64_vi <0x297>;
-defm V_CVT_PK_I16_I32 : VOP2_Real_e64_vi <0x298>;
+defm V_BFM_B32 : VOP2_Real_e64only_vi <0x293>;
+defm V_BCNT_U32_B32 : VOP2_Real_e64only_vi <0x28b>;
+defm V_MBCNT_LO_U32_B32 : VOP2_Real_e64only_vi <0x28c>;
+defm V_MBCNT_HI_U32_B32 : VOP2_Real_e64only_vi <0x28d>;
+defm V_LDEXP_F32 : VOP2_Real_e64only_vi <0x288>;
+defm V_CVT_PKACCUM_U8_F32 : VOP2_Real_e64only_vi <0x1f0>;
+defm V_CVT_PKNORM_I16_F32 : VOP2_Real_e64only_vi <0x294>;
+defm V_CVT_PKNORM_U16_F32 : VOP2_Real_e64only_vi <0x295>;
+defm V_CVT_PKRTZ_F16_F32 : VOP2_Real_e64only_vi <0x296>;
+defm V_CVT_PK_U16_U32 : VOP2_Real_e64only_vi <0x297>;
+defm V_CVT_PK_I16_I32 : VOP2_Real_e64only_vi <0x298>;
defm V_ADD_F16 : VOP2_Real_e32e64_vi <0x1f>;
defm V_SUB_F16 : VOP2_Real_e32e64_vi <0x20>;
diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td
index 217a07488853..ffa6c60d6b1f 100644
--- a/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/lib/Target/AMDGPU/VOP3Instructions.td
@@ -232,7 +232,6 @@ def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>>;
let SubtargetPredicate = isCIVI in {
-def V_MQSAD_U16_U8 : VOP3Inst <"v_mqsad_u16_u8", VOP3_Profile<VOP_I32_I32_I32>>;
def V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64>, int_amdgcn_qsad_pk_u16_u8>;
def V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOP3_Profile<VOP_V4I32_I64_I32_V4I32>, int_amdgcn_mqsad_u32_u8>;
@@ -402,7 +401,6 @@ multiclass VOP3be_Real_ci<bits<9> op> {
}
}
-defm V_MQSAD_U16_U8 : VOP3_Real_ci <0x172>;
defm V_QSAD_PK_U16_U8 : VOP3_Real_ci <0x172>;
defm V_MQSAD_U32_U8 : VOP3_Real_ci <0x175>;
defm V_MAD_U64_U32 : VOP3be_Real_ci <0x176>;
@@ -426,7 +424,6 @@ multiclass VOP3be_Real_vi<bits<10> op> {
} // End AssemblerPredicates = [isVI], DecoderNamespace = "VI"
-defm V_MQSAD_U16_U8 : VOP3_Real_vi <0x172>;
defm V_MAD_U64_U32 : VOP3be_Real_vi <0x1E8>;
defm V_MAD_I64_I32 : VOP3be_Real_vi <0x1E9>;
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
index 28c407f74125..dd7fe871345a 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -404,21 +404,11 @@ public:
/// Returns predicate register associated with the given frame instruction.
unsigned getFramePred(const MachineInstr &MI) const {
assert(isFrameInstr(MI));
- if (isFrameSetup(MI))
- // Operands of ADJCALLSTACKDOWN:
- // - argument declared in ADJCALLSTACKDOWN pattern:
- // 0 - frame size
- // 1 - predicate code (like ARMCC::AL)
- // - added by predOps:
- // 2 - predicate reg
- return MI.getOperand(2).getReg();
- assert(MI.getOpcode() == ARM::ADJCALLSTACKUP ||
- MI.getOpcode() == ARM::tADJCALLSTACKUP);
- // Operands of ADJCALLSTACKUP:
- // - argument declared in ADJCALLSTACKUP pattern:
+ // Operands of ADJCALLSTACKDOWN/ADJCALLSTACKUP:
+ // - argument declared in the pattern:
// 0 - frame size
- // 1 - arg of CALLSEQ_END
- // 2 - predicate code
+ // 1 - arg of CALLSEQ_START/CALLSEQ_END
+ // 2 - predicate code (like ARMCC::AL)
// - added by predOps:
// 3 - predicate reg
return MI.getOperand(3).getReg();
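
With the layouts unified, a frame-setup pseudo can be built and queried the same way as a frame-destroy one; a sketch (NumBytes, MBB, I and DL assumed in scope):

  MachineInstr *FrameSetup =
      BuildMI(MBB, I, DL, TII.get(ARM::ADJCALLSTACKDOWN))
          .addImm(NumBytes)             // 0 - frame size
          .addImm(0)                    // 1 - arg of CALLSEQ_START
          .add(predOps(ARMCC::AL));     // 2, 3 - predicate code and register
  unsigned PredReg = TII.getFramePred(*FrameSetup);  // reads operand 3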
diff --git a/lib/Target/ARM/ARMCallLowering.cpp b/lib/Target/ARM/ARMCallLowering.cpp
index 9178c67afa6e..46ac4d0ad933 100644
--- a/lib/Target/ARM/ARMCallLowering.cpp
+++ b/lib/Target/ARM/ARMCallLowering.cpp
@@ -433,7 +433,7 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// We now know the size of the stack - update the ADJCALLSTACKDOWN
// accordingly.
- CallSeqStart.addImm(ArgHandler.StackSize).add(predOps(ARMCC::AL));
+ CallSeqStart.addImm(ArgHandler.StackSize).addImm(0).add(predOps(ARMCC::AL));
MIRBuilder.buildInstr(ARM::ADJCALLSTACKUP)
.addImm(ArgHandler.StackSize)
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index 56cac855620d..4f6a73b5980d 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -1949,7 +1949,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args,
unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(AdjStackDown))
- .addImm(NumBytes));
+ .addImm(NumBytes).addImm(0));
// Process the args.
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index e64582402fe1..f8b584db7b99 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -473,9 +473,9 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
}
// Use divmod compiler-rt calls for iOS 5.0 and later.
- if (Subtarget->isTargetWatchOS() ||
- (Subtarget->isTargetIOS() &&
- !Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
+ if (Subtarget->isTargetMachO() &&
+ !(Subtarget->isTargetIOS() &&
+ Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
}
@@ -1817,8 +1817,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
if (!isSibCall)
- Chain = DAG.getCALLSEQ_START(Chain,
- DAG.getIntPtrConstant(NumBytes, dl, true), dl);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
SDValue StackPtr =
DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
@@ -7365,7 +7364,7 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Pair of floats / doubles used to pass the result.
- Type *RetTy = StructType::get(ArgTy, ArgTy, nullptr);
+ Type *RetTy = StructType::get(ArgTy, ArgTy);
auto &DL = DAG.getDataLayout();
ArgListTy Args;
@@ -13115,7 +13114,7 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
getPointerTy(DAG.getDataLayout()));
- Type *RetTy = (Type*)StructType::get(Ty, Ty, nullptr);
+ Type *RetTy = StructType::get(Ty, Ty);
if (Subtarget->isTargetWindows())
InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain);
@@ -13417,9 +13416,9 @@ Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,
}
// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
-Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
- AtomicOrdering Ord, bool IsStore,
- bool IsLoad) const {
+Instruction *ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
+ Instruction *Inst,
+ AtomicOrdering Ord) const {
switch (Ord) {
case AtomicOrdering::NotAtomic:
case AtomicOrdering::Unordered:
@@ -13428,7 +13427,7 @@ Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
case AtomicOrdering::Acquire:
return nullptr; // Nothing to do
case AtomicOrdering::SequentiallyConsistent:
- if (!IsStore)
+ if (!Inst->hasAtomicStore())
return nullptr; // Nothing to do
/*FALLTHROUGH*/
case AtomicOrdering::Release:
@@ -13442,9 +13441,9 @@ Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
llvm_unreachable("Unknown fence ordering in emitLeadingFence");
}
-Instruction* ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
- AtomicOrdering Ord, bool IsStore,
- bool IsLoad) const {
+Instruction *ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
+ Instruction *Inst,
+ AtomicOrdering Ord) const {
switch (Ord) {
case AtomicOrdering::NotAtomic:
case AtomicOrdering::Unordered:
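
On the caller side the new hook signature lets the target inspect the instruction itself rather than IsStore/IsLoad flags; a simplified sketch of how AtomicExpand-style code would drive it (TLI and SI assumed in scope):

  IRBuilder<> Builder(SI);                   // SI: an atomic store to expand
  // For SequentiallyConsistent, ARM now emits the leading DMB only when
  // SI->hasAtomicStore() holds.
  TLI->emitLeadingFence(Builder, SI, SI->getOrdering());
  // ... lower SI to a monotonic access here ...
  TLI->emitTrailingFence(Builder, SI, SI->getOrdering());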
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 08c51b66dfe7..875c06210ae6 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -483,10 +483,10 @@ class InstrItineraryData;
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const override;
- Instruction* emitLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
- bool IsStore, bool IsLoad) const override;
- Instruction* emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
- bool IsStore, bool IsLoad) const override;
+ Instruction *emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst,
+ AtomicOrdering Ord) const override;
+ Instruction *emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst,
+ AtomicOrdering Ord) const override;
unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index a94d6048f02d..d06b7d0896f1 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -16,7 +16,8 @@
//
// Type profiles.
-def SDT_ARMCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_ARMCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32> ]>;
def SDT_ARMCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>;
def SDT_ARMStructByVal : SDTypeProfile<0, 4,
[SDTCisVT<0, i32>, SDTCisVT<1, i32>,
@@ -1968,8 +1969,8 @@ PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2, pred:$p), NoItinerary,
[(ARMcallseq_end timm:$amt1, timm:$amt2)]>;
def ADJCALLSTACKDOWN :
-PseudoInst<(outs), (ins i32imm:$amt, pred:$p), NoItinerary,
- [(ARMcallseq_start timm:$amt)]>;
+PseudoInst<(outs), (ins i32imm:$amt, i32imm:$amt2, pred:$p), NoItinerary,
+ [(ARMcallseq_start timm:$amt, timm:$amt2)]>;
}
def HINT : AI<(outs), (ins imm0_239:$imm), MiscFrm, NoItinerary,
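
The SelectionDAG side of the same change: CALLSEQ_START now carries two immediates, and the lowering code switches to the convenience overload (sketch; the second immediate is 0 at every call site touched here):

  // Before: Chain = DAG.getCALLSEQ_START(Chain,
  //             DAG.getIntPtrConstant(NumBytes, dl, true), dl);
  Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);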
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index 8048c758e998..bee83dfb6f63 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -284,8 +284,8 @@ def tADJCALLSTACKUP :
Requires<[IsThumb, IsThumb1Only]>;
def tADJCALLSTACKDOWN :
- PseudoInst<(outs), (ins i32imm:$amt), NoItinerary,
- [(ARMcallseq_start imm:$amt)]>,
+ PseudoInst<(outs), (ins i32imm:$amt, i32imm:$amt2), NoItinerary,
+ [(ARMcallseq_start imm:$amt, imm:$amt2)]>,
Requires<[IsThumb, IsThumb1Only]>;
}
diff --git a/lib/Target/ARM/ARMInstructionSelector.cpp b/lib/Target/ARM/ARMInstructionSelector.cpp
index 2ac3fda9f448..8c680cdf9b47 100644
--- a/lib/Target/ARM/ARMInstructionSelector.cpp
+++ b/lib/Target/ARM/ARMInstructionSelector.cpp
@@ -101,14 +101,6 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
assert(RegBank && "Can't get reg bank for virtual register");
const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
- (void)DstSize;
- unsigned SrcReg = I.getOperand(1).getReg();
- const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
- (void)SrcSize;
- // We use copies for trunc, so it's ok for the size of the destination to be
- // smaller (the higher bits will just be undefined).
- assert(DstSize <= SrcSize && "Copy with different width?!");
-
assert((RegBank->getID() == ARM::GPRRegBankID ||
RegBank->getID() == ARM::FPRRegBankID) &&
"Unsupported reg bank");
@@ -135,28 +127,6 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
return true;
}
-static bool selectFAdd(MachineInstrBuilder &MIB, const ARMBaseInstrInfo &TII,
- MachineRegisterInfo &MRI) {
- assert(TII.getSubtarget().hasVFP2() && "Can't select fp add without vfp");
-
- LLT Ty = MRI.getType(MIB->getOperand(0).getReg());
- unsigned ValSize = Ty.getSizeInBits();
-
- if (ValSize == 32) {
- if (TII.getSubtarget().useNEONForSinglePrecisionFP())
- return false;
- MIB->setDesc(TII.get(ARM::VADDS));
- } else {
- assert(ValSize == 64 && "Unsupported size for floating point value");
- if (TII.getSubtarget().isFPOnlySP())
- return false;
- MIB->setDesc(TII.get(ARM::VADDD));
- }
- MIB.add(predOps(ARMCC::AL));
-
- return true;
-}
-
static bool selectSequence(MachineInstrBuilder &MIB,
const ARMBaseInstrInfo &TII,
MachineRegisterInfo &MRI,
@@ -352,6 +322,7 @@ bool ARMInstructionSelector::select(MachineInstr &I) const {
}
break;
}
+ case G_ANYEXT:
case G_TRUNC: {
// The high bits are undefined, so there's nothing special to do, just
// treat it as a copy.
@@ -362,12 +333,12 @@ bool ARMInstructionSelector::select(MachineInstr &I) const {
const auto &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
if (SrcRegBank.getID() != DstRegBank.getID()) {
- DEBUG(dbgs() << "G_TRUNC operands on different register banks\n");
+ DEBUG(dbgs() << "G_TRUNC/G_ANYEXT operands on different register banks\n");
return false;
}
if (SrcRegBank.getID() != ARM::GPRRegBankID) {
- DEBUG(dbgs() << "G_TRUNC on non-GPR not supported yet\n");
+ DEBUG(dbgs() << "G_TRUNC/G_ANYEXT on non-GPR not supported yet\n");
return false;
}
@@ -393,10 +364,6 @@ bool ARMInstructionSelector::select(MachineInstr &I) const {
}
MIB.add(predOps(ARMCC::AL)).add(condCodeOp());
break;
- case G_FADD:
- if (!selectFAdd(MIB, TII, MRI))
- return false;
- break;
case G_FRAME_INDEX:
// Add 0 to the given frame index and hope it will eventually be folded into
// the user(s).
diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp
index 9b86030fdd29..5bf6c7aed6b8 100644
--- a/lib/Target/ARM/ARMLegalizerInfo.cpp
+++ b/lib/Target/ARM/ARMLegalizerInfo.cpp
@@ -45,9 +45,11 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
setAction({Op, 1, p0}, Legal);
}
- for (unsigned Op : {G_ADD, G_SUB, G_MUL})
- for (auto Ty : {s1, s8, s16, s32})
- setAction({Op, Ty}, Legal);
+ for (unsigned Op : {G_ADD, G_SUB, G_MUL}) {
+ for (auto Ty : {s1, s8, s16})
+ setAction({Op, Ty}, WidenScalar);
+ setAction({Op, s32}, Legal);
+ }
for (unsigned Op : {G_SDIV, G_UDIV}) {
for (auto Ty : {s8, s16})
diff --git a/lib/Target/ARM/ARMOptimizeBarriersPass.cpp b/lib/Target/ARM/ARMOptimizeBarriersPass.cpp
index 581d5fe159fd..7e4d598a6e0b 100644
--- a/lib/Target/ARM/ARMOptimizeBarriersPass.cpp
+++ b/lib/Target/ARM/ARMOptimizeBarriersPass.cpp
@@ -88,13 +88,15 @@ bool ARMOptimizeBarriersPass::runOnMachineFunction(MachineFunction &MF) {
}
}
}
+ bool Changed = false;
// Remove the tagged DMB
for (auto MI : ToRemove) {
MI->eraseFromParent();
++NumDMBsRemoved;
+ Changed = true;
}
- return NumDMBsRemoved > 0;
+ return Changed;
}
/// createARMOptimizeBarriersPass - Returns an instance of the remove double
diff --git a/lib/Target/ARM/ARMRegisterBankInfo.cpp b/lib/Target/ARM/ARMRegisterBankInfo.cpp
index 13a32211f88c..a20997c95cd9 100644
--- a/lib/Target/ARM/ARMRegisterBankInfo.cpp
+++ b/lib/Target/ARM/ARMRegisterBankInfo.cpp
@@ -225,6 +225,7 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case G_UDIV:
case G_SEXT:
case G_ZEXT:
+ case G_ANYEXT:
case G_TRUNC:
case G_GEP:
// FIXME: We're abusing the fact that everything lives in a GPR for now; in
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index d09f3ecbaa28..5583d6148b08 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -13,7 +13,9 @@
#include "ARM.h"
#include "ARMCallLowering.h"
#include "ARMLegalizerInfo.h"
+#ifdef LLVM_BUILD_GLOBAL_ISEL
#include "ARMRegisterBankInfo.h"
+#endif
#include "ARMSubtarget.h"
#include "ARMTargetMachine.h"
#include "ARMTargetObjectFile.h"
diff --git a/lib/Target/AVR/AVRFrameLowering.cpp b/lib/Target/AVR/AVRFrameLowering.cpp
index c297865db820..0ec8e8b08ceb 100644
--- a/lib/Target/AVR/AVRFrameLowering.cpp
+++ b/lib/Target/AVR/AVRFrameLowering.cpp
@@ -375,7 +375,7 @@ MachineBasicBlock::iterator AVRFrameLowering::eliminateCallFramePseudoInstr(
DebugLoc DL = MI->getDebugLoc();
unsigned int Opcode = MI->getOpcode();
- int Amount = MI->getOperand(0).getImm();
+ int Amount = TII.getFrameSize(*MI);
// Adjcallstackup does not need to allocate stack space for the call, instead
// we insert push instructions that will allocate the necessary stack.
diff --git a/lib/Target/AVR/AVRISelLowering.cpp b/lib/Target/AVR/AVRISelLowering.cpp
index f0ab6acedad1..ef9c00e4b784 100644
--- a/lib/Target/AVR/AVRISelLowering.cpp
+++ b/lib/Target/AVR/AVRISelLowering.cpp
@@ -361,7 +361,7 @@ SDValue AVRTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
getPointerTy(DAG.getDataLayout()));
- Type *RetTy = (Type *)StructType::get(Ty, Ty, nullptr);
+ Type *RetTy = (Type *)StructType::get(Ty, Ty);
SDLoc dl(Op);
TargetLowering::CallLoweringInfo CLI(DAG);
@@ -1166,8 +1166,7 @@ SDValue AVRTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
- DL);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
@@ -1611,8 +1610,9 @@ AVRTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *trueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *falseMBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineFunction::iterator I = MBB->getParent()->begin();
- ++I;
+ MachineFunction::iterator I;
+ for (I = MF->begin(); I != MF->end() && &(*I) != MBB; ++I);
+ if (I != MF->end()) ++I;
MF->insert(I, trueMBB);
MF->insert(I, falseMBB);
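The AVR custom-inserter change replaces a hard-coded insertion point (always right after the entry block) with a scan that places the new true/false blocks immediately after the block currently being expanded, which keeps the block layout sensible when the pseudo is not in the first block. A small standalone illustration of the iterator scan, with std::list standing in for the machine function's block list:

    #include <cassert>
    #include <list>

    int main() {
      std::list<int> Blocks = {1, 2, 3};        // 2 plays the role of MBB
      auto I = Blocks.begin();
      while (I != Blocks.end() && *I != 2) ++I; // find the current block
      if (I != Blocks.end()) ++I;               // insertion point: just past it
      Blocks.insert(I, 20);                     // trueMBB (insert before I)
      Blocks.insert(I, 21);                     // falseMBB
      assert((Blocks == std::list<int>{1, 2, 20, 21, 3}));
    }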
diff --git a/lib/Target/AVR/AVRInstrInfo.td b/lib/Target/AVR/AVRInstrInfo.td
index 1b6547ef7795..06ad2b3ffdf8 100644
--- a/lib/Target/AVR/AVRInstrInfo.td
+++ b/lib/Target/AVR/AVRInstrInfo.td
@@ -17,7 +17,7 @@ include "AVRInstrFormats.td"
// AVR Type Profiles
//===----------------------------------------------------------------------===//
-def SDT_AVRCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i16>]>;
+def SDT_AVRCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i16>, SDTCisVT<1, i16>]>;
def SDT_AVRCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i16>, SDTCisVT<1, i16>]>;
def SDT_AVRCall : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
def SDT_AVRWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
@@ -333,9 +333,9 @@ let Defs = [SP, SREG],
Uses = [SP] in
{
def ADJCALLSTACKDOWN : Pseudo<(outs),
- (ins i16imm:$amt),
+ (ins i16imm:$amt, i16imm:$amt2),
"#ADJCALLSTACKDOWN",
- [(AVRcallseq_start timm:$amt)]>;
+ [(AVRcallseq_start timm:$amt, timm:$amt2)]>;
// R31R30 is used to update SP, since it is a scratch reg and this instruction
// is placed after the function call then R31R30 should be always free.
diff --git a/lib/Target/AVR/AVRRegisterInfo.cpp b/lib/Target/AVR/AVRRegisterInfo.cpp
index 2813e24d2ac7..11a47bad78ba 100644
--- a/lib/Target/AVR/AVRRegisterInfo.cpp
+++ b/lib/Target/AVR/AVRRegisterInfo.cpp
@@ -52,7 +52,6 @@ AVRRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
BitVector AVRRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
const AVRTargetMachine &TM = static_cast<const AVRTargetMachine&>(MF.getTarget());
- const TargetFrameLowering *TFI = TM.getSubtargetImpl()->getFrameLowering();
// Reserve the intermediate result registers r1 and r2
// The result of instructions like 'mul' is always stored here.
diff --git a/lib/Target/BPF/BPFISelLowering.cpp b/lib/Target/BPF/BPFISelLowering.cpp
index b9b3dff95c0a..6897161c903c 100644
--- a/lib/Target/BPF/BPFISelLowering.cpp
+++ b/lib/Target/BPF/BPFISelLowering.cpp
@@ -257,8 +257,7 @@ SDValue BPFTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
}
auto PtrVT = getPointerTy(MF.getDataLayout());
- Chain = DAG.getCALLSEQ_START(
- Chain, DAG.getConstant(NumBytes, CLI.DL, PtrVT, true), CLI.DL);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, CLI.DL);
SmallVector<std::pair<unsigned, SDValue>, MaxArgs> RegsToPass;
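The same mechanical rewrite recurs for every target in this patch (AVR, BPF, Hexagon, Lanai, MSP430, Mips, NVPTX, PowerPC): CALLSEQ_START now carries two immediates, the byte count pushed for outgoing arguments and the byte count popped by the callee, and a convenience overload builds both target constants itself. A before/after sketch of the call-site shape, shown as comments since it only compiles inside an LLVM target:

    // Before: each call site built the single constant operand by hand.
    //   Chain = DAG.getCALLSEQ_START(
    //       Chain, DAG.getIntPtrConstant(NumBytes, DL, /*isTarget=*/true), DL);
    // After: both byte counts are passed as plain integers; the second
    // (bytes popped by the callee) is 0 at all of these call sites.
    //   Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);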
diff --git a/lib/Target/BPF/BPFInstrInfo.td b/lib/Target/BPF/BPFInstrInfo.td
index 93ee24371c4d..c6c0ff587c6b 100644
--- a/lib/Target/BPF/BPFInstrInfo.td
+++ b/lib/Target/BPF/BPFInstrInfo.td
@@ -16,7 +16,8 @@ include "BPFInstrFormats.td"
// Instruction Operands and Patterns
// These are target-independent nodes, but have target-specific formats.
-def SDT_BPFCallSeqStart : SDCallSeqStart<[SDTCisVT<0, iPTR>]>;
+def SDT_BPFCallSeqStart : SDCallSeqStart<[SDTCisVT<0, iPTR>,
+ SDTCisVT<1, iPTR>]>;
def SDT_BPFCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
def SDT_BPFCall : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
def SDT_BPFSetFlag : SDTypeProfile<0, 3, [SDTCisSameAs<0, 1>]>;
@@ -445,9 +446,9 @@ let isReturn = 1, isTerminator = 1, hasDelaySlot=0, isBarrier = 1,
// ADJCALLSTACKDOWN/UP pseudo insns
let Defs = [R11], Uses = [R11] in {
-def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt),
- "#ADJCALLSTACKDOWN $amt",
- [(BPFcallseq_start timm:$amt)]>;
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2),
+ "#ADJCALLSTACKDOWN $amt1 $amt2",
+ [(BPFcallseq_start timm:$amt1, timm:$amt2)]>;
def ADJCALLSTACKUP : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2),
"#ADJCALLSTACKUP $amt1 $amt2",
[(BPFcallseq_end timm:$amt1, timm:$amt2)]>;
diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp
index 861af94f1e38..1dffebe97f2d 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -848,8 +848,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SDValue Glue;
if (!IsTailCall) {
- SDValue C = DAG.getConstant(NumBytes, dl, PtrVT, true);
- Chain = DAG.getCALLSEQ_START(Chain, C, dl);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
Glue = Chain.getValue(1);
}
diff --git a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
index 5a5799dbe009..e4df7ff5c200 100644
--- a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
+++ b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
@@ -1209,7 +1209,7 @@ bool PolynomialMultiplyRecognize::highBitsAreZero(Value *V,
KnownBits Known(T->getBitWidth());
computeKnownBits(V, Known, DL);
- return Known.Zero.countLeadingOnes() >= IterCount;
+ return Known.countMinLeadingZeros() >= IterCount;
}
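The Hexagon change is behavior-preserving: a bit set in KnownBits::Zero means that bit of the value is known to be zero, so counting the leading ones of the Zero mask counts the high bits known zero, which is exactly what the new countMinLeadingZeros() name says. A standalone sketch of the equivalence (toy 8-bit mask, not the real APInt API):

    #include <cassert>
    #include <cstdint>

    // Count leading set bits of an 8-bit mask, as countLeadingOnes() would.
    static unsigned countLeadingOnes(uint8_t Mask) {
      unsigned N = 0;
      for (int Bit = 7; Bit >= 0 && ((Mask >> Bit) & 1); --Bit)
        ++N;
      return N;
    }

    int main() {
      uint8_t Zero = 0xE0;                 // top three bits known zero
      assert(countLeadingOnes(Zero) == 3); // == countMinLeadingZeros()
    }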
diff --git a/lib/Target/Hexagon/HexagonPatterns.td b/lib/Target/Hexagon/HexagonPatterns.td
index 32503d111c24..81b5e10c1173 100644
--- a/lib/Target/Hexagon/HexagonPatterns.td
+++ b/lib/Target/Hexagon/HexagonPatterns.td
@@ -714,7 +714,8 @@ def: Pat<(i1 0), (PS_false)>;
def: Pat<(i1 1), (PS_true)>;
// Pseudo instructions.
-def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32> ]>;
def SDT_SPCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>,
SDTCisVT<1, i32> ]>;
@@ -732,8 +733,8 @@ def HexagonTCRet : SDNode<"HexagonISD::TC_RETURN", SDT_SPCall,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
-def: Pat<(callseq_start timm:$amt),
- (ADJCALLSTACKDOWN imm:$amt)>;
+def: Pat<(callseq_start timm:$amt, timm:$amt2),
+ (ADJCALLSTACKDOWN imm:$amt, imm:$amt2)>;
def: Pat<(callseq_end timm:$amt1, timm:$amt2),
(ADJCALLSTACKUP imm:$amt1, imm:$amt2)>;
diff --git a/lib/Target/Hexagon/HexagonPseudo.td b/lib/Target/Hexagon/HexagonPseudo.td
index 8c2caea2d5c5..0f99dfe342b8 100644
--- a/lib/Target/Hexagon/HexagonPseudo.td
+++ b/lib/Target/Hexagon/HexagonPseudo.td
@@ -80,7 +80,7 @@ def PS_false : InstHexagon<(outs PredRegs:$dst), (ins), "",
[(set I1:$dst, 0)], "", C2_andn.Itinerary, TypeCR>;
let Defs = [R29, R30], Uses = [R31, R30, R29], isPseudo = 1 in
-def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
".error \"should not emit\" ", []>;
let Defs = [R29, R30, R31], Uses = [R29], isPseudo = 1 in
diff --git a/lib/Target/Lanai/LanaiISelLowering.cpp b/lib/Target/Lanai/LanaiISelLowering.cpp
index d156294a0b0c..0a9cac2565f2 100644
--- a/lib/Target/Lanai/LanaiISelLowering.cpp
+++ b/lib/Target/Lanai/LanaiISelLowering.cpp
@@ -11,9 +11,9 @@
//
//===----------------------------------------------------------------------===//
+#include "LanaiISelLowering.h"
#include "Lanai.h"
#include "LanaiCondCode.h"
-#include "LanaiISelLowering.h"
#include "LanaiMachineFunctionInfo.h"
#include "LanaiSubtarget.h"
#include "LanaiTargetObjectFile.h"
@@ -38,10 +38,11 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetCallingConv.h"
@@ -649,10 +650,7 @@ SDValue LanaiTargetLowering::LowerCCCCallTo(
ByValArgs.push_back(FIPtr);
}
- Chain = DAG.getCALLSEQ_START(
- Chain,
- DAG.getConstant(NumBytes, DL, getPointerTy(DAG.getDataLayout()), true),
- DL);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
SmallVector<SDValue, 12> MemOpChains;
@@ -1502,3 +1500,24 @@ SDValue LanaiTargetLowering::PerformDAGCombine(SDNode *N,
return SDValue();
}
+
+void LanaiTargetLowering::computeKnownBitsForTargetNode(
+ const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
+ const SelectionDAG &DAG, unsigned Depth) const {
+ unsigned BitWidth = Known.getBitWidth();
+ switch (Op.getOpcode()) {
+ default:
+ break;
+ case LanaiISD::SETCC:
+ Known = KnownBits(BitWidth);
+ Known.Zero.setBits(1, BitWidth);
+ break;
+ case LanaiISD::SELECT_CC:
+ KnownBits Known2;
+ DAG.computeKnownBits(Op->getOperand(0), Known, Depth + 1);
+ DAG.computeKnownBits(Op->getOperand(1), Known2, Depth + 1);
+ Known.Zero &= Known2.Zero;
+ Known.One &= Known2.One;
+ break;
+ }
+}
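The new Lanai hook encodes two facts: LanaiISD::SETCC materializes a boolean, so every bit above bit 0 is known zero, and a SELECT_CC result can only claim bits on which both candidate values agree, hence the intersection of the two operands' known-bit masks. A standalone sketch of the SELECT_CC intersection (toy 8-bit masks in place of KnownBits):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint8_t ZeroA = 0xF0, OneA = 0x01; // known bits of the true value
      uint8_t ZeroB = 0xC0, OneB = 0x03; // known bits of the false value
      uint8_t Zero = ZeroA & ZeroB;      // known zero only if zero in both
      uint8_t One  = OneA & OneB;        // known one only if one in both
      assert(Zero == 0xC0 && One == 0x01);
    }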
diff --git a/lib/Target/Lanai/LanaiISelLowering.h b/lib/Target/Lanai/LanaiISelLowering.h
index c2fba4f9d167..49ad52a39771 100644
--- a/lib/Target/Lanai/LanaiISelLowering.h
+++ b/lib/Target/Lanai/LanaiISelLowering.h
@@ -106,6 +106,11 @@ public:
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+ void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG,
+ unsigned Depth = 0) const override;
+
private:
SDValue LowerCCCCallTo(SDValue Chain, SDValue Callee,
CallingConv::ID CallConv, bool IsVarArg,
diff --git a/lib/Target/Lanai/LanaiInstrInfo.td b/lib/Target/Lanai/LanaiInstrInfo.td
index 285fca11737d..776fee101dfe 100644
--- a/lib/Target/Lanai/LanaiInstrInfo.td
+++ b/lib/Target/Lanai/LanaiInstrInfo.td
@@ -22,7 +22,8 @@ include "LanaiInstrFormats.td"
// -------------------------------------------------- //
// These are target-independent nodes, but have target-specific formats.
-def SDT_LanaiCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
+def SDT_LanaiCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>]>;
def SDT_LanaiCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>,
SDTCisVT<1, i32>]>;
def SDT_LanaiCall : SDTypeProfile<0, -1, [SDTCisVT<0, i32>]>;
@@ -750,9 +751,9 @@ let isReturn = 1, isTerminator = 1, hasDelaySlot = 1, isBarrier = 1,
// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
// sub / add which can clobber SP.
let Defs = [SP], Uses = [SP] in {
- def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
- "#ADJCALLSTACKDOWN $amt",
- [(CallSeqStart timm:$amt)]>;
+ def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "#ADJCALLSTACKDOWN $amt1 $amt2",
+ [(CallSeqStart timm:$amt1, timm:$amt2)]>;
def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
"#ADJCALLSTACKUP $amt1 $amt2",
[(CallSeqEnd timm:$amt1, timm:$amt2)]>;
@@ -770,9 +771,6 @@ let Uses = [SR] in {
[(set (i32 GPR:$Rs1), (LanaiSetCC imm:$DDDI))]>;
}
-// SCC's output is already 1-bit so and'ing with 1 is redundant.
-def : Pat<(and (LanaiSetCC imm:$DDDI), 1), (SCC imm:$DDDI)>;
-
// Select with hardware support
let Uses = [SR], isSelect = 1 in {
def SELECT : InstRR<0b111, (outs GPR:$Rd),
diff --git a/lib/Target/MSP430/MSP430FrameLowering.cpp b/lib/Target/MSP430/MSP430FrameLowering.cpp
index f1cb0b6c031b..b4ff8f66c55f 100644
--- a/lib/Target/MSP430/MSP430FrameLowering.cpp
+++ b/lib/Target/MSP430/MSP430FrameLowering.cpp
@@ -236,7 +236,7 @@ MachineBasicBlock::iterator MSP430FrameLowering::eliminateCallFramePseudoInstr(
// adjcallstackdown instruction into 'add SP, <amt>'
// TODO: consider using push / pop instead of sub + store / add
MachineInstr &Old = *I;
- uint64_t Amount = Old.getOperand(0).getImm();
+ uint64_t Amount = TII.getFrameSize(Old);
if (Amount != 0) {
// We need to keep the stack aligned properly. To do this, we round the
// amount of space needed for the outgoing arguments up to the next
@@ -252,8 +252,7 @@ MachineBasicBlock::iterator MSP430FrameLowering::eliminateCallFramePseudoInstr(
} else {
assert(Old.getOpcode() == TII.getCallFrameDestroyOpcode());
// factor out the amount the callee already popped.
- uint64_t CalleeAmt = Old.getOperand(1).getImm();
- Amount -= CalleeAmt;
+ Amount -= TII.getFramePoppedByCallee(Old);
if (Amount)
New = BuildMI(MF, Old.getDebugLoc(), TII.get(MSP430::ADD16ri),
MSP430::SP)
@@ -272,7 +271,7 @@ MachineBasicBlock::iterator MSP430FrameLowering::eliminateCallFramePseudoInstr(
} else if (I->getOpcode() == TII.getCallFrameDestroyOpcode()) {
// If we are performing frame pointer elimination and if the callee pops
// something off the stack pointer, add it back.
- if (uint64_t CalleeAmt = I->getOperand(1).getImm()) {
+ if (uint64_t CalleeAmt = TII.getFramePoppedByCallee(*I)) {
MachineInstr &Old = *I;
MachineInstr *New =
BuildMI(MF, Old.getDebugLoc(), TII.get(MSP430::SUB16ri), MSP430::SP)
diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp
index 40b1dd3cc2eb..cc6e64043f54 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -40,21 +40,24 @@ using namespace llvm;
typedef enum {
NoHWMult,
- HWMultIntr,
- HWMultNoIntr
+ HWMult16,
+ HWMult32,
+ HWMultF5
} HWMultUseMode;
static cl::opt<HWMultUseMode>
-HWMultMode("msp430-hwmult-mode", cl::Hidden,
+HWMultMode("mhwmult", cl::Hidden,
cl::desc("Hardware multiplier use mode"),
- cl::init(HWMultNoIntr),
+ cl::init(NoHWMult),
cl::values(
- clEnumValN(NoHWMult, "no",
+ clEnumValN(NoHWMult, "none",
"Do not use hardware multiplier"),
- clEnumValN(HWMultIntr, "interrupts",
- "Assume hardware multiplier can be used inside interrupts"),
- clEnumValN(HWMultNoIntr, "use",
- "Assume hardware multiplier cannot be used inside interrupts")));
+ clEnumValN(HWMult16, "16bit",
+ "Use 16-bit hardware multiplier"),
+ clEnumValN(HWMult32, "32bit",
+ "Use 32-bit hardware multiplier"),
+ clEnumValN(HWMultF5, "f5series",
+ "Use F5 series hardware multiplier")));
MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM,
const MSP430Subtarget &STI)
@@ -131,29 +134,29 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
// FIXME: Implement efficiently multiplication by a constant
- setOperationAction(ISD::MUL, MVT::i8, Expand);
- setOperationAction(ISD::MULHS, MVT::i8, Expand);
- setOperationAction(ISD::MULHU, MVT::i8, Expand);
- setOperationAction(ISD::SMUL_LOHI, MVT::i8, Expand);
- setOperationAction(ISD::UMUL_LOHI, MVT::i8, Expand);
- setOperationAction(ISD::MUL, MVT::i16, Expand);
+ setOperationAction(ISD::MUL, MVT::i8, Promote);
+ setOperationAction(ISD::MULHS, MVT::i8, Promote);
+ setOperationAction(ISD::MULHU, MVT::i8, Promote);
+ setOperationAction(ISD::SMUL_LOHI, MVT::i8, Promote);
+ setOperationAction(ISD::UMUL_LOHI, MVT::i8, Promote);
+ setOperationAction(ISD::MUL, MVT::i16, LibCall);
setOperationAction(ISD::MULHS, MVT::i16, Expand);
setOperationAction(ISD::MULHU, MVT::i16, Expand);
setOperationAction(ISD::SMUL_LOHI, MVT::i16, Expand);
setOperationAction(ISD::UMUL_LOHI, MVT::i16, Expand);
- setOperationAction(ISD::UDIV, MVT::i8, Expand);
- setOperationAction(ISD::UDIVREM, MVT::i8, Expand);
- setOperationAction(ISD::UREM, MVT::i8, Expand);
- setOperationAction(ISD::SDIV, MVT::i8, Expand);
- setOperationAction(ISD::SDIVREM, MVT::i8, Expand);
- setOperationAction(ISD::SREM, MVT::i8, Expand);
- setOperationAction(ISD::UDIV, MVT::i16, Expand);
+ setOperationAction(ISD::UDIV, MVT::i8, Promote);
+ setOperationAction(ISD::UDIVREM, MVT::i8, Promote);
+ setOperationAction(ISD::UREM, MVT::i8, Promote);
+ setOperationAction(ISD::SDIV, MVT::i8, Promote);
+ setOperationAction(ISD::SDIVREM, MVT::i8, Promote);
+ setOperationAction(ISD::SREM, MVT::i8, Promote);
+ setOperationAction(ISD::UDIV, MVT::i16, LibCall);
setOperationAction(ISD::UDIVREM, MVT::i16, Expand);
- setOperationAction(ISD::UREM, MVT::i16, Expand);
- setOperationAction(ISD::SDIV, MVT::i16, Expand);
+ setOperationAction(ISD::UREM, MVT::i16, LibCall);
+ setOperationAction(ISD::SDIV, MVT::i16, LibCall);
setOperationAction(ISD::SDIVREM, MVT::i16, Expand);
- setOperationAction(ISD::SREM, MVT::i16, Expand);
+ setOperationAction(ISD::SREM, MVT::i16, LibCall);
// varargs support
setOperationAction(ISD::VASTART, MVT::Other, Custom);
@@ -162,15 +165,183 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VACOPY, MVT::Other, Expand);
setOperationAction(ISD::JumpTable, MVT::i16, Custom);
- // Libcalls names.
- if (HWMultMode == HWMultIntr) {
- setLibcallName(RTLIB::MUL_I8, "__mulqi3hw");
- setLibcallName(RTLIB::MUL_I16, "__mulhi3hw");
- } else if (HWMultMode == HWMultNoIntr) {
- setLibcallName(RTLIB::MUL_I8, "__mulqi3hw_noint");
- setLibcallName(RTLIB::MUL_I16, "__mulhi3hw_noint");
+ // EABI Libcalls - EABI Section 6.2
+ const struct {
+ const RTLIB::Libcall Op;
+ const char * const Name;
+ const ISD::CondCode Cond;
+ } LibraryCalls[] = {
+ // Floating point conversions - EABI Table 6
+ { RTLIB::FPROUND_F64_F32, "__mspabi_cvtdf", ISD::SETCC_INVALID },
+ { RTLIB::FPEXT_F32_F64, "__mspabi_cvtfd", ISD::SETCC_INVALID },
+ // The following is NOT implemented in libgcc
+ //{ RTLIB::FPTOSINT_F64_I16, "__mspabi_fixdi", ISD::SETCC_INVALID },
+ { RTLIB::FPTOSINT_F64_I32, "__mspabi_fixdli", ISD::SETCC_INVALID },
+ { RTLIB::FPTOSINT_F64_I64, "__mspabi_fixdlli", ISD::SETCC_INVALID },
+ // The following is NOT implemented in libgcc
+ //{ RTLIB::FPTOUINT_F64_I16, "__mspabi_fixdu", ISD::SETCC_INVALID },
+ { RTLIB::FPTOUINT_F64_I32, "__mspabi_fixdul", ISD::SETCC_INVALID },
+ { RTLIB::FPTOUINT_F64_I64, "__mspabi_fixdull", ISD::SETCC_INVALID },
+ // The following is NOT implemented in libgcc
+ //{ RTLIB::FPTOSINT_F32_I16, "__mspabi_fixfi", ISD::SETCC_INVALID },
+ { RTLIB::FPTOSINT_F32_I32, "__mspabi_fixfli", ISD::SETCC_INVALID },
+ { RTLIB::FPTOSINT_F32_I64, "__mspabi_fixflli", ISD::SETCC_INVALID },
+ // The following is NOT implemented in libgcc
+ //{ RTLIB::FPTOUINT_F32_I16, "__mspabi_fixfu", ISD::SETCC_INVALID },
+ { RTLIB::FPTOUINT_F32_I32, "__mspabi_fixful", ISD::SETCC_INVALID },
+ { RTLIB::FPTOUINT_F32_I64, "__mspabi_fixfull", ISD::SETCC_INVALID },
+ // TODO The following IS implemented in libgcc
+ //{ RTLIB::SINTTOFP_I16_F64, "__mspabi_fltid", ISD::SETCC_INVALID },
+ { RTLIB::SINTTOFP_I32_F64, "__mspabi_fltlid", ISD::SETCC_INVALID },
+ // TODO The following IS implemented in libgcc but is not in the EABI
+ { RTLIB::SINTTOFP_I64_F64, "__mspabi_fltllid", ISD::SETCC_INVALID },
+ // TODO The following IS implemented in libgcc
+ //{ RTLIB::UINTTOFP_I16_F64, "__mspabi_fltud", ISD::SETCC_INVALID },
+ { RTLIB::UINTTOFP_I32_F64, "__mspabi_fltuld", ISD::SETCC_INVALID },
+ // The following IS implemented in libgcc but is not in the EABI
+ { RTLIB::UINTTOFP_I64_F64, "__mspabi_fltulld", ISD::SETCC_INVALID },
+ // TODO The following IS implemented in libgcc
+ //{ RTLIB::SINTTOFP_I16_F32, "__mspabi_fltif", ISD::SETCC_INVALID },
+ { RTLIB::SINTTOFP_I32_F32, "__mspabi_fltlif", ISD::SETCC_INVALID },
+ // TODO The following IS implemented in libgcc but is not in the EABI
+ { RTLIB::SINTTOFP_I64_F32, "__mspabi_fltllif", ISD::SETCC_INVALID },
+ // TODO The following IS implemented in libgcc
+ //{ RTLIB::UINTTOFP_I16_F32, "__mspabi_fltuf", ISD::SETCC_INVALID },
+ { RTLIB::UINTTOFP_I32_F32, "__mspabi_fltulf", ISD::SETCC_INVALID },
+ // The following IS implemented in libgcc but is not in the EABI
+ { RTLIB::UINTTOFP_I64_F32, "__mspabi_fltullf", ISD::SETCC_INVALID },
+
+ // Floating point comparisons - EABI Table 7
+ { RTLIB::OEQ_F64, "__mspabi_cmpd", ISD::SETEQ },
+ { RTLIB::UNE_F64, "__mspabi_cmpd", ISD::SETNE },
+ { RTLIB::OGE_F64, "__mspabi_cmpd", ISD::SETGE },
+ { RTLIB::OLT_F64, "__mspabi_cmpd", ISD::SETLT },
+ { RTLIB::OLE_F64, "__mspabi_cmpd", ISD::SETLE },
+ { RTLIB::OGT_F64, "__mspabi_cmpd", ISD::SETGT },
+ { RTLIB::OEQ_F32, "__mspabi_cmpf", ISD::SETEQ },
+ { RTLIB::UNE_F32, "__mspabi_cmpf", ISD::SETNE },
+ { RTLIB::OGE_F32, "__mspabi_cmpf", ISD::SETGE },
+ { RTLIB::OLT_F32, "__mspabi_cmpf", ISD::SETLT },
+ { RTLIB::OLE_F32, "__mspabi_cmpf", ISD::SETLE },
+ { RTLIB::OGT_F32, "__mspabi_cmpf", ISD::SETGT },
+
+ // Floating point arithmetic - EABI Table 8
+ { RTLIB::ADD_F64, "__mspabi_addd", ISD::SETCC_INVALID },
+ { RTLIB::ADD_F32, "__mspabi_addf", ISD::SETCC_INVALID },
+ { RTLIB::DIV_F64, "__mspabi_divd", ISD::SETCC_INVALID },
+ { RTLIB::DIV_F32, "__mspabi_divf", ISD::SETCC_INVALID },
+ { RTLIB::MUL_F64, "__mspabi_mpyd", ISD::SETCC_INVALID },
+ { RTLIB::MUL_F32, "__mspabi_mpyf", ISD::SETCC_INVALID },
+ { RTLIB::SUB_F64, "__mspabi_subd", ISD::SETCC_INVALID },
+ { RTLIB::SUB_F32, "__mspabi_subf", ISD::SETCC_INVALID },
+ // The following are NOT implemented in libgcc
+ // { RTLIB::NEG_F64, "__mspabi_negd", ISD::SETCC_INVALID },
+ // { RTLIB::NEG_F32, "__mspabi_negf", ISD::SETCC_INVALID },
+
+ // TODO: SLL/SRA/SRL are in libgcc, RLL isn't
+
+ // Universal Integer Operations - EABI Table 9
+ { RTLIB::SDIV_I16, "__mspabi_divi", ISD::SETCC_INVALID },
+ { RTLIB::SDIV_I32, "__mspabi_divli", ISD::SETCC_INVALID },
+ { RTLIB::SDIV_I64, "__mspabi_divlli", ISD::SETCC_INVALID },
+ { RTLIB::UDIV_I16, "__mspabi_divu", ISD::SETCC_INVALID },
+ { RTLIB::UDIV_I32, "__mspabi_divul", ISD::SETCC_INVALID },
+ { RTLIB::UDIV_I64, "__mspabi_divull", ISD::SETCC_INVALID },
+ { RTLIB::SREM_I16, "__mspabi_remi", ISD::SETCC_INVALID },
+ { RTLIB::SREM_I32, "__mspabi_remli", ISD::SETCC_INVALID },
+ { RTLIB::SREM_I64, "__mspabi_remlli", ISD::SETCC_INVALID },
+ { RTLIB::UREM_I16, "__mspabi_remu", ISD::SETCC_INVALID },
+ { RTLIB::UREM_I32, "__mspabi_remul", ISD::SETCC_INVALID },
+ { RTLIB::UREM_I64, "__mspabi_remull", ISD::SETCC_INVALID },
+
+ };
+
+ for (const auto &LC : LibraryCalls) {
+ setLibcallName(LC.Op, LC.Name);
+ if (LC.Cond != ISD::SETCC_INVALID)
+ setCmpLibcallCC(LC.Op, LC.Cond);
+ }
+
+ if (HWMultMode == HWMult16) {
+ const struct {
+ const RTLIB::Libcall Op;
+ const char * const Name;
+ } LibraryCalls[] = {
+ // Integer Multiply - EABI Table 9
+ { RTLIB::MUL_I16, "__mspabi_mpyi_hw" },
+ { RTLIB::MUL_I32, "__mspabi_mpyl_hw" },
+ { RTLIB::MUL_I64, "__mspabi_mpyll_hw" },
+ // TODO The __mspabi_mpysl*_hw functions ARE implemented in libgcc
+ // TODO The __mspabi_mpyul*_hw functions ARE implemented in libgcc
+ };
+ for (const auto &LC : LibraryCalls) {
+ setLibcallName(LC.Op, LC.Name);
+ }
+ } else if (HWMultMode == HWMult32) {
+ const struct {
+ const RTLIB::Libcall Op;
+ const char * const Name;
+ } LibraryCalls[] = {
+ // Integer Multiply - EABI Table 9
+ { RTLIB::MUL_I16, "__mspabi_mpyi_hw" },
+ { RTLIB::MUL_I32, "__mspabi_mpyl_hw32" },
+ { RTLIB::MUL_I64, "__mspabi_mpyll_hw32" },
+ // TODO The __mspabi_mpysl*_hw32 functions ARE implemented in libgcc
+ // TODO The __mspabi_mpyul*_hw32 functions ARE implemented in libgcc
+ };
+ for (const auto &LC : LibraryCalls) {
+ setLibcallName(LC.Op, LC.Name);
+ }
+ } else if (HWMultMode == HWMultF5) {
+ const struct {
+ const RTLIB::Libcall Op;
+ const char * const Name;
+ } LibraryCalls[] = {
+ // Integer Multiply - EABI Table 9
+ { RTLIB::MUL_I16, "__mspabi_mpyi_f5hw" },
+ { RTLIB::MUL_I32, "__mspabi_mpyl_f5hw" },
+ { RTLIB::MUL_I64, "__mspabi_mpyll_f5hw" },
+ // TODO The __mspabi_mpysl*_f5hw functions ARE implemented in libgcc
+ // TODO The __mspabi_mpyul*_f5hw functions ARE implemented in libgcc
+ };
+ for (const auto &LC : LibraryCalls) {
+ setLibcallName(LC.Op, LC.Name);
+ }
+ } else { // NoHWMult
+ const struct {
+ const RTLIB::Libcall Op;
+ const char * const Name;
+ } LibraryCalls[] = {
+ // Integer Multiply - EABI Table 9
+ { RTLIB::MUL_I16, "__mspabi_mpyi" },
+ { RTLIB::MUL_I32, "__mspabi_mpyl" },
+ { RTLIB::MUL_I64, "__mspabi_mpyll" },
+ // The __mspabi_mpysl* functions are NOT implemented in libgcc
+ // The __mspabi_mpyul* functions are NOT implemented in libgcc
+ };
+ for (const auto &LC : LibraryCalls) {
+ setLibcallName(LC.Op, LC.Name);
+ }
+ setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::MSP430_BUILTIN);
}
+ // Several of the runtime library functions use a special calling conv
+ setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::ADD_F64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::SUB_F64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::MUL_F64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::DIV_F64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::OEQ_F64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::UNE_F64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::OGE_F64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::OLT_F64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::OLE_F64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::OGT_F64, CallingConv::MSP430_BUILTIN);
+ // TODO: __mspabi_srall, __mspabi_srlll, __mspabi_sllll
+
setMinFunctionAlignment(1);
setPrefFunctionAlignment(2);
}
@@ -281,10 +452,27 @@ template<typename ArgT>
static void AnalyzeArguments(CCState &State,
SmallVectorImpl<CCValAssign> &ArgLocs,
const SmallVectorImpl<ArgT> &Args) {
- static const MCPhysReg RegList[] = {
+ static const MCPhysReg CRegList[] = {
MSP430::R12, MSP430::R13, MSP430::R14, MSP430::R15
};
- static const unsigned NbRegs = array_lengthof(RegList);
+ static const unsigned CNbRegs = array_lengthof(CRegList);
+ static const MCPhysReg BuiltinRegList[] = {
+ MSP430::R8, MSP430::R9, MSP430::R10, MSP430::R11,
+ MSP430::R12, MSP430::R13, MSP430::R14, MSP430::R15
+ };
+ static const unsigned BuiltinNbRegs = array_lengthof(BuiltinRegList);
+
+ ArrayRef<MCPhysReg> RegList;
+ unsigned NbRegs;
+
+ bool Builtin = (State.getCallingConv() == CallingConv::MSP430_BUILTIN);
+ if (Builtin) {
+ RegList = BuiltinRegList;
+ NbRegs = BuiltinNbRegs;
+ } else {
+ RegList = CRegList;
+ NbRegs = CNbRegs;
+ }
if (State.isVarArg()) {
AnalyzeVarArgs(State, Args);
@@ -294,6 +482,11 @@ static void AnalyzeArguments(CCState &State,
SmallVector<unsigned, 4> ArgsParts;
ParseFunctionArgs(Args, ArgsParts);
+ if (Builtin) {
+ assert(ArgsParts.size() == 2 &&
+ "Builtin calling convention requires two arguments");
+ }
+
unsigned RegsLeft = NbRegs;
bool UsedStack = false;
unsigned ValNo = 0;
@@ -323,6 +516,11 @@ static void AnalyzeArguments(CCState &State,
unsigned Parts = ArgsParts[i];
+ if (Builtin) {
+ assert(Parts == 4 &&
+ "Builtin calling convention requires 64-bit arguments");
+ }
+
if (!UsedStack && Parts == 2 && RegsLeft == 1) {
// Special case for 32-bit register split, see EABI section 3.3.3
unsigned Reg = State.AllocateReg(RegList);
@@ -400,6 +598,7 @@ MSP430TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
switch (CallConv) {
default:
llvm_unreachable("Unsupported calling convention");
+ case CallingConv::MSP430_BUILTIN:
case CallingConv::Fast:
case CallingConv::C:
return LowerCCCCallTo(Chain, Callee, CallConv, isVarArg, isTailCall,
@@ -598,7 +797,6 @@ MSP430TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
/// LowerCCCCallTo - functions arguments are copied from virtual regs to
/// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted.
-// TODO: sret.
SDValue MSP430TargetLowering::LowerCCCCallTo(
SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
bool isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs,
@@ -615,8 +813,7 @@ SDValue MSP430TargetLowering::LowerCCCCallTo(
unsigned NumBytes = CCInfo.getNextStackOffset();
auto PtrVT = getPointerTy(DAG.getDataLayout());
- Chain = DAG.getCALLSEQ_START(Chain,
- DAG.getConstant(NumBytes, dl, PtrVT, true), dl);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
SmallVector<SDValue, 12> MemOpChains;
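The MSP430 changes implement the EABI's special convention for the large runtime helpers: calls marked CallingConv::MSP430_BUILTIN pass a pair of 64-bit operands in R8..R15 (four 16-bit parts each) instead of the C convention's R12..R15, which is why later hunks also add R11 to the call clobbers and drop it from the callee-saved lists. A standalone sketch of the register-list selection (register names are symbolic, not the LLVM enums):

    #include <cassert>
    #include <cstddef>

    enum Reg { R8, R9, R10, R11, R12, R13, R14, R15 };

    int main() {
      static const Reg CRegList[] = {R12, R13, R14, R15};
      static const Reg BuiltinRegList[] = {R8,  R9,  R10, R11,
                                           R12, R13, R14, R15};
      bool Builtin = true; // CallingConv::MSP430_BUILTIN
      const Reg *RegList = Builtin ? BuiltinRegList : CRegList;
      size_t NbRegs = Builtin ? 8u : 4u;
      // Two i64 arguments, four 16-bit parts each, fill all eight registers.
      assert(RegList[0] == R8 && NbRegs == 2 * 4);
    }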
diff --git a/lib/Target/MSP430/MSP430InstrInfo.h b/lib/Target/MSP430/MSP430InstrInfo.h
index e3259bd6a7bc..d81f17e753c5 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.h
+++ b/lib/Target/MSP430/MSP430InstrInfo.h
@@ -85,6 +85,12 @@ public:
MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
const DebugLoc &DL,
int *BytesAdded = nullptr) const override;
+
+ int64_t getFramePoppedByCallee(const MachineInstr &I) const {
+ assert(isFrameInstr(I) && "Not a frame instruction");
+ assert(I.getOperand(1).getImm() >= 0 && "Size must not be negative");
+ return I.getOperand(1).getImm();
+ }
};
}
diff --git a/lib/Target/MSP430/MSP430InstrInfo.td b/lib/Target/MSP430/MSP430InstrInfo.td
index 22fc2474fae6..1cd18611e52c 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.td
+++ b/lib/Target/MSP430/MSP430InstrInfo.td
@@ -23,7 +23,8 @@ class SDTCisI16<int OpNum> : SDTCisVT<OpNum, i16>;
// Type Profiles.
//===----------------------------------------------------------------------===//
def SDT_MSP430Call : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
-def SDT_MSP430CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i16>]>;
+def SDT_MSP430CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i16>,
+ SDTCisVT<1, i16>]>;
def SDT_MSP430CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i16>, SDTCisVT<1, i16>]>;
def SDT_MSP430Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
SDTCisPtrTy<0>]>;
@@ -113,9 +114,9 @@ def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{
// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
// sub / add which can clobber SR.
let Defs = [SP, SR], Uses = [SP] in {
-def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i16imm:$amt),
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i16imm:$amt1, i16imm:$amt2),
"#ADJCALLSTACKDOWN",
- [(MSP430callseq_start timm:$amt)]>;
+ [(MSP430callseq_start timm:$amt1, timm:$amt2)]>;
def ADJCALLSTACKUP : Pseudo<(outs), (ins i16imm:$amt1, i16imm:$amt2),
"#ADJCALLSTACKUP",
[(MSP430callseq_end timm:$amt1, timm:$amt2)]>;
@@ -209,7 +210,7 @@ let isCall = 1 in
// a use to prevent stack-pointer assignments that appear immediately
// before calls from potentially appearing dead. Uses for argument
// registers are added manually.
- let Defs = [R12, R13, R14, R15, SR],
+ let Defs = [R11, R12, R13, R14, R15, SR],
Uses = [SP] in {
def CALLi : II16i<0x0,
(outs), (ins i16imm:$dst),
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp
index 81cd9d1ad3f8..9600bc28f100 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.cpp
+++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp
@@ -41,12 +41,12 @@ MSP430RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
const Function* F = MF->getFunction();
static const MCPhysReg CalleeSavedRegs[] = {
MSP430::FP, MSP430::R5, MSP430::R6, MSP430::R7,
- MSP430::R8, MSP430::R9, MSP430::R10, MSP430::R11,
+ MSP430::R8, MSP430::R9, MSP430::R10,
0
};
static const MCPhysReg CalleeSavedRegsFP[] = {
MSP430::R5, MSP430::R6, MSP430::R7,
- MSP430::R8, MSP430::R9, MSP430::R10, MSP430::R11,
+ MSP430::R8, MSP430::R9, MSP430::R10,
0
};
static const MCPhysReg CalleeSavedRegsIntr[] = {
diff --git a/lib/Target/Mips/MipsFastISel.cpp b/lib/Target/Mips/MipsFastISel.cpp
index 21c99da0922d..b83f44a74d5b 100644
--- a/lib/Target/Mips/MipsFastISel.cpp
+++ b/lib/Target/Mips/MipsFastISel.cpp
@@ -1133,7 +1133,7 @@ bool MipsFastISel::processCallArgs(CallLoweringInfo &CLI,
if (NumBytes < 16)
NumBytes = 16;
- emitInst(Mips::ADJCALLSTACKDOWN).addImm(16);
+ emitInst(Mips::ADJCALLSTACKDOWN).addImm(16).addImm(0);
// Process the args.
MVT firstMVT;
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index 8f39ebd42a5c..78bae6954c3c 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -2787,7 +2787,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SDValue NextStackOffsetVal = DAG.getIntPtrConstant(NextStackOffset, DL, true);
if (!IsTailCall)
- Chain = DAG.getCALLSEQ_START(Chain, NextStackOffsetVal, DL);
+ Chain = DAG.getCALLSEQ_START(Chain, NextStackOffset, 0, DL);
SDValue StackPtr =
DAG.getCopyFromReg(Chain, DL, ABI.IsN64() ? Mips::SP_64 : Mips::SP,
diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td
index b90077d7807d..8761946b8dbb 100644
--- a/lib/Target/Mips/MipsInstrInfo.td
+++ b/lib/Target/Mips/MipsInstrInfo.td
@@ -21,7 +21,7 @@ def SDT_MipsCMov : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>,
SDTCisSameAs<1, 2>,
SDTCisSameAs<3, 4>,
SDTCisInt<4>]>;
-def SDT_MipsCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
+def SDT_MipsCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
def SDT_MipsCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
def SDT_MFLOHI : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisVT<1, untyped>]>;
def SDT_MTLOHI : SDTypeProfile<1, 2, [SDTCisVT<0, untyped>,
@@ -1719,8 +1719,8 @@ let isReturn=1, isTerminator=1, isBarrier=1, hasCtrlDep=1, isCTI=1 in {
}
let Defs = [SP], Uses = [SP], hasSideEffects = 1 in {
-def ADJCALLSTACKDOWN : MipsPseudo<(outs), (ins i32imm:$amt),
- [(callseq_start timm:$amt)]>;
+def ADJCALLSTACKDOWN : MipsPseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ [(callseq_start timm:$amt1, timm:$amt2)]>;
def ADJCALLSTACKUP : MipsPseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
[(callseq_end timm:$amt1, timm:$amt2)]>;
}
diff --git a/lib/Target/Mips/MipsOptimizePICCall.cpp b/lib/Target/Mips/MipsOptimizePICCall.cpp
index 68dcbdfb4211..f8d9c34556bc 100644
--- a/lib/Target/Mips/MipsOptimizePICCall.cpp
+++ b/lib/Target/Mips/MipsOptimizePICCall.cpp
@@ -257,7 +257,7 @@ bool OptimizePICCall::isCallViaRegister(MachineInstr &MI, unsigned &Reg,
// Get the instruction that loads the function address from the GOT.
Reg = MO->getReg();
- Val = (Value*)nullptr;
+ Val = nullptr;
MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
MachineInstr *DefMI = MRI.getVRegDef(Reg);
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 61fdda8aa109..ebaaf42bc64e 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -1430,8 +1430,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
return Chain;
SDValue tempChain = Chain;
- Chain = DAG.getCALLSEQ_START(
- Chain, DAG.getIntPtrConstant(uniqueCallSite, dl, true), dl);
+ Chain = DAG.getCALLSEQ_START(Chain, uniqueCallSite, 0, dl);
SDValue InFlag = Chain.getValue(1);
unsigned paramCount = 0;
@@ -1549,7 +1548,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Chain = DAG.getMemIntrinsicNode(
Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
- TheStoreType, MachinePointerInfo(), EltAlign);
+ TheStoreType, MachinePointerInfo(), EltAlign,
+ /* Volatile */ false, /* ReadMem */ false,
+ /* WriteMem */ true, /* Size */ 0);
InFlag = Chain.getValue(1);
// Cleanup.
@@ -1609,7 +1610,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
theVal, InFlag };
Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs,
CopyParamOps, elemtype,
- MachinePointerInfo());
+ MachinePointerInfo(), /* Align */ 0,
+ /* Volatile */ false, /* ReadMem */ false,
+ /* WriteMem */ true, /* Size */ 0);
InFlag = Chain.getValue(1);
}
@@ -1795,7 +1798,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InFlag};
SDValue RetVal = DAG.getMemIntrinsicNode(
Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
- MachinePointerInfo(), EltAlign);
+ MachinePointerInfo(), EltAlign, /* Volatile */ false,
+ /* ReadMem */ true, /* WriteMem */ false, /* Size */ 0);
for (unsigned j = 0; j < NumElts; ++j) {
SDValue Ret = RetVal.getValue(j);
@@ -2579,7 +2583,9 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
Chain = DAG.getMemIntrinsicNode(Op, dl, DAG.getVTList(MVT::Other),
StoreOperands, TheStoreType,
- MachinePointerInfo(), 1);
+ MachinePointerInfo(), /* Align */ 1,
+ /* Volatile */ false, /* ReadMem */ false,
+ /* WriteMem */ true, /* Size */ 0);
// Cleanup vector state.
StoreOperands.clear();
}
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td
index 9378b29a9d0e..b5b5ea1ed639 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -3101,7 +3101,8 @@ def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target),
(CBranchOther Int1Regs:$a, bb:$target)>;
// Call
-def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
+def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>]>;
def SDT_NVPTXCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart,
@@ -3126,10 +3127,10 @@ class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
: NVPTXInst<outs, ins, asmstr, pattern>;
def Callseq_Start :
- NVPTXInst<(outs), (ins i32imm:$amt),
- "\\{ // callseq $amt\n"
+ NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "\\{ // callseq $amt1, $amt2\n"
"\t.reg .b32 temp_param_reg;",
- [(callseq_start timm:$amt)]>;
+ [(callseq_start timm:$amt1, timm:$amt2)]>;
def Callseq_End :
NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
"\\} // callseq $amt1",
diff --git a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
index 12ffbfdeacc1..11d22377611b 100644
--- a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
+++ b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
@@ -204,6 +204,17 @@ static const unsigned G8Regs[] = {
PPC::X28, PPC::X29, PPC::X30, PPC::X31
};
+static const unsigned G80Regs[] = {
+ PPC::ZERO8, PPC::X1, PPC::X2, PPC::X3,
+ PPC::X4, PPC::X5, PPC::X6, PPC::X7,
+ PPC::X8, PPC::X9, PPC::X10, PPC::X11,
+ PPC::X12, PPC::X13, PPC::X14, PPC::X15,
+ PPC::X16, PPC::X17, PPC::X18, PPC::X19,
+ PPC::X20, PPC::X21, PPC::X22, PPC::X23,
+ PPC::X24, PPC::X25, PPC::X26, PPC::X27,
+ PPC::X28, PPC::X29, PPC::X30, PPC::X31
+};
+
static const unsigned QFRegs[] = {
PPC::QF0, PPC::QF1, PPC::QF2, PPC::QF3,
PPC::QF4, PPC::QF5, PPC::QF6, PPC::QF7,
@@ -301,6 +312,12 @@ static DecodeStatus DecodeG8RCRegisterClass(MCInst &Inst, uint64_t RegNo,
return decodeRegisterClass(Inst, RegNo, G8Regs);
}
+static DecodeStatus DecodeG8RC_NOX0RegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, G80Regs);
+}
+
#define DecodePointerLikeRegClass0 DecodeGPRCRegisterClass
#define DecodePointerLikeRegClass1 DecodeGPRC_NOR0RegisterClass
diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
index 609d959c6d08..84bb9ec56800 100644
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
+++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
@@ -95,7 +95,8 @@ void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
return;
}
- if (MI->getOpcode() == PPC::RLDICR) {
+ if (MI->getOpcode() == PPC::RLDICR ||
+ MI->getOpcode() == PPC::RLDICR_32) {
unsigned char SH = MI->getOperand(2).getImm();
unsigned char ME = MI->getOperand(3).getImm();
// rldicr RA, RS, SH, 63-SH == sldi RA, RS, SH
diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp
index 9b91b9ab8f82..2fc8654deeab 100644
--- a/lib/Target/PowerPC/PPCFastISel.cpp
+++ b/lib/Target/PowerPC/PPCFastISel.cpp
@@ -1330,7 +1330,7 @@ bool PPCFastISel::processCallArgs(SmallVectorImpl<Value*> &Args,
// Issue CALLSEQ_START.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TII.getCallFrameSetupOpcode()))
- .addImm(NumBytes);
+ .addImm(NumBytes).addImm(0);
// Prepare to assign register arguments. Every argument uses up a
// GPR protocol register even if it's passed in a floating-point
@@ -2246,6 +2246,7 @@ bool PPCFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
}
case PPC::EXTSW:
+ case PPC::EXTSW_32:
case PPC::EXTSW_32_64: {
if (VT != MVT::i32 && VT != MVT::i16 && VT != MVT::i8)
return false;
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 1b0402bf003d..5fa7b2c6bfb1 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -54,6 +54,7 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/ADT/Statistic.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -68,6 +69,14 @@ using namespace llvm;
#define DEBUG_TYPE "ppc-codegen"
+STATISTIC(NumSextSetcc,
+ "Number of (sext(setcc)) nodes expanded into GPR sequence.");
+STATISTIC(NumZextSetcc,
+ "Number of (zext(setcc)) nodes expanded into GPR sequence.");
+STATISTIC(SignExtensionsAdded,
+ "Number of sign extensions for compare inputs added.");
+STATISTIC(ZeroExtensionsAdded,
+ "Number of zero extensions for compare inputs added.");
// FIXME: Remove this once the bug has been fixed!
cl::opt<bool> ANDIGlueBug("expose-ppc-andi-glue-bug",
cl::desc("expose the ANDI glue bug on PPC"), cl::Hidden);
@@ -252,7 +261,28 @@ namespace {
#include "PPCGenDAGISel.inc"
private:
+ // Conversion type for interpreting results of a 32-bit instruction as
+ // a 64-bit value or vice versa.
+ enum ExtOrTruncConversion { Ext, Trunc };
+
+ // Modifiers to guide how an ISD::SETCC node's result is to be computed
+ // in a GPR.
+ // ZExtOrig - use the original condition code, zero-extend value
+ // ZExtInvert - invert the condition code, zero-extend value
+ // SExtOrig - use the original condition code, sign-extend value
+ // SExtInvert - invert the condition code, sign-extend value
+ enum SetccInGPROpts { ZExtOrig, ZExtInvert, SExtOrig, SExtInvert };
+
bool trySETCC(SDNode *N);
+ bool tryEXTEND(SDNode *N);
+ SDValue signExtendInputIfNeeded(SDValue Input);
+ SDValue zeroExtendInputIfNeeded(SDValue Input);
+ SDValue addExtOrTrunc(SDValue NatWidthRes, ExtOrTruncConversion Conv);
+ SDValue get32BitZExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+ int64_t RHSValue, SDLoc dl);
+ SDValue get32BitSExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+ int64_t RHSValue, SDLoc dl);
+ SDValue getSETCCInGPR(SDValue Compare, SetccInGPROpts ConvOpts);
void PeepholePPC64();
void PeepholePPC64ZExt();
@@ -2471,6 +2501,225 @@ bool PPCDAGToDAGISel::trySETCC(SDNode *N) {
return true;
}
+/// If this node is a sign/zero extension of an integer comparison,
+/// it can usually be computed in GPRs rather than using comparison
+/// instructions and ISEL. We only do this on 64-bit targets for now
+/// as the code is specialized for 64-bit (it uses 64-bit instructions
+/// and assumes 64-bit registers).
+bool PPCDAGToDAGISel::tryEXTEND(SDNode *N) {
+ if (TM.getOptLevel() == CodeGenOpt::None || !TM.isPPC64())
+ return false;
+ assert((N->getOpcode() == ISD::ZERO_EXTEND ||
+ N->getOpcode() == ISD::SIGN_EXTEND) &&
+ "Expecting a zero/sign extend node!");
+
+ if (N->getOperand(0).getOpcode() != ISD::SETCC)
+ return false;
+
+ SDValue WideRes =
+ getSETCCInGPR(N->getOperand(0),
+ N->getOpcode() == ISD::SIGN_EXTEND ?
+ SetccInGPROpts::SExtOrig : SetccInGPROpts::ZExtOrig);
+
+ if (!WideRes)
+ return false;
+
+ SDLoc dl(N);
+ bool Inputs32Bit = N->getOperand(0).getOperand(0).getValueType() == MVT::i32;
+ bool Output32Bit = N->getValueType(0) == MVT::i32;
+
+ NumSextSetcc += N->getOpcode() == ISD::SIGN_EXTEND ? 1 : 0;
+ NumZextSetcc += N->getOpcode() == ISD::SIGN_EXTEND ? 0 : 1;
+
+ SDValue ConvOp = WideRes;
+ if (Inputs32Bit != Output32Bit)
+ ConvOp = addExtOrTrunc(WideRes, Inputs32Bit ? ExtOrTruncConversion::Ext :
+ ExtOrTruncConversion::Trunc);
+ ReplaceNode(N, ConvOp.getNode());
+
+ return true;
+}
+
+/// If the value isn't guaranteed to be sign-extended to 64-bits, extend it.
+/// Useful when emitting comparison code for 32-bit values without using
+/// the compare instruction (which only considers the lower 32-bits).
+SDValue PPCDAGToDAGISel::signExtendInputIfNeeded(SDValue Input) {
+ assert(Input.getValueType() == MVT::i32 &&
+ "Can only sign-extend 32-bit values here.");
+ unsigned Opc = Input.getOpcode();
+
+ // The value was sign extended and then truncated to 32-bits. No need to
+ // sign extend it again.
+ if (Opc == ISD::TRUNCATE &&
+ (Input.getOperand(0).getOpcode() == ISD::AssertSext ||
+ Input.getOperand(0).getOpcode() == ISD::SIGN_EXTEND))
+ return Input;
+
+ LoadSDNode *InputLoad = dyn_cast<LoadSDNode>(Input);
+ // The input is a sign-extending load. No reason to sign-extend.
+ if (InputLoad && InputLoad->getExtensionType() == ISD::SEXTLOAD)
+ return Input;
+
+ ConstantSDNode *InputConst = dyn_cast<ConstantSDNode>(Input);
+ // We don't sign-extend constants and already sign-extended values.
+ if (InputConst || Opc == ISD::AssertSext || Opc == ISD::SIGN_EXTEND_INREG ||
+ Opc == ISD::SIGN_EXTEND)
+ return Input;
+
+ SDLoc dl(Input);
+ SignExtensionsAdded++;
+ return SDValue(CurDAG->getMachineNode(PPC::EXTSW_32, dl, MVT::i32, Input), 0);
+}
+
+/// If the value isn't guaranteed to be zero-extended to 64-bits, extend it.
+/// Useful when emitting comparison code for 32-bit values without using
+/// the compare instruction (which only considers the lower 32-bits).
+SDValue PPCDAGToDAGISel::zeroExtendInputIfNeeded(SDValue Input) {
+ assert(Input.getValueType() == MVT::i32 &&
+ "Can only zero-extend 32-bit values here.");
+ LoadSDNode *InputLoad = dyn_cast<LoadSDNode>(Input);
+ unsigned Opc = Input.getOpcode();
+
+ // No need to zero-extend loaded values (unless they're loaded with
+ // a sign-extending load).
+ if (InputLoad && InputLoad->getExtensionType() != ISD::SEXTLOAD)
+ return Input;
+
+ ConstantSDNode *InputConst = dyn_cast<ConstantSDNode>(Input);
+ bool InputZExtConst = InputConst && InputConst->getSExtValue() >= 0;
+ // An ISD::TRUNCATE will be lowered to an EXTRACT_SUBREG so we have
+ // to conservatively actually clear the high bits. We also don't need to
+ // zero-extend constants or values that are already zero-extended.
+ if (InputZExtConst || Opc == ISD::AssertZext || Opc == ISD::ZERO_EXTEND)
+ return Input;
+
+ SDLoc dl(Input);
+ ZeroExtensionsAdded++;
+ return SDValue(CurDAG->getMachineNode(PPC::RLDICL_32, dl, MVT::i32, Input,
+ getI64Imm(0, dl), getI64Imm(32, dl)),
+ 0);
+}
+
+// Handle a 32-bit value in a 64-bit register and vice versa. These do not
+// generate any actual zero/sign-extension machine code; they just
+// reinterpret a 32-bit value in a register as a 64-bit value and back.
+SDValue PPCDAGToDAGISel::addExtOrTrunc(SDValue NatWidthRes,
+ ExtOrTruncConversion Conv) {
+ SDLoc dl(NatWidthRes);
+
+ // For reinterpreting 32-bit values as 64 bit values, we generate
+ // INSERT_SUBREG IMPLICIT_DEF:i64, <input>, TargetConstant:i32<1>
+ if (Conv == ExtOrTruncConversion::Ext) {
+ SDValue ImDef(CurDAG->getMachineNode(PPC::IMPLICIT_DEF, dl, MVT::i64), 0);
+ SDValue SubRegIdx =
+ CurDAG->getTargetConstant(PPC::sub_32, dl, MVT::i32);
+ return SDValue(CurDAG->getMachineNode(PPC::INSERT_SUBREG, dl, MVT::i64,
+ ImDef, NatWidthRes, SubRegIdx), 0);
+ }
+
+ assert(Conv == ExtOrTruncConversion::Trunc &&
+ "Unknown convertion between 32 and 64 bit values.");
+ // For reinterpreting 64-bit values as 32-bit values, we just need to
+ // EXTRACT_SUBREG (i.e. extract the low word).
+ SDValue SubRegIdx =
+ CurDAG->getTargetConstant(PPC::sub_32, dl, MVT::i32);
+ return SDValue(CurDAG->getMachineNode(PPC::EXTRACT_SUBREG, dl, MVT::i32,
+ NatWidthRes, SubRegIdx), 0);
+}
+
+/// Produces a zero-extended result of comparing two 32-bit values according to
+/// the passed condition code.
+SDValue PPCDAGToDAGISel::get32BitZExtCompare(SDValue LHS, SDValue RHS,
+ ISD::CondCode CC,
+ int64_t RHSValue, SDLoc dl) {
+ bool IsRHSZero = RHSValue == 0;
+ switch (CC) {
+ default: return SDValue();
+ case ISD::SETEQ: {
+ // (zext (setcc %a, %b, seteq)) -> (lshr (cntlzw (xor %a, %b)), 5)
+ // (zext (setcc %a, 0, seteq)) -> (lshr (cntlzw %a), 5)
+ SDValue Xor = IsRHSZero ? LHS :
+ SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0);
+ SDValue Clz =
+ SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Xor), 0);
+ SDValue ShiftOps[] = { Clz, getI32Imm(27, dl), getI32Imm(5, dl),
+ getI32Imm(31, dl) };
+ return SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32,
+ ShiftOps), 0);
+ }
+ }
+}
+
+/// Produces a sign-extended result of comparing two 32-bit values according to
+/// the passed condition code.
+SDValue PPCDAGToDAGISel::get32BitSExtCompare(SDValue LHS, SDValue RHS,
+ ISD::CondCode CC,
+ int64_t RHSValue, SDLoc dl) {
+ bool IsRHSZero = RHSValue == 0;
+ switch (CC) {
+ default: return SDValue();
+ case ISD::SETEQ: {
+ // (sext (setcc %a, %b, seteq)) ->
+ // (ashr (shl (ctlz (xor %a, %b)), 58), 63)
+ // (sext (setcc %a, 0, seteq)) ->
+ // (ashr (shl (ctlz %a), 58), 63)
+ SDValue CountInput = IsRHSZero ? LHS :
+ SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0);
+ SDValue Cntlzw =
+ SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, CountInput), 0);
+ SDValue SHLOps[] = { Cntlzw, getI32Imm(58, dl), getI32Imm(0, dl) };
+ SDValue Sldi =
+ SDValue(CurDAG->getMachineNode(PPC::RLDICR_32, dl, MVT::i32, SHLOps), 0);
+ return SDValue(CurDAG->getMachineNode(PPC::SRADI_32, dl, MVT::i32, Sldi,
+ getI32Imm(63, dl)), 0);
+ }
+ }
+}
+
+/// Returns an equivalent of a SETCC node but with the result the same width as
+/// the inputs. This can also be used for SELECT_CC if either the true or false
+/// value is a power of two while the other is zero.
+SDValue PPCDAGToDAGISel::getSETCCInGPR(SDValue Compare,
+ SetccInGPROpts ConvOpts) {
+ assert((Compare.getOpcode() == ISD::SETCC ||
+ Compare.getOpcode() == ISD::SELECT_CC) &&
+ "An ISD::SETCC node required here.");
+
+ SDValue LHS = Compare.getOperand(0);
+ SDValue RHS = Compare.getOperand(1);
+
+ // The condition code is operand 2 for SETCC and operand 4 for SELECT_CC.
+ int CCOpNum = Compare.getOpcode() == ISD::SELECT_CC ? 4 : 2;
+ ISD::CondCode CC =
+ cast<CondCodeSDNode>(Compare.getOperand(CCOpNum))->get();
+ EVT InputVT = LHS.getValueType();
+ if (InputVT != MVT::i32)
+ return SDValue();
+
+ SDLoc dl(Compare);
+ ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
+ int64_t RHSValue = RHSConst ? RHSConst->getSExtValue() : INT64_MAX;
+
+ if (ConvOpts == SetccInGPROpts::ZExtInvert ||
+ ConvOpts == SetccInGPROpts::SExtInvert)
+ CC = ISD::getSetCCInverse(CC, true);
+
+ if (ISD::isSignedIntSetCC(CC)) {
+ LHS = signExtendInputIfNeeded(LHS);
+ RHS = signExtendInputIfNeeded(RHS);
+ } else if (ISD::isUnsignedIntSetCC(CC)) {
+ LHS = zeroExtendInputIfNeeded(LHS);
+ RHS = zeroExtendInputIfNeeded(RHS);
+ }
+
+ bool IsSext = ConvOpts == SetccInGPROpts::SExtOrig ||
+ ConvOpts == SetccInGPROpts::SExtInvert;
+ if (IsSext)
+ return get32BitSExtCompare(LHS, RHS, CC, RHSValue, dl);
+ return get32BitZExtCompare(LHS, RHS, CC, RHSValue, dl);
+}
+
void PPCDAGToDAGISel::transferMemOperands(SDNode *N, SDNode *Result) {
// Transfer memoperands.
MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
@@ -2508,6 +2757,12 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
}
break;
+ case ISD::ZERO_EXTEND:
+ case ISD::SIGN_EXTEND:
+ if (tryEXTEND(N))
+ return;
+ break;
+
case ISD::SETCC:
if (trySETCC(N))
return;
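Both expansions documented in the new PPC ISel code above hinge on one property of cntlzw: for a 32-bit value x, clz(x) == 32 exactly when x == 0, so bit 5 of the count is the equality predicate. The zero-extended form shifts that bit down to bit 0; the sign-extended form parks it in bit 63 and smears it with an arithmetic shift. A standalone check of both identities, with a portable C++ stand-in for cntlzw:

    #include <cassert>
    #include <cstdint>

    static unsigned Clz32(uint32_t X) { // cntlzw semantics: Clz32(0) == 32
      unsigned N = 0;
      for (int Bit = 31; Bit >= 0 && !((X >> Bit) & 1); --Bit)
        ++N;
      return N;
    }

    int main() {
      uint32_t A = 0x1234, B = 0x1234, C = 0x1235;
      // (zext (setcc a, b, seteq)) -> (lshr (cntlzw (xor a, b)), 5)
      assert((Clz32(A ^ B) >> 5) == 1);
      assert((Clz32(A ^ C) >> 5) == 0);
      // (sext (setcc a, b, seteq)) -> (ashr (shl (ctlz (xor a, b)), 58), 63)
      auto SextEq = [](uint32_t Clz) {
        return (int64_t)((uint64_t)Clz << 58) >> 63; // arithmetic shift
      };
      assert(SextEq(Clz32(A ^ B)) == -1);
      assert(SextEq(Clz32(A ^ C)) == 0);
    }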
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 685f24cb502e..17bdd595da10 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -923,6 +923,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setStackPointerRegisterToSaveRestore(isPPC64 ? PPC::X1 : PPC::R1);
// We have target-specific dag combine patterns for the following nodes:
+ setTargetDAGCombine(ISD::SHL);
+ setTargetDAGCombine(ISD::SRA);
+ setTargetDAGCombine(ISD::SRL);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::BUILD_VECTOR);
if (Subtarget.hasFPCVT())
@@ -4949,8 +4952,7 @@ SDValue PPCTargetLowering::LowerCall_32SVR4(
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
- dl);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
SDValue CallSeqStart = Chain;
// Load the return address and frame pointer so it can be moved somewhere else
@@ -5000,9 +5002,8 @@ SDValue PPCTargetLowering::LowerCall_32SVR4(
Flags, DAG, dl);
// This must go outside the CALLSEQ_START..END.
- SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
- CallSeqStart.getNode()->getOperand(1),
- SDLoc(MemcpyCall));
+ SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, NumBytes, 0,
+ SDLoc(MemcpyCall));
DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
NewCallSeqStart.getNode());
Chain = CallSeqStart = NewCallSeqStart;
@@ -5083,9 +5084,9 @@ SDValue PPCTargetLowering::createMemcpyOutsideCallSeq(
CallSeqStart.getNode()->getOperand(0),
Flags, DAG, dl);
// The MEMCPY must go outside the CALLSEQ_START..END.
- SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
- CallSeqStart.getNode()->getOperand(1),
- SDLoc(MemcpyCall));
+ int64_t FrameSize = CallSeqStart.getConstantOperandVal(1);
+ SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, FrameSize, 0,
+ SDLoc(MemcpyCall));
DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
NewCallSeqStart.getNode());
return NewCallSeqStart;
@@ -5268,8 +5269,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
if (!IsSibCall)
- Chain = DAG.getCALLSEQ_START(Chain,
- DAG.getIntPtrConstant(NumBytes, dl, true), dl);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
SDValue CallSeqStart = Chain;
// Load the return address and frame pointer so it can be move somewhere else
@@ -5828,8 +5828,7 @@ SDValue PPCTargetLowering::LowerCall_Darwin(
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
- dl);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
SDValue CallSeqStart = Chain;
// Load the return address and frame pointer so it can be move somewhere else
@@ -8741,9 +8740,9 @@ static Instruction* callIntrinsic(IRBuilder<> &Builder, Intrinsic::ID Id) {
// The mappings for emitLeading/TrailingFence is taken from
// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
-Instruction* PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
- AtomicOrdering Ord, bool IsStore,
- bool IsLoad) const {
+Instruction *PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
+ Instruction *Inst,
+ AtomicOrdering Ord) const {
if (Ord == AtomicOrdering::SequentiallyConsistent)
return callIntrinsic(Builder, Intrinsic::ppc_sync);
if (isReleaseOrStronger(Ord))
@@ -8751,10 +8750,10 @@ Instruction* PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
return nullptr;
}
-Instruction* PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
- AtomicOrdering Ord, bool IsStore,
- bool IsLoad) const {
- if (IsLoad && isAcquireOrStronger(Ord))
+Instruction *PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
+ Instruction *Inst,
+ AtomicOrdering Ord) const {
+ if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord))
return callIntrinsic(Builder, Intrinsic::ppc_lwsync);
// FIXME: this is too conservative, a dependent branch + isync is enough.
// See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and
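The fence hooks now receive the atomic instruction itself, so the trailing-fence decision can ask `Inst->hasAtomicLoad()` instead of threading IsStore/IsLoad flags through every caller; loads, cmpxchg, and atomicrmw all perform an atomic load and therefore still get the acquire lwsync. A comment-form summary of the mapping the two hunks implement (restating the cited table, not new behavior):

    // seq_cst             : leading ppc_sync
    // release or stronger : leading ppc_lwsync
    // acquire or stronger : trailing ppc_lwsync, only when the instruction
    //                       performs an atomic load (load/cmpxchg/atomicrmw)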
@@ -11316,6 +11315,12 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
SDLoc dl(N);
switch (N->getOpcode()) {
default: break;
+ case ISD::SHL:
+ return combineSHL(N, DCI);
+ case ISD::SRA:
+ return combineSRA(N, DCI);
+ case ISD::SRL:
+ return combineSRL(N, DCI);
case PPCISD::SHL:
if (isNullConstant(N->getOperand(0))) // 0 << V -> 0.
return N->getOperand(0);
@@ -12948,3 +12953,58 @@ bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
return Imm.isPosZero();
}
}
+
+// For vector shift operation op, fold
+// (op x, (and y, ((1 << numbits(x)) - 1))) -> (target op x, y)
+static SDValue stripModuloOnShift(const TargetLowering &TLI, SDNode *N,
+ SelectionDAG &DAG) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N0.getValueType();
+ unsigned OpSizeInBits = VT.getScalarSizeInBits();
+ unsigned Opcode = N->getOpcode();
+ unsigned TargetOpcode;
+
+ switch (Opcode) {
+ default:
+ llvm_unreachable("Unexpected shift operation");
+ case ISD::SHL:
+ TargetOpcode = PPCISD::SHL;
+ break;
+ case ISD::SRL:
+ TargetOpcode = PPCISD::SRL;
+ break;
+ case ISD::SRA:
+ TargetOpcode = PPCISD::SRA;
+ break;
+ }
+
+ if (VT.isVector() && TLI.isOperationLegal(Opcode, VT) &&
+ N1->getOpcode() == ISD::AND)
+ if (ConstantSDNode *Mask = isConstOrConstSplat(N1->getOperand(1)))
+ if (Mask->getZExtValue() == OpSizeInBits - 1)
+ return DAG.getNode(TargetOpcode, SDLoc(N), VT, N0, N1->getOperand(0));
+
+ return SDValue();
+}
+
+SDValue PPCTargetLowering::combineSHL(SDNode *N, DAGCombinerInfo &DCI) const {
+ if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
+ return Value;
+
+ return SDValue();
+}
+
+SDValue PPCTargetLowering::combineSRA(SDNode *N, DAGCombinerInfo &DCI) const {
+ if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
+ return Value;
+
+ return SDValue();
+}
+
+SDValue PPCTargetLowering::combineSRL(SDNode *N, DAGCombinerInfo &DCI) const {
+ if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
+ return Value;
+
+ return SDValue();
+}
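
The fold is sound because the PPC target shift nodes already reduce each lane's shift amount modulo the element width (see the PPCISelLowering.h comment below), so the explicit AND mask in the IR is redundant. A quick self-check for one v4i32 lane, modeled in plain C++ (vslw_lane is a hypothetical name for the hardware behavior):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    // Altivec vslw uses only the low log2(32) = 5 bits of each lane's
    // shift amount, i.e. it shifts by (amt % 32).
    static uint32_t vslw_lane(uint32_t x, uint32_t amt) {
      return x << (amt & 31);
    }

    int main() {
      const uint32_t x = 0x80000001u;
      for (uint32_t amt = 0; amt < 256; ++amt) {
        uint32_t masked = x << (amt & 31);   // IR: shl(x, and(amt, 31))
        assert(masked == vslw_lane(x, amt)); // target: PPCISD::SHL(x, amt)
      }
      std::puts("AND mask is redundant for every shift amount");
      return 0;
    }
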
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index 32661099b79d..4fc744257262 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -117,9 +117,13 @@ namespace llvm {
/// at function entry, used for PIC code.
GlobalBaseReg,
- /// These nodes represent the 32-bit PPC shifts that operate on 6-bit
- /// shift amounts. These nodes are generated by the multi-precision shift
- /// code.
+ /// These nodes represent PPC shifts.
+ ///
+ /// For scalar types, only the low-order `n + 1` bits of the shift amounts
+ /// are used, where n is log2(sizeof(element) * 8). See sld/slw, etc.
+ /// for the exact behavior.
+ ///
+ /// For vector types, only the low-order n bits are used. See vsld.
SRL, SRA, SHL,
/// The combination of sra[wd]i and addze used to implement signed
@@ -617,10 +621,10 @@ namespace llvm {
return true;
}
- Instruction* emitLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
- bool IsStore, bool IsLoad) const override;
- Instruction* emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
- bool IsStore, bool IsLoad) const override;
+ Instruction *emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst,
+ AtomicOrdering Ord) const override;
+ Instruction *emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst,
+ AtomicOrdering Ord) const override;
MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr &MI,
@@ -999,6 +1003,9 @@ namespace llvm {
SDValue DAGCombineBuildVector(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue DAGCombineTruncBoolExt(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineFPToIntToFP(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineSHL(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineSRA(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineSRL(SDNode *N, DAGCombinerInfo &DCI) const;
/// ConvertSETCCToSubtract - looks at SETCC that compares ints. It replaces
/// SETCC with integer subtraction when (1) there is a legal way of doing it
@@ -1017,14 +1024,6 @@ namespace llvm {
SDValue
combineElementTruncationToVectorTruncation(SDNode *N,
DAGCombinerInfo &DCI) const;
-
- bool supportsModuloShift(ISD::NodeType Inst,
- EVT ReturnType) const override {
- assert((Inst == ISD::SHL || Inst == ISD::SRA || Inst == ISD::SRL) &&
- "Expect a shift instruction");
- assert(isOperationLegal(Inst, ReturnType));
- return ReturnType.isVector();
- }
};
namespace PPC {
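
The n-versus-n+1 distinction in the comment above can be made concrete. For 32-bit elements n = 5: scalar slw consumes six bits of the shift amount (values 32-63 shift everything out and produce 0), while vector vslw consumes only five (amounts wrap modulo 32). A small model, assuming the ISA behavior the comment describes:

    #include <cstdint>
    #include <cstdio>

    // Scalar slw: looks at the low 6 bits; 32..63 yields zero.
    static uint32_t slw(uint32_t x, uint32_t amt) {
      amt &= 63;
      return amt > 31 ? 0 : x << amt;
    }

    // Vector vslw (one lane): looks at the low 5 bits only.
    static uint32_t vslw_lane(uint32_t x, uint32_t amt) {
      return x << (amt & 31);
    }

    int main() {
      std::printf("slw(1, 33)       = %u\n", slw(1, 33));       // 0
      std::printf("vslw_lane(1, 33) = %u\n", vslw_lane(1, 33)); // 2 (33 % 32 == 1)
      return 0;
    }
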
diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td
index 997b96ca6ec8..a8433919f0f3 100644
--- a/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -634,10 +634,19 @@ let Interpretation64Bit = 1, isCodeGenOnly = 1 in
defm EXTSW_32_64 : XForm_11r<31, 986, (outs g8rc:$rA), (ins gprc:$rS),
"extsw", "$rA, $rS", IIC_IntSimple,
[(set i64:$rA, (sext i32:$rS))]>, isPPC64;
+let isCodeGenOnly = 1 in
+def EXTSW_32 : XForm_11<31, 986, (outs gprc:$rA), (ins gprc:$rS),
+ "extsw $rA, $rS", IIC_IntSimple,
+ []>, isPPC64;
defm SRADI : XSForm_1rc<31, 413, (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH),
"sradi", "$rA, $rS, $SH", IIC_IntRotateDI,
[(set i64:$rA, (sra i64:$rS, (i32 imm:$SH)))]>, isPPC64;
+// For fast-isel:
+let isCodeGenOnly = 1 in
+def SRADI_32 : XSForm_1<31, 413, (outs gprc:$rA), (ins gprc:$rS, u6imm:$SH),
+ "sradi $rA, $rS, $SH", IIC_IntRotateDI, []>, isPPC64;
+
defm CNTLZD : XForm_11r<31, 58, (outs g8rc:$rA), (ins g8rc:$rS),
"cntlzd", "$rA, $rS", IIC_IntGeneral,
[(set i64:$rA, (ctlz i64:$rS))]>;
@@ -721,15 +730,26 @@ defm RLDICL : MDForm_1r<30, 0,
// For fast-isel:
let isCodeGenOnly = 1 in
def RLDICL_32_64 : MDForm_1<30, 0,
- (outs g8rc:$rA),
- (ins gprc:$rS, u6imm:$SH, u6imm:$MBE),
- "rldicl $rA, $rS, $SH, $MBE", IIC_IntRotateDI,
- []>, isPPC64;
+ (outs g8rc:$rA),
+ (ins gprc:$rS, u6imm:$SH, u6imm:$MBE),
+ "rldicl $rA, $rS, $SH, $MBE", IIC_IntRotateDI,
+ []>, isPPC64;
// End fast-isel.
+let isCodeGenOnly = 1 in
+def RLDICL_32 : MDForm_1<30, 0,
+ (outs gprc:$rA),
+ (ins gprc:$rS, u6imm:$SH, u6imm:$MBE),
+ "rldicl $rA, $rS, $SH, $MBE", IIC_IntRotateDI,
+ []>, isPPC64;
defm RLDICR : MDForm_1r<30, 1,
(outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE),
"rldicr", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI,
[]>, isPPC64;
+let isCodeGenOnly = 1 in
+def RLDICR_32 : MDForm_1<30, 1,
+ (outs gprc:$rA), (ins gprc:$rS, u6imm:$SH, u6imm:$MBE),
+ "rldicr $rA, $rS, $SH, $MBE", IIC_IntRotateDI,
+ []>, isPPC64;
defm RLDIC : MDForm_1r<30, 2,
(outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE),
"rldic", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI,
diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td
index c380766e9f5c..e14d18fd5433 100644
--- a/lib/Target/PowerPC/PPCInstrAltivec.td
+++ b/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -987,6 +987,12 @@ def : Pat<(v8i16 (shl v8i16:$vA, v8i16:$vB)),
(v8i16 (VSLH $vA, $vB))>;
def : Pat<(v4i32 (shl v4i32:$vA, v4i32:$vB)),
(v4i32 (VSLW $vA, $vB))>;
+def : Pat<(v16i8 (PPCshl v16i8:$vA, v16i8:$vB)),
+ (v16i8 (VSLB $vA, $vB))>;
+def : Pat<(v8i16 (PPCshl v8i16:$vA, v8i16:$vB)),
+ (v8i16 (VSLH $vA, $vB))>;
+def : Pat<(v4i32 (PPCshl v4i32:$vA, v4i32:$vB)),
+ (v4i32 (VSLW $vA, $vB))>;
def : Pat<(v16i8 (srl v16i8:$vA, v16i8:$vB)),
(v16i8 (VSRB $vA, $vB))>;
@@ -994,6 +1000,12 @@ def : Pat<(v8i16 (srl v8i16:$vA, v8i16:$vB)),
(v8i16 (VSRH $vA, $vB))>;
def : Pat<(v4i32 (srl v4i32:$vA, v4i32:$vB)),
(v4i32 (VSRW $vA, $vB))>;
+def : Pat<(v16i8 (PPCsrl v16i8:$vA, v16i8:$vB)),
+ (v16i8 (VSRB $vA, $vB))>;
+def : Pat<(v8i16 (PPCsrl v8i16:$vA, v8i16:$vB)),
+ (v8i16 (VSRH $vA, $vB))>;
+def : Pat<(v4i32 (PPCsrl v4i32:$vA, v4i32:$vB)),
+ (v4i32 (VSRW $vA, $vB))>;
def : Pat<(v16i8 (sra v16i8:$vA, v16i8:$vB)),
(v16i8 (VSRAB $vA, $vB))>;
@@ -1001,6 +1013,12 @@ def : Pat<(v8i16 (sra v8i16:$vA, v8i16:$vB)),
(v8i16 (VSRAH $vA, $vB))>;
def : Pat<(v4i32 (sra v4i32:$vA, v4i32:$vB)),
(v4i32 (VSRAW $vA, $vB))>;
+def : Pat<(v16i8 (PPCsra v16i8:$vA, v16i8:$vB)),
+ (v16i8 (VSRAB $vA, $vB))>;
+def : Pat<(v8i16 (PPCsra v8i16:$vA, v8i16:$vB)),
+ (v8i16 (VSRAH $vA, $vB))>;
+def : Pat<(v4i32 (PPCsra v4i32:$vA, v4i32:$vB)),
+ (v4i32 (VSRAW $vA, $vB))>;
// Float to integer and integer to float conversions
def : Pat<(v4i32 (fp_to_sint v4f32:$vA)),
@@ -1072,14 +1090,24 @@ def:Pat<(vmrgow_swapped_shuffle v16i8:$vA, v16i8:$vB),
// Vector shifts
def VRLD : VX1_Int_Ty<196, "vrld", int_ppc_altivec_vrld, v2i64>;
def VSLD : VXForm_1<1476, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
- "vsld $vD, $vA, $vB", IIC_VecGeneral,
- [(set v2i64:$vD, (shl v2i64:$vA, v2i64:$vB))]>;
+ "vsld $vD, $vA, $vB", IIC_VecGeneral, []>;
def VSRD : VXForm_1<1732, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
- "vsrd $vD, $vA, $vB", IIC_VecGeneral,
- [(set v2i64:$vD, (srl v2i64:$vA, v2i64:$vB))]>;
+ "vsrd $vD, $vA, $vB", IIC_VecGeneral, []>;
def VSRAD : VXForm_1<964, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
- "vsrad $vD, $vA, $vB", IIC_VecGeneral,
- [(set v2i64:$vD, (sra v2i64:$vA, v2i64:$vB))]>;
+ "vsrad $vD, $vA, $vB", IIC_VecGeneral, []>;
+
+def : Pat<(v2i64 (shl v2i64:$vA, v2i64:$vB)),
+ (v2i64 (VSLD $vA, $vB))>;
+def : Pat<(v2i64 (PPCshl v2i64:$vA, v2i64:$vB)),
+ (v2i64 (VSLD $vA, $vB))>;
+def : Pat<(v2i64 (srl v2i64:$vA, v2i64:$vB)),
+ (v2i64 (VSRD $vA, $vB))>;
+def : Pat<(v2i64 (PPCsrl v2i64:$vA, v2i64:$vB)),
+ (v2i64 (VSRD $vA, $vB))>;
+def : Pat<(v2i64 (sra v2i64:$vA, v2i64:$vB)),
+ (v2i64 (VSRAD $vA, $vB))>;
+def : Pat<(v2i64 (PPCsra v2i64:$vA, v2i64:$vB)),
+ (v2i64 (VSRAD $vA, $vB))>;
// Vector Integer Arithmetic Instructions
let isCommutable = 1 in {
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index f004ce49cac0..1af5e7f28342 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -33,7 +33,8 @@ def SDT_PPCVexts : SDTypeProfile<1, 2, [
SDTCisVT<0, f64>, SDTCisVT<1, f64>, SDTCisPtrTy<2>
]>;
-def SDT_PPCCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_PPCCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32> ]>;
def SDT_PPCCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>,
SDTCisVT<1, i32> ]>;
def SDT_PPCvperm : SDTypeProfile<1, 3, [
@@ -1099,9 +1100,11 @@ multiclass AForm_3r<bits<6> opcode, bits<5> xo, dag OOL, dag IOL,
let hasCtrlDep = 1 in {
let Defs = [R1], Uses = [R1] in {
-def ADJCALLSTACKDOWN : Pseudo<(outs), (ins u16imm:$amt), "#ADJCALLSTACKDOWN $amt",
- [(callseq_start timm:$amt)]>;
-def ADJCALLSTACKUP : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2), "#ADJCALLSTACKUP $amt1 $amt2",
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2),
+ "#ADJCALLSTACKDOWN $amt1 $amt2",
+ [(callseq_start timm:$amt1, timm:$amt2)]>;
+def ADJCALLSTACKUP : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2),
+ "#ADJCALLSTACKUP $amt1 $amt2",
[(callseq_end timm:$amt1, timm:$amt2)]>;
}
@@ -4163,6 +4166,8 @@ def : InstAlias<"rotldi. $rA, $rS, $n", (RLDICLo g8rc:$rA, g8rc:$rS, u6imm:$n, 0
def : InstAlias<"rotld $rA, $rS, $rB", (RLDCL g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>;
def : InstAlias<"rotld. $rA, $rS, $rB", (RLDCLo g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>;
def : InstAlias<"clrldi $rA, $rS, $n", (RLDICL g8rc:$rA, g8rc:$rS, 0, u6imm:$n)>;
+def : InstAlias<"clrldi $rA, $rS, $n",
+ (RLDICL_32 gprc:$rA, gprc:$rS, 0, u6imm:$n)>;
def : InstAlias<"clrldi. $rA, $rS, $n", (RLDICLo g8rc:$rA, g8rc:$rS, 0, u6imm:$n)>;
def RLWINMbm : PPCAsmPseudo<"rlwinm $rA, $rS, $n, $b",
diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td
index 967557452f24..b98140fedfc0 100644
--- a/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/lib/Target/PowerPC/PPCInstrVSX.td
@@ -1436,7 +1436,7 @@ let Predicates = [IsISA3_0, HasDirectMove] in {
def MTVSRWS: XX1_RS6_RD5_XO<31, 403, (outs vsrc:$XT), (ins gprc:$rA),
"mtvsrws $XT, $rA", IIC_VecGeneral, []>;
- def MTVSRDD: XX1Form<31, 435, (outs vsrc:$XT), (ins g8rc:$rA, g8rc:$rB),
+ def MTVSRDD: XX1Form<31, 435, (outs vsrc:$XT), (ins g8rc_nox0:$rA, g8rc:$rB),
"mtvsrdd $XT, $rA, $rB", IIC_VecGeneral,
[]>, Requires<[In64BitMode]>;
diff --git a/lib/Target/PowerPC/PPCTLSDynamicCall.cpp b/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
index 0c1260a2965b..c7aa4cb78b7a 100644
--- a/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
+++ b/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
@@ -99,7 +99,8 @@ protected:
// Don't really need to save data to the stack - the clobbered
// registers are already saved when the SDNode (e.g. PPCaddiTlsgdLAddr)
// gets translated to the pseudo instruction (e.g. ADDItlsgdLADDR).
- BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKDOWN)).addImm(0);
+ BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKDOWN)).addImm(0)
+ .addImm(0);
// Expand into two ops built prior to the existing instruction.
MachineInstr *Addi = BuildMI(MBB, I, DL, TII->get(Opc1), GPR3)
diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp
index acb34d5baaa8..9e7e3c6b705a 100644
--- a/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/lib/Target/Sparc/SparcISelLowering.cpp
@@ -773,8 +773,7 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
}
}
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(ArgsSize, dl, true),
- dl);
+ Chain = DAG.getCALLSEQ_START(Chain, ArgsSize, 0, dl);
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
@@ -1165,8 +1164,7 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
// Adjust the stack pointer to make room for the arguments.
// FIXME: Use hasReservedCallFrame to avoid %sp adjustments around all calls
// with more than 6 arguments.
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(ArgsSize, DL, true),
- DL);
+ Chain = DAG.getCALLSEQ_START(Chain, ArgsSize, 0, DL);
// Collect the set of registers to pass to the function and their values.
// This will be emitted as a sequence of CopyToReg nodes glued to the call
@@ -2058,7 +2056,7 @@ SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op,
SDValue Chain = DAG.getEntryNode();
SDValue InFlag;
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(1, DL, true), DL);
+ Chain = DAG.getCALLSEQ_START(Chain, 1, 0, DL);
Chain = DAG.getCopyToReg(Chain, DL, SP::O0, Argument, InFlag);
InFlag = Chain.getValue(1);
SDValue Callee = DAG.getTargetExternalSymbol("__tls_get_addr", PtrVT);
@@ -3386,7 +3384,10 @@ SparcTargetLowering::getConstraintType(StringRef Constraint) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
default: break;
- case 'r': return C_RegisterClass;
+ case 'r':
+ case 'f':
+ case 'e':
+ return C_RegisterClass;
case 'I': // SIMM13
return C_Other;
}
@@ -3465,6 +3466,24 @@ SparcTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return std::make_pair(0U, &SP::IntPairRegClass);
else
return std::make_pair(0U, &SP::IntRegsRegClass);
+ case 'f':
+ if (VT == MVT::f32)
+ return std::make_pair(0U, &SP::FPRegsRegClass);
+ else if (VT == MVT::f64)
+ return std::make_pair(0U, &SP::LowDFPRegsRegClass);
+ else if (VT == MVT::f128)
+ return std::make_pair(0U, &SP::LowQFPRegsRegClass);
+ llvm_unreachable("Unknown ValueType for f-register-type!");
+ break;
+ case 'e':
+ if (VT == MVT::f32)
+ return std::make_pair(0U, &SP::FPRegsRegClass);
+ else if (VT == MVT::f64)
+ return std::make_pair(0U, &SP::DFPRegsRegClass);
+ else if (VT == MVT::f128)
+ return std::make_pair(0U, &SP::QFPRegsRegClass);
+ llvm_unreachable("Unknown ValueType for e-register-type!");
+ break;
}
} else if (!Constraint.empty() && Constraint.size() <= 5
&& Constraint[0] == '{' && *(Constraint.end()-1) == '}') {
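
With 'f' and 'e' wired up, SPARC inline assembly that names floating-point register classes can now be register-allocated. A hedged usage sketch (compiles only when targeting SPARC with an FPU; faddd is the double-precision add, and 'e' accepts the full D-register file, matching the DFPRegs mapping above):

    // Sketch only: requires a SPARC target.
    double fadd_inline(double a, double b) {
      double r;
      asm("faddd %1, %2, %0" : "=e"(r) : "e"(a), "e"(b));
      return r;
    }
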
diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td
index 5a19c624abb5..ae45c8be6752 100644
--- a/lib/Target/Sparc/SparcInstrInfo.td
+++ b/lib/Target/Sparc/SparcInstrInfo.td
@@ -195,7 +195,8 @@ def SPsjlj_longjmp: SDNode<"SPISD::EH_SJLJ_LONGJMP",
[SDNPHasChain, SDNPSideEffect]>;
// These are target-independent nodes, but have target-specific formats.
-def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32> ]>;
def SDT_SPCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>,
SDTCisVT<1, i32> ]>;
@@ -404,9 +405,9 @@ let Defs = [O7] in {
}
let Defs = [O6], Uses = [O6] in {
-def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
- "!ADJCALLSTACKDOWN $amt",
- [(callseq_start timm:$amt)]>;
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "!ADJCALLSTACKDOWN $amt1, $amt2",
+ [(callseq_start timm:$amt1, timm:$amt2)]>;
def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
"!ADJCALLSTACKUP $amt1",
[(callseq_end timm:$amt1, timm:$amt2)]>;
diff --git a/lib/Target/Sparc/SparcRegisterInfo.td b/lib/Target/Sparc/SparcRegisterInfo.td
index 6ecfddfc7d66..6625eaafd992 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.td
+++ b/lib/Target/Sparc/SparcRegisterInfo.td
@@ -346,11 +346,13 @@ def I64Regs : RegisterClass<"SP", [i64], 64, (add IntRegs)>;
// Floating point register classes.
def FPRegs : RegisterClass<"SP", [f32], 32, (sequence "F%u", 0, 31)>;
-
def DFPRegs : RegisterClass<"SP", [f64], 64, (sequence "D%u", 0, 31)>;
-
def QFPRegs : RegisterClass<"SP", [f128], 128, (sequence "Q%u", 0, 15)>;
+// The Low?FPRegs classes are used only for inline-asm constraints.
+def LowDFPRegs : RegisterClass<"SP", [f64], 64, (sequence "D%u", 0, 15)>;
+def LowQFPRegs : RegisterClass<"SP", [f128], 128, (sequence "Q%u", 0, 7)>;
+
// Floating point control register classes.
def FCCRegs : RegisterClass<"SP", [i1], 1, (sequence "FCC%u", 0, 3)>;
diff --git a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
index 3f91ca9035a6..efcf6696fd50 100644
--- a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
+++ b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
@@ -262,6 +262,9 @@ public:
bool isMemDisp20(MemoryKind MemKind, RegisterKind RegKind) const {
return isMem(MemKind, RegKind) && inRange(Mem.Disp, -524288, 524287);
}
+ bool isMemDisp12Len4(RegisterKind RegKind) const {
+ return isMemDisp12(BDLMem, RegKind) && inRange(Mem.Length.Imm, 1, 0x10);
+ }
bool isMemDisp12Len8(RegisterKind RegKind) const {
return isMemDisp12(BDLMem, RegKind) && inRange(Mem.Length.Imm, 1, 0x100);
}
@@ -347,6 +350,7 @@ public:
bool isBDAddr64Disp20() const { return isMemDisp20(BDMem, ADDR64Reg); }
bool isBDXAddr64Disp12() const { return isMemDisp12(BDXMem, ADDR64Reg); }
bool isBDXAddr64Disp20() const { return isMemDisp20(BDXMem, ADDR64Reg); }
+ bool isBDLAddr64Disp12Len4() const { return isMemDisp12Len4(ADDR64Reg); }
bool isBDLAddr64Disp12Len8() const { return isMemDisp12Len8(ADDR64Reg); }
bool isBDRAddr64Disp12() const { return isMemDisp12(BDRMem, ADDR64Reg); }
bool isBDVAddr64Disp12() const { return isMemDisp12(BDVMem, ADDR64Reg); }
diff --git a/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
index a281a0aa6bcc..27fd70bc6092 100644
--- a/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
+++ b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
@@ -327,6 +327,18 @@ static DecodeStatus decodeBDXAddr20Operand(MCInst &Inst, uint64_t Field,
return MCDisassembler::Success;
}
+static DecodeStatus decodeBDLAddr12Len4Operand(MCInst &Inst, uint64_t Field,
+ const unsigned *Regs) {
+ uint64_t Length = Field >> 16;
+ uint64_t Base = (Field >> 12) & 0xf;
+ uint64_t Disp = Field & 0xfff;
+ assert(Length < 16 && "Invalid BDLAddr12Len4");
+ Inst.addOperand(MCOperand::createReg(Base == 0 ? 0 : Regs[Base]));
+ Inst.addOperand(MCOperand::createImm(Disp));
+ Inst.addOperand(MCOperand::createImm(Length + 1));
+ return MCDisassembler::Success;
+}
+
static DecodeStatus decodeBDLAddr12Len8Operand(MCInst &Inst, uint64_t Field,
const unsigned *Regs) {
uint64_t Length = Field >> 16;
@@ -399,6 +411,13 @@ static DecodeStatus decodeBDXAddr64Disp20Operand(MCInst &Inst, uint64_t Field,
return decodeBDXAddr20Operand(Inst, Field, SystemZMC::GR64Regs);
}
+static DecodeStatus decodeBDLAddr64Disp12Len4Operand(MCInst &Inst,
+ uint64_t Field,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeBDLAddr12Len4Operand(Inst, Field, SystemZMC::GR64Regs);
+}
+
static DecodeStatus decodeBDLAddr64Disp12Len8Operand(MCInst &Inst,
uint64_t Field,
uint64_t Address,
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
index 092eb4011adc..d188f56512ab 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
@@ -77,6 +77,9 @@ private:
uint64_t getBDXAddr20Encoding(const MCInst &MI, unsigned OpNum,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
+ uint64_t getBDLAddr12Len4Encoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
uint64_t getBDLAddr12Len8Encoding(const MCInst &MI, unsigned OpNum,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
@@ -220,6 +223,17 @@ getBDXAddr20Encoding(const MCInst &MI, unsigned OpNum,
}
uint64_t SystemZMCCodeEmitter::
+getBDLAddr12Len4Encoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups, STI);
+ uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups, STI);
+ uint64_t Len = getMachineOpValue(MI, MI.getOperand(OpNum + 2), Fixups, STI) - 1;
+ assert(isUInt<4>(Base) && isUInt<12>(Disp) && isUInt<4>(Len));
+ return (Len << 16) | (Base << 12) | Disp;
+}
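
This encoder and the disassembler's decodeBDLAddr12Len4Operand above must agree on the field layout: length minus one in bits 16-19, base register in bits 12-15, displacement in bits 0-11. A standalone round-trip check of that layout (names local to the snippet):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    // Mirrors getBDLAddr12Len4Encoding: the length is stored biased by one.
    static uint64_t encode(uint64_t Base, uint64_t Disp, uint64_t Len) {
      assert(Base < 16 && Disp < 4096 && Len >= 1 && Len <= 16);
      return ((Len - 1) << 16) | (Base << 12) | Disp;
    }

    // Mirrors decodeBDLAddr12Len4Operand.
    static void decode(uint64_t Field, uint64_t &Base, uint64_t &Disp,
                       uint64_t &Len) {
      Len = (Field >> 16) + 1;
      Base = (Field >> 12) & 0xf;
      Disp = Field & 0xfff;
    }

    int main() {
      uint64_t B, D, L;
      decode(encode(13, 0x123, 4), B, D, L);
      assert(B == 13 && D == 0x123 && L == 4);
      std::printf("base=%llu disp=0x%llx len=%llu\n",
                  (unsigned long long)B, (unsigned long long)D,
                  (unsigned long long)L);
      return 0;
    }
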
+
+uint64_t SystemZMCCodeEmitter::
getBDLAddr12Len8Encoding(const MCInst &MI, unsigned OpNum,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
diff --git a/lib/Target/SystemZ/README.txt b/lib/Target/SystemZ/README.txt
index 86a1322c9e23..74cf653b9d95 100644
--- a/lib/Target/SystemZ/README.txt
+++ b/lib/Target/SystemZ/README.txt
@@ -63,7 +63,7 @@ via a register.)
--
-We don't use ICM or STCM.
+We don't use ICM, STCM, or CLM.
--
diff --git a/lib/Target/SystemZ/SystemZFeatures.td b/lib/Target/SystemZ/SystemZFeatures.td
index 716e5add8051..7bfa378aa85c 100644
--- a/lib/Target/SystemZ/SystemZFeatures.td
+++ b/lib/Target/SystemZ/SystemZFeatures.td
@@ -68,6 +68,11 @@ def FeaturePopulationCount : SystemZFeature<
"Assume that the population-count facility is installed"
>;
+def FeatureMessageSecurityAssist4 : SystemZFeature<
+ "message-security-assist-extension4", "MessageSecurityAssist4",
+ "Assume that the message-security-assist extension facility 4 is installed"
+>;
+
def Arch9NewFeatures : SystemZFeatureList<[
FeatureDistinctOps,
FeatureFastSerialization,
@@ -75,7 +80,8 @@ def Arch9NewFeatures : SystemZFeatureList<[
FeatureHighWord,
FeatureInterlockedAccess1,
FeatureLoadStoreOnCond,
- FeaturePopulationCount
+ FeaturePopulationCount,
+ FeatureMessageSecurityAssist4
]>;
//===----------------------------------------------------------------------===//
@@ -133,6 +139,11 @@ def FeatureLoadStoreOnCond2 : SystemZFeature<
"Assume that the load/store-on-condition facility 2 is installed"
>;
+def FeatureMessageSecurityAssist5 : SystemZFeature<
+ "message-security-assist-extension5", "MessageSecurityAssist5",
+ "Assume that the message-security-assist extension facility 5 is installed"
+>;
+
def FeatureVector : SystemZFeature<
"vector", "Vector",
"Assume that the vectory facility is installed"
@@ -142,6 +153,7 @@ def FeatureNoVector : SystemZMissingFeature<"Vector">;
def Arch11NewFeatures : SystemZFeatureList<[
FeatureLoadAndZeroRightmostByte,
FeatureLoadStoreOnCond2,
+ FeatureMessageSecurityAssist5,
FeatureVector
]>;
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index 6989aabb8c6a..235e095f0010 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -1110,9 +1110,7 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
// Mark the start of the call.
if (!IsTailCall)
- Chain = DAG.getCALLSEQ_START(Chain,
- DAG.getConstant(NumBytes, DL, PtrVT, true),
- DL);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
// Copy argument values to their designated locations.
SmallVector<std::pair<unsigned, SDValue>, 9> RegsToPass;
@@ -6354,3 +6352,12 @@ MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter(
llvm_unreachable("Unexpected instr type to insert");
}
}
+
+// This is only used by the isel schedulers, and is needed only to prevent
+// the compiler from crashing when list-ilp is used.
+const TargetRegisterClass *
+SystemZTargetLowering::getRepRegClassFor(MVT VT) const {
+ if (VT == MVT::Untyped)
+ return &SystemZ::ADDR128BitRegClass;
+ return TargetLowering::getRepRegClassFor(VT);
+}
diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h
index 1c34dc43e8bb..79c8c4d92669 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/lib/Target/SystemZ/SystemZISelLowering.h
@@ -590,6 +590,8 @@ private:
MachineBasicBlock *emitLoadAndTestCmp0(MachineInstr &MI,
MachineBasicBlock *MBB,
unsigned Opcode) const;
+
+ const TargetRegisterClass *getRepRegClassFor(MVT VT) const override;
};
} // end namespace llvm
diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td
index bb6d27e24828..364b81f98eed 100644
--- a/lib/Target/SystemZ/SystemZInstrFP.td
+++ b/lib/Target/SystemZ/SystemZInstrFP.td
@@ -458,6 +458,12 @@ def DXBR : BinaryRRE<"dxbr", 0xB34D, fdiv, FP128, FP128>;
def DEB : BinaryRXE<"deb", 0xED0D, fdiv, FP32, load, 4>;
def DDB : BinaryRXE<"ddb", 0xED1D, fdiv, FP64, load, 8>;
+// Divide to integer.
+let Defs = [CC] in {
+ def DIEBR : TernaryRRFb<"diebr", 0xB353, FP32, FP32, FP32>;
+ def DIDBR : TernaryRRFb<"didbr", 0xB35B, FP64, FP64, FP64>;
+}
+
//===----------------------------------------------------------------------===//
// Comparisons
//===----------------------------------------------------------------------===//
@@ -469,6 +475,13 @@ let Defs = [CC], CCValues = 0xF in {
def CEB : CompareRXE<"ceb", 0xED09, z_fcmp, FP32, load, 4>;
def CDB : CompareRXE<"cdb", 0xED19, z_fcmp, FP64, load, 8>;
+
+ def KEBR : CompareRRE<"kebr", 0xB308, null_frag, FP32, FP32>;
+ def KDBR : CompareRRE<"kdbr", 0xB318, null_frag, FP64, FP64>;
+ def KXBR : CompareRRE<"kxbr", 0xB348, null_frag, FP128, FP128>;
+
+ def KEB : CompareRXE<"keb", 0xED08, null_frag, FP32, load, 4>;
+ def KDB : CompareRXE<"kdb", 0xED18, null_frag, FP64, load, 8>;
}
// Test Data Class.
diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td
index c727f486087e..a37da2807854 100644
--- a/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -710,6 +710,21 @@ class InstRSI<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
let Inst{15-0} = RI2;
}
+class InstRSLa<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<20> BDL1;
+
+ let Inst{47-40} = op{15-8};
+ let Inst{39-36} = BDL1{19-16};
+ let Inst{35-32} = 0;
+ let Inst{31-16} = BDL1{15-0};
+ let Inst{15-8} = 0;
+ let Inst{7-0} = op{7-0};
+}
+
class InstRSYa<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
: InstSystemZ<6, outs, ins, asmstr, pattern> {
field bits<48> Inst;
@@ -817,6 +832,37 @@ class InstSSa<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
let Inst{15-0} = BD2;
}
+class InstSSb<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<20> BDL1;
+ bits<20> BDL2;
+
+ let Inst{47-40} = op;
+ let Inst{39-36} = BDL1{19-16};
+ let Inst{35-32} = BDL2{19-16};
+ let Inst{31-16} = BDL1{15-0};
+ let Inst{15-0} = BDL2{15-0};
+}
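
The SS-b layout packs two 20-bit base+displacement+length operands into one 48-bit word, splitting each operand's 4-bit length nibble from its 16-bit base/displacement half. The let-assignments translate mechanically into shifts and masks; a standalone mirror of them:

    #include <cstdint>
    #include <cstdio>

    // Mirrors the InstSSb field assignments; BDL1/BDL2 are the 20-bit
    // operand fields ({len:4, base:4, disp:12}).
    static uint64_t encodeSSb(uint8_t Op, uint32_t BDL1, uint32_t BDL2) {
      uint64_t Inst = 0;
      Inst |= uint64_t(Op) << 40;                 // Inst{47-40} = op
      Inst |= uint64_t((BDL1 >> 16) & 0xf) << 36; // Inst{39-36} = BDL1{19-16}
      Inst |= uint64_t((BDL2 >> 16) & 0xf) << 32; // Inst{35-32} = BDL2{19-16}
      Inst |= uint64_t(BDL1 & 0xffff) << 16;      // Inst{31-16} = BDL1{15-0}
      Inst |= uint64_t(BDL2 & 0xffff);            // Inst{15-0}  = BDL2{15-0}
      return Inst;
    }

    int main() {
      // e.g. PACK (opcode 0xF2) with two arbitrary operand fields.
      std::printf("0x%012llx\n",
                  (unsigned long long)encodeSSb(0xF2, 0x3A123, 0x5B456));
      return 0;
    }
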
+
+class InstSSc<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<20> BDL1;
+ bits<16> BD2;
+ bits<4> I3;
+
+ let Inst{47-40} = op;
+ let Inst{39-36} = BDL1{19-16};
+ let Inst{35-32} = I3;
+ let Inst{31-16} = BDL1{15-0};
+ let Inst{15-0} = BD2;
+}
+
class InstSSd<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
: InstSystemZ<6, outs, ins, asmstr, pattern> {
field bits<48> Inst;
@@ -850,6 +896,20 @@ class InstSSe<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
let Inst{15-0} = BD4;
}
+class InstSSf<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<16> BD1;
+ bits<24> BDL2;
+
+ let Inst{47-40} = op;
+ let Inst{39-32} = BDL2{23-16};
+ let Inst{31-16} = BD1;
+ let Inst{15-0} = BDL2{15-0};
+}
+
class InstSSE<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
: InstSystemZ<6, outs, ins, asmstr, pattern> {
field bits<48> Inst;
@@ -1567,6 +1627,9 @@ class ICV<string name>
// Inherent:
// One register output operand and no input operands.
//
+// InherentDual:
+// Two register output operands and no input operands.
+//
// StoreInherent:
// One address operand. The instruction stores to the address.
//
@@ -1642,8 +1705,9 @@ class ICV<string name>
// Two input operands and an implicit CC output operand.
//
// Test:
-// Two input operands and an implicit CC output operand. The second
-// input operand is an "address" operand used as a test class mask.
+// One or two input operands and an implicit CC output operand. If
+// present, the second input operand is an "address" operand used as
+// a test class mask.
//
// Ternary:
// One register output operand and three input operands.
@@ -1691,6 +1755,10 @@ class InherentRRE<string mnemonic, bits<16> opcode, RegisterOperand cls,
let R2 = 0;
}
+class InherentDualRRE<string mnemonic, bits<16> opcode, RegisterOperand cls>
+ : InstRRE<opcode, (outs cls:$R1, cls:$R2), (ins),
+ mnemonic#"\t$R1, $R2", []>;
+
class InherentVRIa<string mnemonic, bits<16> opcode, bits<16> value>
: InstVRIa<opcode, (outs VR128:$V1), (ins), mnemonic#"\t$V1", []> {
let I2 = value;
@@ -1714,6 +1782,12 @@ class SideEffectInherentS<string mnemonic, bits<16> opcode,
let BD2 = 0;
}
+class SideEffectInherentRRE<string mnemonic, bits<16> opcode>
+ : InstRRE<opcode, (outs), (ins), mnemonic, []> {
+ let R1 = 0;
+ let R2 = 0;
+}
+
// Allow an optional TLS marker symbol to generate TLS call relocations.
class CallRI<string mnemonic, bits<12> opcode>
: InstRIb<opcode, (outs), (ins GR64:$R1, brtarget16tls:$RI2),
@@ -2084,6 +2158,13 @@ multiclass LoadMultipleRSPair<string mnemonic, bits<8> rsOpcode,
}
}
+class LoadMultipleSSe<string mnemonic, bits<8> opcode, RegisterOperand cls>
+ : InstSSe<opcode, (outs cls:$R1, cls:$R3),
+ (ins bdaddr12only:$BD2, bdaddr12only:$BD4),
+ mnemonic#"\t$R1, $R3, $BD2, $BD4", []> {
+ let mayLoad = 1;
+}
+
class LoadMultipleVRSa<string mnemonic, bits<16> opcode>
: InstVRSa<opcode, (outs VR128:$V1, VR128:$V3), (ins bdaddr12only:$BD2),
mnemonic#"\t$V1, $V3, $BD2", []> {
@@ -2355,6 +2436,15 @@ class UnaryRRE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
let OpType = "reg";
}
+class UnaryMemRRFc<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRRFc<opcode, (outs cls2:$R2, cls1:$R1), (ins cls1:$R1src),
+ mnemonic#"\t$R1, $R2", []> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+ let M3 = 0;
+}
+
class UnaryRI<string mnemonic, bits<12> opcode, SDPatternOperator operator,
RegisterOperand cls, Immediate imm>
: InstRIa<opcode, (outs cls:$R1), (ins imm:$I2),
@@ -2585,11 +2675,61 @@ class SideEffectBinaryIE<string mnemonic, bits<16> opcode,
: InstIE<opcode, (outs), (ins imm1:$I1, imm2:$I2),
mnemonic#"\t$I1, $I2", []>;
+class SideEffectBinarySI<string mnemonic, bits<8> opcode, Operand imm>
+ : InstSI<opcode, (outs), (ins bdaddr12only:$BD1, imm:$I2),
+ mnemonic#"\t$BD1, $I2", []>;
+
class SideEffectBinarySIL<string mnemonic, bits<16> opcode,
SDPatternOperator operator, Immediate imm>
: InstSIL<opcode, (outs), (ins bdaddr12only:$BD1, imm:$I2),
mnemonic#"\t$BD1, $I2", [(operator bdaddr12only:$BD1, imm:$I2)]>;
+class SideEffectBinarySSa<string mnemonic, bits<8> opcode>
+ : InstSSa<opcode, (outs), (ins bdladdr12onlylen8:$BDL1, bdaddr12only:$BD2),
+ mnemonic##"\t$BDL1, $BD2", []>;
+
+class SideEffectBinarySSb<string mnemonic, bits<8> opcode>
+ : InstSSb<opcode,
+ (outs), (ins bdladdr12onlylen4:$BDL1, bdladdr12onlylen4:$BDL2),
+ mnemonic##"\t$BDL1, $BDL2", []>;
+
+class SideEffectBinarySSf<string mnemonic, bits<8> opcode>
+ : InstSSf<opcode, (outs), (ins bdaddr12only:$BD1, bdladdr12onlylen8:$BDL2),
+ mnemonic##"\t$BD1, $BDL2", []>;
+
+class SideEffectBinaryMemMemRR<string mnemonic, bits<8> opcode,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRR<opcode, (outs cls1:$R1, cls2:$R2), (ins cls1:$R1src, cls2:$R2src),
+ mnemonic#"\t$R1, $R2", []> {
+ let Constraints = "$R1 = $R1src, $R2 = $R2src";
+ let DisableEncoding = "$R1src, $R2src";
+}
+
+class SideEffectBinaryMemRRE<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRRE<opcode, (outs cls2:$R2), (ins cls1:$R1, cls2:$R2src),
+ mnemonic#"\t$R1, $R2", []> {
+ let Constraints = "$R2 = $R2src";
+ let DisableEncoding = "$R2src";
+}
+
+class SideEffectBinaryMemMemRRE<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRRE<opcode, (outs cls1:$R1, cls2:$R2), (ins cls1:$R1src, cls2:$R2src),
+ mnemonic#"\t$R1, $R2", []> {
+ let Constraints = "$R1 = $R1src, $R2 = $R2src";
+ let DisableEncoding = "$R1src, $R2src";
+}
+
+class SideEffectBinaryMemMemRRFc<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRRFc<opcode, (outs cls1:$R1, cls2:$R2), (ins cls1:$R1src, cls2:$R2src),
+ mnemonic#"\t$R1, $R2", []> {
+ let Constraints = "$R1 = $R1src, $R2 = $R2src";
+ let DisableEncoding = "$R1src, $R2src";
+ let M3 = 0;
+}
+
class BinaryRR<string mnemonic, bits<8> opcode, SDPatternOperator operator,
RegisterOperand cls1, RegisterOperand cls2>
: InstRR<opcode, (outs cls1:$R1), (ins cls1:$R1src, cls2:$R2),
@@ -2654,6 +2794,20 @@ class BinaryRRFb<string mnemonic, bits<16> opcode, SDPatternOperator operator,
let M4 = 0;
}
+class BinaryMemRRFc<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2, Immediate imm>
+ : InstRRFc<opcode, (outs cls2:$R2, cls1:$R1), (ins cls1:$R1src, imm:$M3),
+ mnemonic#"\t$R1, $R2, $M3", []> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+}
+
+multiclass BinaryMemRRFcOpt<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2> {
+ def "" : BinaryMemRRFc<mnemonic, opcode, cls1, cls2, imm32zx4>;
+ def Opt : UnaryMemRRFc<mnemonic, opcode, cls1, cls2>;
+}
+
class BinaryRRFe<string mnemonic, bits<16> opcode, RegisterOperand cls1,
RegisterOperand cls2>
: InstRRFe<opcode, (outs cls1:$R1), (ins imm32zx4:$M3, cls2:$R2),
@@ -3112,6 +3266,34 @@ class BinaryVRX<string mnemonic, bits<16> opcode, SDPatternOperator operator,
let AccessBytes = bytes;
}
+class StoreBinaryRS<string mnemonic, bits<8> opcode, RegisterOperand cls,
+ bits<5> bytes, AddressingMode mode = bdaddr12only>
+ : InstRSb<opcode, (outs), (ins cls:$R1, imm32zx4:$M3, mode:$BD2),
+ mnemonic#"\t$R1, $M3, $BD2", []> {
+ let mayStore = 1;
+ let AccessBytes = bytes;
+}
+
+class StoreBinaryRSY<string mnemonic, bits<16> opcode, RegisterOperand cls,
+ bits<5> bytes, AddressingMode mode = bdaddr20only>
+ : InstRSYb<opcode, (outs), (ins cls:$R1, imm32zx4:$M3, mode:$BD2),
+ mnemonic#"\t$R1, $M3, $BD2", []> {
+ let mayStore = 1;
+ let AccessBytes = bytes;
+}
+
+multiclass StoreBinaryRSPair<string mnemonic, bits<8> rsOpcode,
+ bits<16> rsyOpcode, RegisterOperand cls,
+ bits<5> bytes> {
+ let DispKey = mnemonic ## #cls in {
+ let DispSize = "12" in
+ def "" : StoreBinaryRS<mnemonic, rsOpcode, cls, bytes, bdaddr12pair>;
+ let DispSize = "20" in
+ def Y : StoreBinaryRSY<mnemonic#"y", rsyOpcode, cls, bytes,
+ bdaddr20pair>;
+ }
+}
+
class StoreBinaryVRV<string mnemonic, bits<16> opcode, bits<5> bytes,
Immediate index>
: InstVRV<opcode, (outs), (ins VR128:$V1, bdvaddr12only:$VBD2, index:$M3),
@@ -3237,6 +3419,40 @@ multiclass CompareRXPair<string mnemonic, bits<8> rxOpcode, bits<16> rxyOpcode,
}
}
+class CompareRS<string mnemonic, bits<8> opcode, RegisterOperand cls,
+ bits<5> bytes, AddressingMode mode = bdaddr12only>
+ : InstRSb<opcode, (outs), (ins cls:$R1, imm32zx4:$M3, mode:$BD2),
+ mnemonic#"\t$R1, $M3, $BD2", []> {
+ let mayLoad = 1;
+ let AccessBytes = bytes;
+}
+
+class CompareRSY<string mnemonic, bits<16> opcode, RegisterOperand cls,
+ bits<5> bytes, AddressingMode mode = bdaddr20only>
+ : InstRSYb<opcode, (outs), (ins cls:$R1, imm32zx4:$M3, mode:$BD2),
+ mnemonic#"\t$R1, $M3, $BD2", []> {
+ let mayLoad = 1;
+ let AccessBytes = bytes;
+}
+
+multiclass CompareRSPair<string mnemonic, bits<8> rsOpcode, bits<16> rsyOpcode,
+ RegisterOperand cls, bits<5> bytes> {
+ let DispKey = mnemonic ## #cls in {
+ let DispSize = "12" in
+ def "" : CompareRS<mnemonic, rsOpcode, cls, bytes, bdaddr12pair>;
+ let DispSize = "20" in
+ def Y : CompareRSY<mnemonic#"y", rsyOpcode, cls, bytes, bdaddr20pair>;
+ }
+}
+
+class CompareSSb<string mnemonic, bits<8> opcode>
+ : InstSSb<opcode,
+ (outs), (ins bdladdr12onlylen4:$BDL1, bdladdr12onlylen4:$BDL2),
+ mnemonic##"\t$BDL1, $BDL2", []> {
+ let isCompare = 1;
+ let mayLoad = 1;
+}
+
class CompareSI<string mnemonic, bits<8> opcode, SDPatternOperator operator,
SDPatternOperator load, Immediate imm,
AddressingMode mode = bdaddr12only>
@@ -3313,18 +3529,68 @@ class TestRXE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
let M3 = 0;
}
+class TestRSL<string mnemonic, bits<16> opcode>
+ : InstRSLa<opcode, (outs), (ins bdladdr12onlylen4:$BDL1),
+ mnemonic#"\t$BDL1", []> {
+ let mayLoad = 1;
+}
+
+class SideEffectTernarySSc<string mnemonic, bits<8> opcode>
+ : InstSSc<opcode, (outs), (ins bdladdr12onlylen4:$BDL1,
+ shift12only:$BD2, imm32zx4:$I3),
+ mnemonic##"\t$BDL1, $BD2, $I3", []>;
+
+class SideEffectTernaryMemMemMemRRFb<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1,
+ RegisterOperand cls2,
+ RegisterOperand cls3>
+ : InstRRFb<opcode, (outs cls1:$R1, cls2:$R2, cls3:$R3),
+ (ins cls1:$R1src, cls2:$R2src, cls3:$R3src),
+ mnemonic#"\t$R1, $R3, $R2", []> {
+ let Constraints = "$R1 = $R1src, $R2 = $R2src, $R3 = $R3src";
+ let DisableEncoding = "$R1src, $R2src, $R3src";
+ let M4 = 0;
+}
+
class SideEffectTernaryRRFc<string mnemonic, bits<16> opcode,
RegisterOperand cls1, RegisterOperand cls2,
Immediate imm>
: InstRRFc<opcode, (outs), (ins cls1:$R1, cls2:$R2, imm:$M3),
mnemonic#"\t$R1, $R2, $M3", []>;
+class SideEffectTernaryMemMemRRFc<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2,
+ Immediate imm>
+ : InstRRFc<opcode, (outs cls1:$R1, cls2:$R2),
+ (ins cls1:$R1src, cls2:$R2src, imm:$M3),
+ mnemonic#"\t$R1, $R2, $M3", []> {
+ let Constraints = "$R1 = $R1src, $R2 = $R2src";
+ let DisableEncoding = "$R1src, $R2src";
+}
+
+multiclass SideEffectTernaryMemMemRRFcOpt<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1,
+ RegisterOperand cls2> {
+ def "" : SideEffectTernaryMemMemRRFc<mnemonic, opcode, cls1, cls2, imm32zx4>;
+ def Opt : SideEffectBinaryMemMemRRFc<mnemonic, opcode, cls1, cls2>;
+}
+
class SideEffectTernarySSF<string mnemonic, bits<12> opcode,
RegisterOperand cls>
: InstSSF<opcode, (outs),
(ins bdaddr12only:$BD1, bdaddr12only:$BD2, cls:$R3),
mnemonic#"\t$BD1, $BD2, $R3", []>;
+class TernaryRRFb<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2,
+ RegisterOperand cls3>
+ : InstRRFb<opcode, (outs cls1:$R1, cls3:$R3),
+ (ins cls1:$R1src, cls2:$R2, imm32zx4:$M4),
+ mnemonic#"\t$R1, $R3, $R2, $M4", []> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+}
+
class TernaryRRFe<string mnemonic, bits<16> opcode, RegisterOperand cls1,
RegisterOperand cls2>
: InstRRFe<opcode, (outs cls1:$R1),
@@ -3376,6 +3642,24 @@ multiclass TernaryRSPair<string mnemonic, bits<8> rsOpcode, bits<16> rsyOpcode,
}
}
+class SideEffectTernaryMemMemRS<string mnemonic, bits<8> opcode,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRSa<opcode, (outs cls1:$R1, cls2:$R3),
+ (ins cls1:$R1src, cls2:$R3src, shift12only:$BD2),
+ mnemonic#"\t$R1, $R3, $BD2", []> {
+ let Constraints = "$R1 = $R1src, $R3 = $R3src";
+ let DisableEncoding = "$R1src, $R3src";
+}
+
+class SideEffectTernaryMemMemRSY<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRSYa<opcode, (outs cls1:$R1, cls2:$R3),
+ (ins cls1:$R1src, cls2:$R3src, shift20only:$BD2),
+ mnemonic#"\t$R1, $R3, $BD2", []> {
+ let Constraints = "$R1 = $R1src, $R3 = $R3src";
+ let DisableEncoding = "$R1src, $R3src";
+}
+
class TernaryRXF<string mnemonic, bits<16> opcode, SDPatternOperator operator,
RegisterOperand cls, SDPatternOperator load, bits<5> bytes>
: InstRXF<opcode, (outs cls:$R1),
@@ -3981,9 +4265,7 @@ class AtomicLoadWBinaryImm<SDPatternOperator operator, Immediate imm>
// another instruction to handle the excess.
multiclass MemorySS<string mnemonic, bits<8> opcode,
SDPatternOperator sequence, SDPatternOperator loop> {
- def "" : InstSSa<opcode, (outs), (ins bdladdr12onlylen8:$BDL1,
- bdaddr12only:$BD2),
- mnemonic##"\t$BDL1, $BD2", []>;
+ def "" : SideEffectBinarySSa<mnemonic, opcode>;
let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
def Sequence : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
imm64:$length),
@@ -4003,13 +4285,8 @@ multiclass MemorySS<string mnemonic, bits<8> opcode,
// the full loop (the main instruction plus the branch on CC==3).
multiclass StringRRE<string mnemonic, bits<16> opcode,
SDPatternOperator operator> {
- def "" : InstRRE<opcode, (outs GR64:$R1, GR64:$R2),
- (ins GR64:$R1src, GR64:$R2src),
- mnemonic#"\t$R1, $R2", []> {
- let Uses = [R0L];
- let Constraints = "$R1 = $R1src, $R2 = $R2src";
- let DisableEncoding = "$R1src, $R2src";
- }
+ let Uses = [R0L] in
+ def "" : SideEffectBinaryMemMemRRE<mnemonic, opcode, GR64, GR64>;
let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in
def Loop : Pseudo<(outs GR64:$end),
(ins GR64:$start1, GR64:$start2, GR32:$char),
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td
index d63525f29412..fa5ecdd85243 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -12,8 +12,8 @@
//===----------------------------------------------------------------------===//
let hasNoSchedulingInfo = 1 in {
- def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt),
- [(callseq_start timm:$amt)]>;
+ def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2),
+ [(callseq_start timm:$amt1, timm:$amt2)]>;
def ADJCALLSTACKUP : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2),
[(callseq_end timm:$amt1, timm:$amt2)]>;
}
@@ -464,6 +464,11 @@ def MVGHI : StoreSIL<"mvghi", 0xE548, store, imm64sx16>;
// Memory-to-memory moves.
let mayLoad = 1, mayStore = 1 in
defm MVC : MemorySS<"mvc", 0xD2, z_mvc, z_mvc_loop>;
+let mayLoad = 1, mayStore = 1, Defs = [CC] in {
+ def MVCL : SideEffectBinaryMemMemRR<"mvcl", 0x0E, GR128, GR128>;
+ def MVCLE : SideEffectTernaryMemMemRS<"mvcle", 0xA8, GR128, GR128>;
+ def MVCLU : SideEffectTernaryMemMemRSY<"mvclu", 0xEB8E, GR128, GR128>;
+}
// String moves.
let mayLoad = 1, mayStore = 1, Defs = [CC] in
@@ -707,6 +712,10 @@ def : StoreGR64PC<STHRL, aligned_truncstorei16>;
defm : StoreGR64Pair<ST, STY, truncstorei32>;
def : StoreGR64PC<STRL, aligned_truncstorei32>;
+// Store characters under mask -- not (yet) used for codegen.
+defm STCM : StoreBinaryRSPair<"stcm", 0xBE, 0xEB2D, GR32, 0>;
+def STCMH : StoreBinaryRSY<"stcmh", 0xEB2C, GRH32, 0>;
+
//===----------------------------------------------------------------------===//
// Multi-register moves
//===----------------------------------------------------------------------===//
@@ -715,6 +724,7 @@ def : StoreGR64PC<STRL, aligned_truncstorei32>;
defm LM : LoadMultipleRSPair<"lm", 0x98, 0xEB98, GR32>;
def LMG : LoadMultipleRSY<"lmg", 0xEB04, GR64>;
def LMH : LoadMultipleRSY<"lmh", 0xEB96, GRH32>;
+def LMD : LoadMultipleSSe<"lmd", 0xEF, GR64>;
// Multi-register stores.
defm STM : StoreMultipleRSPair<"stm", 0x90, 0xEB90, GR32>;
@@ -742,6 +752,10 @@ def STRVH : StoreRXY<"strvh", 0xE33F, z_strvh, GR32, 2>;
def STRV : StoreRXY<"strv", 0xE33E, z_strv, GR32, 4>;
def STRVG : StoreRXY<"strvg", 0xE32F, z_strvg, GR64, 8>;
+// Byte-swapping memory-to-memory moves.
+let mayLoad = 1, mayStore = 1 in
+ def MVCIN : SideEffectBinarySSa<"mvcin", 0xE8>;
+
//===----------------------------------------------------------------------===//
// Load address instructions
//===----------------------------------------------------------------------===//
@@ -816,6 +830,7 @@ defm : InsertMem<"inserti8", IC32Y, GR32, azextloadi8, bdxaddr20pair>;
defm : InsertMem<"inserti8", IC, GR64, azextloadi8, bdxaddr12pair>;
defm : InsertMem<"inserti8", ICY, GR64, azextloadi8, bdxaddr20pair>;
+// Insert characters under mask -- not (yet) used for codegen.
let Defs = [CC] in {
defm ICM : TernaryRSPair<"icm", 0xBF, 0xEB81, GR32, 0>;
def ICMH : TernaryRSY<"icmh", 0xEB80, GRH32, 0>;
@@ -919,6 +934,10 @@ let Defs = [CC] in {
defm AL : BinaryRXPair<"al", 0x5E, 0xE35E, addc, GR32, load, 4>;
def ALGF : BinaryRXY<"algf", 0xE31A, addc, GR64, azextloadi32, 4>;
def ALG : BinaryRXY<"alg", 0xE30A, addc, GR64, load, 8>;
+
+ // Addition to memory.
+ def ALSI : BinarySIY<"alsi", 0xEB6E, null_frag, imm32sx8>;
+ def ALGSI : BinarySIY<"algsi", 0xEB7E, null_frag, imm64sx8>;
}
defm : ZXB<addc, GR64, ALGFR>;
@@ -1166,9 +1185,14 @@ def MSGF : BinaryRXY<"msgf", 0xE31C, mul, GR64, asextloadi32, 4>;
def MSG : BinaryRXY<"msg", 0xE30C, mul, GR64, load, 8>;
// Multiplication of a register, producing two results.
+def MR : BinaryRR <"mr", 0x1C, null_frag, GR128, GR32>;
+def MLR : BinaryRRE<"mlr", 0xB996, null_frag, GR128, GR32>;
def MLGR : BinaryRRE<"mlgr", 0xB986, z_umul_lohi64, GR128, GR64>;
// Multiplication of memory, producing two results.
+def M : BinaryRX <"m", 0x5C, null_frag, GR128, load, 4>;
+def MFY : BinaryRXY<"mfy", 0xE35C, null_frag, GR128, load, 4>;
+def ML : BinaryRXY<"ml", 0xE396, null_frag, GR128, load, 4>;
def MLG : BinaryRXY<"mlg", 0xE386, z_umul_lohi64, GR128, load, 8>;
//===----------------------------------------------------------------------===//
@@ -1177,12 +1201,14 @@ def MLG : BinaryRXY<"mlg", 0xE386, z_umul_lohi64, GR128, load, 8>;
let hasSideEffects = 1 in { // Do not speculatively execute.
// Division and remainder, from registers.
+ def DR : BinaryRR <"dr", 0x1D, null_frag, GR128, GR32>;
def DSGFR : BinaryRRE<"dsgfr", 0xB91D, z_sdivrem32, GR128, GR32>;
def DSGR : BinaryRRE<"dsgr", 0xB90D, z_sdivrem64, GR128, GR64>;
def DLR : BinaryRRE<"dlr", 0xB997, z_udivrem32, GR128, GR32>;
def DLGR : BinaryRRE<"dlgr", 0xB987, z_udivrem64, GR128, GR64>;
// Division and remainder, from memory.
+ def D : BinaryRX <"d", 0x5D, null_frag, GR128, load, 4>;
def DSGF : BinaryRXY<"dsgf", 0xE31D, z_sdivrem32, GR128, load, 4>;
def DSG : BinaryRXY<"dsg", 0xE30D, z_sdivrem64, GR128, load, 8>;
def DL : BinaryRXY<"dl", 0xE397, z_udivrem32, GR128, load, 4>;
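
MR/M and DR/D work on an even/odd register pair, which these definitions model as GR128: the multiply spreads a 64-bit product across the pair, and the divide reads a 64-bit dividend from it, writing remainder and quotient back. An illustrative C++ model of the pairing, assuming the classic S/390 semantics:

    #include <cstdint>
    #include <cstdio>

    // MR: 32x32 -> 64 signed multiply; the multiplicand lives in the odd
    // half of the pair, the product fills both halves.
    static void mr(int32_t &Even, int32_t &Odd, int32_t Mult) {
      int64_t P = (int64_t)Odd * Mult;
      Even = (int32_t)(P >> 32);
      Odd = (int32_t)P;
    }

    // DR: 64-bit dividend in the pair; remainder to even, quotient to odd.
    static void dr(int32_t &Even, int32_t &Odd, int32_t Div) {
      int64_t Dividend = ((int64_t)Even << 32) | (uint32_t)Odd;
      Even = (int32_t)(Dividend % Div);
      Odd = (int32_t)(Dividend / Div);
    }

    int main() {
      int32_t E = 0, O = 7;
      mr(E, O, 6); // 7 * 6 = 42, spread across the pair
      dr(E, O, 5); // 42 / 5 -> remainder 2, quotient 8
      std::printf("rem=%d quot=%d\n", E, O);
      return 0;
    }
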
@@ -1193,23 +1219,32 @@ let hasSideEffects = 1 in { // Do not speculatively execute.
// Shifts
//===----------------------------------------------------------------------===//
-// Shift left.
+// Logical shift left.
let hasSideEffects = 0 in {
defm SLL : BinaryRSAndK<"sll", 0x89, 0xEBDF, shl, GR32>;
- defm SLA : BinaryRSAndK<"sla", 0x8B, 0xEBDD, null_frag, GR32>;
def SLLG : BinaryRSY<"sllg", 0xEB0D, shl, GR64>;
+ def SLDL : BinaryRS<"sldl", 0x8D, null_frag, GR128>;
+}
+
+// Arithmetic shift left.
+let Defs = [CC] in {
+ defm SLA : BinaryRSAndK<"sla", 0x8B, 0xEBDD, null_frag, GR32>;
+ def SLAG : BinaryRSY<"slag", 0xEB0B, null_frag, GR64>;
+ def SLDA : BinaryRS<"slda", 0x8F, null_frag, GR128>;
}
// Logical shift right.
let hasSideEffects = 0 in {
defm SRL : BinaryRSAndK<"srl", 0x88, 0xEBDE, srl, GR32>;
def SRLG : BinaryRSY<"srlg", 0xEB0C, srl, GR64>;
+ def SRDL : BinaryRS<"srdl", 0x8C, null_frag, GR128>;
}
// Arithmetic shift right.
let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in {
defm SRA : BinaryRSAndK<"sra", 0x8A, 0xEBDC, sra, GR32>;
def SRAG : BinaryRSY<"srag", 0xEB0A, sra, GR64>;
+ def SRDA : BinaryRS<"srda", 0x8E, null_frag, GR128>;
}
// Rotate left.
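
Moving SLA/SLAG/SLDA under Defs = [CC] reflects the architecture: arithmetic shift left keeps the sign bit fixed and reports overflow in the condition code, which the logical shifts never do. An illustrative 32-bit SLA model (simplified; the possible fixed-point-overflow interruption is ignored):

    #include <cstdint>
    #include <cstdio>

    // The sign bit never moves; CC is 3 on overflow, otherwise 0/1/2 for
    // zero/negative/positive results.
    static int32_t sla32(int32_t X, unsigned Amt, unsigned &CC) {
      uint32_t V = (uint32_t)X;
      const uint32_t Sign = V & 0x80000000u;
      bool Overflow = false;
      for (unsigned I = 0; I < Amt; ++I) {
        if (((V & 0x40000000u) != 0) != (Sign != 0))
          Overflow = true;                   // a bit unlike the sign shifted out
        V = Sign | ((V << 1) & 0x7fffffffu); // shift the magnitude, keep sign
      }
      CC = Overflow ? 3 : (V == 0 ? 0 : ((int32_t)V < 0 ? 1 : 2));
      return (int32_t)V;
    }

    int main() {
      unsigned CC;
      int32_t R = sla32(0x40000000, 1, CC); // magnitude bit overflows
      std::printf("result=0x%08x cc=%u\n", (uint32_t)R, CC);
      return 0;
    }
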
@@ -1351,8 +1386,12 @@ let Defs = [CC], CCValues = 0xE, IsLogical = 1 in {
defm : ZXB<z_ucmp, GR64, CLGFR>;
// Memory-to-memory comparison.
-let mayLoad = 1, Defs = [CC] in
+let mayLoad = 1, Defs = [CC] in {
defm CLC : MemorySS<"clc", 0xD5, z_clc, z_clc_loop>;
+ def CLCL : SideEffectBinaryMemMemRR<"clcl", 0x0F, GR128, GR128>;
+ def CLCLE : SideEffectTernaryMemMemRS<"clcle", 0xA9, GR128, GR128>;
+ def CLCLU : SideEffectTernaryMemMemRSY<"clclu", 0xEB8F, GR128, GR128>;
+}
// String comparison.
let mayLoad = 1, Defs = [CC] in
@@ -1381,6 +1420,12 @@ let Defs = [CC] in {
def TML : InstAlias<"tml\t$R, $I", (TMLL GR32:$R, imm32ll16:$I), 0>;
def TMH : InstAlias<"tmh\t$R, $I", (TMLH GR32:$R, imm32lh16:$I), 0>;
+// Compare logical characters under mask -- not (yet) used for codegen.
+let Defs = [CC] in {
+ defm CLM : CompareRSPair<"clm", 0xBD, 0xEB21, GR32, 0>;
+ def CLMH : CompareRSY<"clmh", 0xEB20, GRH32, 0>;
+}
+
//===----------------------------------------------------------------------===//
// Prefetch and execution hint
//===----------------------------------------------------------------------===//
@@ -1581,6 +1626,115 @@ let Predicates = [FeatureInterlockedAccess1], Defs = [CC] in {
}
//===----------------------------------------------------------------------===//
+// Translate and convert
+//===----------------------------------------------------------------------===//
+
+let mayLoad = 1, mayStore = 1 in
+ def TR : SideEffectBinarySSa<"tr", 0xDC>;
+
+let mayLoad = 1, Defs = [CC, R0L, R1D] in {
+ def TRT : SideEffectBinarySSa<"trt", 0xDD>;
+ def TRTR : SideEffectBinarySSa<"trtr", 0xD0>;
+}
+
+let mayLoad = 1, mayStore = 1, Uses = [R0L] in
+ def TRE : SideEffectBinaryMemMemRRE<"tre", 0xB2A5, GR128, GR64>;
+
+let mayLoad = 1, Uses = [R1D], Defs = [CC] in {
+ defm TRTE : BinaryMemRRFcOpt<"trte", 0xB9BF, GR128, GR64>;
+ defm TRTRE : BinaryMemRRFcOpt<"trtre", 0xB9BD, GR128, GR64>;
+}
+
+let mayLoad = 1, mayStore = 1, Uses = [R0L, R1D], Defs = [CC] in {
+ defm TROO : SideEffectTernaryMemMemRRFcOpt<"troo", 0xB993, GR128, GR64>;
+ defm TROT : SideEffectTernaryMemMemRRFcOpt<"trot", 0xB992, GR128, GR64>;
+ defm TRTO : SideEffectTernaryMemMemRRFcOpt<"trto", 0xB991, GR128, GR64>;
+ defm TRTT : SideEffectTernaryMemMemRRFcOpt<"trtt", 0xB990, GR128, GR64>;
+}
+
+let mayLoad = 1, mayStore = 1, Defs = [CC] in {
+ defm CU12 : SideEffectTernaryMemMemRRFcOpt<"cu12", 0xB2A7, GR128, GR128>;
+ defm CU14 : SideEffectTernaryMemMemRRFcOpt<"cu14", 0xB9B0, GR128, GR128>;
+ defm CU21 : SideEffectTernaryMemMemRRFcOpt<"cu21", 0xB2A6, GR128, GR128>;
+ defm CU24 : SideEffectTernaryMemMemRRFcOpt<"cu24", 0xB9B1, GR128, GR128>;
+ def CU41 : SideEffectBinaryMemMemRRE<"cu41", 0xB9B2, GR128, GR128>;
+ def CU42 : SideEffectBinaryMemMemRRE<"cu42", 0xB9B3, GR128, GR128>;
+
+ let isAsmParserOnly = 1 in {
+ defm CUUTF : SideEffectTernaryMemMemRRFcOpt<"cuutf", 0xB2A6, GR128, GR128>;
+ defm CUTFU : SideEffectTernaryMemMemRRFcOpt<"cutfu", 0xB2A7, GR128, GR128>;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Message-security assist
+//===----------------------------------------------------------------------===//
+
+let mayLoad = 1, mayStore = 1, Uses = [R0L, R1D], Defs = [CC] in {
+ def KM : SideEffectBinaryMemMemRRE<"km", 0xB92E, GR128, GR128>;
+ def KMC : SideEffectBinaryMemMemRRE<"kmc", 0xB92F, GR128, GR128>;
+
+ def KIMD : SideEffectBinaryMemRRE<"kimd", 0xB93E, GR64, GR128>;
+ def KLMD : SideEffectBinaryMemRRE<"klmd", 0xB93F, GR64, GR128>;
+ def KMAC : SideEffectBinaryMemRRE<"kmac", 0xB91E, GR64, GR128>;
+
+ let Predicates = [FeatureMessageSecurityAssist4] in {
+ def KMF : SideEffectBinaryMemMemRRE<"kmf", 0xB92A, GR128, GR128>;
+ def KMO : SideEffectBinaryMemMemRRE<"kmo", 0xB92B, GR128, GR128>;
+ def KMCTR : SideEffectTernaryMemMemMemRRFb<"kmctr", 0xB92D,
+ GR128, GR128, GR128>;
+ def PCC : SideEffectInherentRRE<"pcc", 0xB92C>;
+ }
+ let Predicates = [FeatureMessageSecurityAssist5] in
+ def PPNO : SideEffectBinaryMemMemRRE<"ppno", 0xB93C, GR128, GR128>;
+}
+
+//===----------------------------------------------------------------------===//
+// Decimal arithmetic
+//===----------------------------------------------------------------------===//
+
+defm CVB : BinaryRXPair<"cvb", 0x4F, 0xE306, null_frag, GR32, load, 4>;
+def CVBG : BinaryRXY<"cvbg", 0xE30E, null_frag, GR64, load, 8>;
+
+defm CVD : StoreRXPair<"cvd", 0x4E, 0xE326, null_frag, GR32, 4>;
+def CVDG : StoreRXY<"cvdg", 0xE32E, null_frag, GR64, 8>;
+
+let mayLoad = 1, mayStore = 1 in {
+ def MVN : SideEffectBinarySSa<"mvn", 0xD1>;
+ def MVZ : SideEffectBinarySSa<"mvz", 0xD3>;
+ def MVO : SideEffectBinarySSb<"mvo", 0xF1>;
+
+ def PACK : SideEffectBinarySSb<"pack", 0xF2>;
+ def PKA : SideEffectBinarySSf<"pka", 0xE9>;
+ def PKU : SideEffectBinarySSf<"pku", 0xE1>;
+ def UNPK : SideEffectBinarySSb<"unpk", 0xF3>;
+ let Defs = [CC] in {
+ def UNPKA : SideEffectBinarySSa<"unpka", 0xEA>;
+ def UNPKU : SideEffectBinarySSa<"unpku", 0xE2>;
+ }
+}
+
+let mayLoad = 1, mayStore = 1 in {
+ let Defs = [CC] in {
+ def AP : SideEffectBinarySSb<"ap", 0xFA>;
+ def SP : SideEffectBinarySSb<"sp", 0xFB>;
+ def ZAP : SideEffectBinarySSb<"zap", 0xF8>;
+ def SRP : SideEffectTernarySSc<"srp", 0xF0>;
+ }
+ def MP : SideEffectBinarySSb<"mp", 0xFC>;
+ def DP : SideEffectBinarySSb<"dp", 0xFD>;
+ let Defs = [CC] in {
+ def ED : SideEffectBinarySSa<"ed", 0xDE>;
+ def EDMK : SideEffectBinarySSa<"edmk", 0xDF>;
+ }
+}
+
+let Defs = [CC] in {
+ def CP : CompareSSb<"cp", 0xF9>;
+ def TP : TestRSL<"tp", 0xEBC0>;
+}
+
+//===----------------------------------------------------------------------===//
// Access registers
//===----------------------------------------------------------------------===//
@@ -1712,12 +1866,39 @@ let usesCustomInserter = 1 in {
// Search a block of memory for a character.
let mayLoad = 1, Defs = [CC] in
- defm SRST : StringRRE<"srst", 0xb25e, z_search_string>;
+ defm SRST : StringRRE<"srst", 0xB25E, z_search_string>;
+let mayLoad = 1, Defs = [CC], Uses = [R0L] in
+ def SRSTU : SideEffectBinaryMemMemRRE<"srstu", 0xB9BE, GR64, GR64>;
+
+// Compare until substring equal.
+let mayLoad = 1, Defs = [CC], Uses = [R0L, R1L] in
+ def CUSE : SideEffectBinaryMemMemRRE<"cuse", 0xB257, GR128, GR128>;
+
+// Compare and form codeword.
+let mayLoad = 1, Defs = [CC, R1D, R2D, R3D], Uses = [R1D, R2D, R3D] in
+ def CFC : SideEffectAddressS<"cfc", 0xB21A, null_frag>;
+
+// Update tree.
+let mayLoad = 1, mayStore = 1, Defs = [CC, R0D, R1D, R2D, R3D, R5D],
+ Uses = [R0D, R1D, R2D, R3D, R4D, R5D] in
+ def UPT : SideEffectInherentE<"upt", 0x0102>;
+
+// Checksum.
+let mayLoad = 1, Defs = [CC] in
+ def CKSM : SideEffectBinaryMemMemRRE<"cksm", 0xB241, GR64, GR128>;
+
+// Compression call.
+let mayLoad = 1, mayStore = 1, Defs = [CC, R1D], Uses = [R0L, R1D] in
+ def CMPSC : SideEffectBinaryMemMemRRE<"cmpsc", 0xB263, GR128, GR128>;
// Supervisor call.
let hasSideEffects = 1, isCall = 1, Defs = [CC] in
def SVC : SideEffectUnaryI<"svc", 0x0A, imm32zx8>;
+// Monitor call.
+let hasSideEffects = 1, isCall = 1 in
+ def MC : SideEffectBinarySI<"mc", 0xAF, imm32zx8>;
+
// Store clock.
let hasSideEffects = 1, Defs = [CC] in {
def STCK : StoreInherentS<"stck", 0xB205, null_frag, 8>;
@@ -1729,10 +1910,18 @@ let hasSideEffects = 1, Defs = [CC] in {
let hasSideEffects = 1, Uses = [R0D], Defs = [R0D, CC] in
def STFLE : StoreInherentS<"stfle", 0xB2B0, null_frag, 0>;
+// Extract CPU attribute.
+let hasSideEffects = 1 in
+ def ECAG : BinaryRSY<"ecag", 0xEB4C, null_frag, GR64>;
+
// Extract CPU time.
let Defs = [R0D, R1D], hasSideEffects = 1, mayLoad = 1 in
def ECTG : SideEffectTernarySSF<"ectg", 0xC81, GR64>;
+// Extract PSW.
+let hasSideEffects = 1, Uses = [CC] in
+ def EPSW : InherentDualRRE<"epsw", 0xB98D, GR32>;
+
// Execute.
let hasSideEffects = 1 in {
def EX : SideEffectBinaryRX<"ex", 0x44, GR64>;
diff --git a/lib/Target/SystemZ/SystemZOperands.td b/lib/Target/SystemZ/SystemZOperands.td
index 7bb4fe5afb3f..713612129d90 100644
--- a/lib/Target/SystemZ/SystemZOperands.td
+++ b/lib/Target/SystemZ/SystemZOperands.td
@@ -531,6 +531,7 @@ def BDAddr64Disp12 : AddressAsmOperand<"BDAddr", "64", "12">;
def BDAddr64Disp20 : AddressAsmOperand<"BDAddr", "64", "20">;
def BDXAddr64Disp12 : AddressAsmOperand<"BDXAddr", "64", "12">;
def BDXAddr64Disp20 : AddressAsmOperand<"BDXAddr", "64", "20">;
+def BDLAddr64Disp12Len4 : AddressAsmOperand<"BDLAddr", "64", "12", "Len4">;
def BDLAddr64Disp12Len8 : AddressAsmOperand<"BDLAddr", "64", "12", "Len8">;
def BDRAddr64Disp12 : AddressAsmOperand<"BDRAddr", "64", "12">;
def BDVAddr64Disp12 : AddressAsmOperand<"BDVAddr", "64", "12">;
@@ -578,6 +579,7 @@ def bdxaddr20pair : BDXMode<"BDXAddr", "64", "20", "Pair">;
def dynalloc12only : BDXMode<"DynAlloc", "64", "12", "Only">;
def laaddr12pair : BDXMode<"LAAddr", "64", "12", "Pair">;
def laaddr20pair : BDXMode<"LAAddr", "64", "20", "Pair">;
+def bdladdr12onlylen4 : BDLMode<"BDLAddr", "64", "12", "Only", "4">;
def bdladdr12onlylen8 : BDLMode<"BDLAddr", "64", "12", "Only", "8">;
def bdraddr12only : BDRMode<"BDRAddr", "64", "12", "Only">;
def bdvaddr12only : BDVMode< "64", "12">;
diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td
index fde26ed4e1c5..adfc69c5d4cf 100644
--- a/lib/Target/SystemZ/SystemZOperators.td
+++ b/lib/Target/SystemZ/SystemZOperators.td
@@ -10,7 +10,8 @@
//===----------------------------------------------------------------------===//
// Type profiles
//===----------------------------------------------------------------------===//
-def SDT_CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i64>]>;
+def SDT_CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i64>,
+ SDTCisVT<1, i64>]>;
def SDT_CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i64>,
SDTCisVT<1, i64>]>;
def SDT_ZCall : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
diff --git a/lib/Target/SystemZ/SystemZSchedule.td b/lib/Target/SystemZ/SystemZSchedule.td
index dbba8ab42b5a..1ce0168f95e9 100644
--- a/lib/Target/SystemZ/SystemZSchedule.td
+++ b/lib/Target/SystemZ/SystemZSchedule.td
@@ -56,12 +56,16 @@ def LSU_lat1 : SchedWrite;
// Floating point unit (zEC12 and earlier)
def FPU : SchedWrite;
def FPU2 : SchedWrite;
+def DFU : SchedWrite;
+def DFU2 : SchedWrite;
// Vector sub units (z13)
def VecBF : SchedWrite;
def VecBF2 : SchedWrite;
def VecDF : SchedWrite;
def VecDF2 : SchedWrite;
+def VecDFX : SchedWrite;
+def VecDFX2 : SchedWrite;
def VecFPd : SchedWrite; // Blocking BFP div/sqrt unit.
def VecMul : SchedWrite;
def VecStr : SchedWrite;
diff --git a/lib/Target/SystemZ/SystemZScheduleZ13.td b/lib/Target/SystemZ/SystemZScheduleZ13.td
index 7aee6f52e9a7..612c3b6cf96e 100644
--- a/lib/Target/SystemZ/SystemZScheduleZ13.td
+++ b/lib/Target/SystemZ/SystemZScheduleZ13.td
@@ -76,6 +76,8 @@ def : WriteRes<VecBF, [Z13_VecUnit]> { let Latency = 8; }
def : WriteRes<VecBF2, [Z13_VecUnit, Z13_VecUnit]> { let Latency = 9; }
def : WriteRes<VecDF, [Z13_VecUnit]> { let Latency = 8; }
def : WriteRes<VecDF2, [Z13_VecUnit, Z13_VecUnit]> { let Latency = 9; }
+def : WriteRes<VecDFX, [Z13_VecUnit]> { let Latency = 1; }
+def : WriteRes<VecDFX2, [Z13_VecUnit, Z13_VecUnit]> { let Latency = 2; }
def : WriteRes<VecFPd, [Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
@@ -179,6 +181,7 @@ def : InstRW<[FXb, LSU, Lat5], (instregex "MVI(Y)?$")>;
// Move character
def : InstRW<[FXb, LSU, LSU, LSU, Lat8, GroupAlone], (instregex "MVC$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCL(E|U)?$")>;
// Pseudo -> reg move
def : InstRW<[FXa], (instregex "COPY(_TO_REGCLASS)?$")>;
@@ -268,6 +271,7 @@ def : InstRW<[FXb, LSU, Lat5], (instregex "LLG(F|T)?AT$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "STC(H|Y|Mux)?$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "STCM(H|Y)?$")>;
//===----------------------------------------------------------------------===//
// Multi-register moves
@@ -277,6 +281,9 @@ def : InstRW<[FXb, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>;
def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone],
(instregex "LM(H|Y|G)?$")>;
+// Load multiple disjoint
+def : InstRW<[FXb, Lat30, GroupAlone], (instregex "LMD$")>;
+
// Store multiple (estimated average of ceil(5/2) FXb ops)
def : InstRW<[LSU, LSU, FXb, FXb, FXb, Lat10,
GroupAlone], (instregex "STM(G|H|Y)?$")>;
@@ -288,6 +295,7 @@ def : InstRW<[LSU, LSU, FXb, FXb, FXb, Lat10,
def : InstRW<[FXa], (instregex "LRV(G)?R$")>;
def : InstRW<[FXa, LSU, Lat5], (instregex "LRV(G|H)?$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "STRV(G|H)?$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCIN$")>;
//===----------------------------------------------------------------------===//
// Load address instructions
@@ -345,7 +353,7 @@ def : InstRW<[FXa], (instregex "ALGF(I|R)$")>;
def : InstRW<[FXa], (instregex "ALGR(K)?$")>;
def : InstRW<[FXa], (instregex "ALR(K)?$")>;
def : InstRW<[FXa], (instregex "AR(K)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "A(G)?SI$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "A(L)?(G)?SI$")>;
// Logical addition with carry
def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "ALC(G)?$")>;
@@ -438,11 +446,15 @@ def : InstRW<[FXa, Lat9, GroupAlone], (instregex "MLGR$")>;
def : InstRW<[FXa, Lat5], (instregex "MGHI$")>;
def : InstRW<[FXa, Lat5], (instregex "MHI$")>;
def : InstRW<[FXa, LSU, Lat9], (instregex "MH(Y)?$")>;
+def : InstRW<[FXa, Lat7, GroupAlone], (instregex "M(L)?R$")>;
+def : InstRW<[FXa, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>;
//===----------------------------------------------------------------------===//
// Division and remainder
//===----------------------------------------------------------------------===//
+def : InstRW<[FXa2, FXa2, Lat20, GroupAlone], (instregex "DR$")>;
+def : InstRW<[FXa2, FXa2, LSU, Lat30, GroupAlone], (instregex "D$")>;
def : InstRW<[FXa, Lat30, GroupAlone], (instregex "DSG(F)?R$")>;
def : InstRW<[LSU, FXa, Lat30, GroupAlone], (instregex "DSG(F)?$")>;
def : InstRW<[FXa2, FXa2, Lat20, GroupAlone], (instregex "DLR$")>;
@@ -456,7 +468,8 @@ def : InstRW<[FXa2, FXa2, LSU, Lat30, GroupAlone], (instregex "DL(G)?$")>;
def : InstRW<[FXa], (instregex "SLL(G|K)?$")>;
def : InstRW<[FXa], (instregex "SRL(G|K)?$")>;
def : InstRW<[FXa], (instregex "SRA(G|K)?$")>;
-def : InstRW<[FXa], (instregex "SLA(K)?$")>;
+def : InstRW<[FXa], (instregex "SLA(G|K)?$")>;
+def : InstRW<[FXa, FXa, FXa, FXa, Lat8], (instregex "S(L|R)D(A|L)$")>;
// Rotate
def : InstRW<[FXa, LSU, Lat6], (instregex "RLL(G)?$")>;
@@ -505,7 +518,7 @@ def : InstRW<[FXb, Lat2], (instregex "CGFR$")>;
// Compare logical character
def : InstRW<[FXb, LSU, LSU, Lat9, BeginGroup], (instregex "CLC$")>;
-
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLCL(E|U)?$")>;
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLST$")>;
// Test under mask
@@ -516,6 +529,9 @@ def : InstRW<[FXb], (instregex "TMHL(64)?$")>;
def : InstRW<[FXb], (instregex "TMLH(64)?$")>;
def : InstRW<[FXb], (instregex "TMLL(64)?$")>;
+// Compare logical characters under mask
+def : InstRW<[FXb, LSU, Lat5], (instregex "CLM(H|Y)?$")>;
+
//===----------------------------------------------------------------------===//
// Prefetch and execution hint
//===----------------------------------------------------------------------===//
@@ -563,6 +579,42 @@ def : InstRW<[FXb, FXb, LSU, Lat6, GroupAlone], (instregex "STPQ$")>;
def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>;
//===----------------------------------------------------------------------===//
+// Translate and convert
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa, Lat30, GroupAlone], (instregex "TR(T|TR)?(E|EOpt)?$")>;
+def : InstRW<[FXa, Lat30, GroupAlone], (instregex "TR(T|O)(T|O)(Opt)?$")>;
+def : InstRW<[FXa, Lat30, GroupAlone], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>;
+def : InstRW<[FXa, Lat30, GroupAlone], (instregex "(CUUTF|CUTFU)(Opt)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Message-security assist
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa, Lat30, GroupAlone], (instregex "KM(C|F|O|CTR)?$")>;
+def : InstRW<[FXa, Lat30, GroupAlone], (instregex "(KIMD|KLMD|KMAC|PCC|PPNO)$")>;
+
+//===----------------------------------------------------------------------===//
+// Decimal arithmetic
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, VecDF, LSU, Lat30, GroupAlone], (instregex "CVB(Y|G)?$")>;
+def : InstRW<[FXb, VecDF, FXb, Lat30, GroupAlone], (instregex "CVD(Y|G)?$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z|O)$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK(A|U)?$")>;
+
+def : InstRW<[FXb, VecDFX, LSU, LSU, Lat9, GroupAlone],
+ (instregex "(A|S|ZA)P$")>;
+def : InstRW<[FXb, VecDFX2, LSU, LSU, Lat30, GroupAlone],
+ (instregex "(M|D)P$")>;
+def : InstRW<[FXb, FXb, VecDFX2, LSU, LSU, LSU, Lat15, GroupAlone],
+ (instregex "SRP$")>;
+def : InstRW<[VecDFX, LSU, LSU, Lat5, GroupAlone], (instregex "CP$")>;
+def : InstRW<[VecDFX, LSU, Lat4, GroupAlone], (instregex "TP$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>;
+
+//===----------------------------------------------------------------------===//
// Access registers
//===----------------------------------------------------------------------===//
@@ -640,13 +692,30 @@ def : InstRW<[FXa], (instregex "ZEXT128_(32|64)$")>;
// String instructions
def : InstRW<[FXa, LSU, Lat30], (instregex "SRST$")>;
+def : InstRW<[LSU, Lat30], (instregex "SRSTU$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>;
+
+// Various complex instructions
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CFC$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UPT$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CKSM$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CMPSC$")>;
// Move with key
def : InstRW<[FXa, FXa, FXb, LSU, Lat8, GroupAlone], (instregex "MVCK$")>;
+// Monitor call
+def : InstRW<[FXb], (instregex "MC$")>;
+
+// Extract CPU attribute
+def : InstRW<[FXb, Lat30], (instregex "ECAG$")>;
+
// Extract CPU Time
def : InstRW<[FXa, Lat5, LSU], (instregex "ECTG$")>;
+// Extract PSW
+def : InstRW<[FXb, Lat30], (instregex "EPSW$")>;
+
// Execute
def : InstRW<[FXb, GroupAlone], (instregex "EX(RL)?$")>;
@@ -811,14 +880,17 @@ def : InstRW<[VecFPd, LSU], (instregex "D(E|D)B$")>;
def : InstRW<[VecFPd], (instregex "D(E|D)BR$")>;
def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "DXBR$")>;
+// Divide to integer
+def : InstRW<[VecFPd, Lat30, GroupAlone], (instregex "DI(E|D)BR$")>;
+
//===----------------------------------------------------------------------===//
// FP: Comparisons
//===----------------------------------------------------------------------===//
// Compare
-def : InstRW<[VecXsPm, LSU, Lat8], (instregex "C(E|D)B$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "C(E|D)BR?$")>;
-def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "CXBR$")>;
+def : InstRW<[VecXsPm, LSU, Lat8], (instregex "(K|C)(E|D)B$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "(K|C)(E|D)BR?$")>;
+def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "(K|C)XBR$")>;
// Test Data Class
def : InstRW<[LSU, VecXsPm, Lat9], (instregex "TC(E|D)B$")>;
diff --git a/lib/Target/SystemZ/SystemZScheduleZ196.td b/lib/Target/SystemZ/SystemZScheduleZ196.td
index a950e54e7601..670df8ff5541 100644
--- a/lib/Target/SystemZ/SystemZScheduleZ196.td
+++ b/lib/Target/SystemZ/SystemZScheduleZ196.td
@@ -59,6 +59,7 @@ def : WriteRes<Lat30, []> { let Latency = 30; let NumMicroOps = 0;}
def Z196_FXUnit : ProcResource<2>;
def Z196_LSUnit : ProcResource<2>;
def Z196_FPUnit : ProcResource<1>;
+def Z196_DFUnit : ProcResource<1>;
// Subtarget specific definitions of scheduling resources.
def : WriteRes<FXU, [Z196_FXUnit]> { let Latency = 1; }
@@ -66,6 +67,8 @@ def : WriteRes<LSU, [Z196_LSUnit]> { let Latency = 4; }
def : WriteRes<LSU_lat1, [Z196_LSUnit]> { let Latency = 1; }
def : WriteRes<FPU, [Z196_FPUnit]> { let Latency = 8; }
def : WriteRes<FPU2, [Z196_FPUnit, Z196_FPUnit]> { let Latency = 9; }
+def : WriteRes<DFU, [Z196_DFUnit]> { let Latency = 2; }
+def : WriteRes<DFU2, [Z196_DFUnit, Z196_DFUnit]> { let Latency = 3; }
// -------------------------- INSTRUCTIONS ---------------------------------- //
@@ -152,6 +155,7 @@ def : InstRW<[FXU, LSU, Lat5], (instregex "MVI(Y)?$")>;
// Move character
def : InstRW<[LSU, LSU, LSU, FXU, Lat8, GroupAlone], (instregex "MVC$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCL(E|U)?$")>;
// Pseudo -> reg move
def : InstRW<[FXU], (instregex "COPY(_TO_REGCLASS)?$")>;
@@ -226,6 +230,7 @@ def : InstRW<[LSU], (instregex "LLG(C|F|H|T|FRL|HRL)$")>;
def : InstRW<[FXU, LSU, Lat5], (instregex "STC(H|Y|Mux)?$")>;
def : InstRW<[FXU, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "STCM(H|Y)?$")>;
//===----------------------------------------------------------------------===//
// Multi-register moves
@@ -235,6 +240,9 @@ def : InstRW<[FXU, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>;
def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone],
(instregex "LM(H|Y|G)?$")>;
+// Load multiple disjoint
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "LMD$")>;
+
// Store multiple (estimated average of 3 ops)
def : InstRW<[LSU, LSU, FXU, FXU, FXU, Lat10, GroupAlone],
(instregex "STM(H|Y|G)?$")>;
@@ -246,6 +254,7 @@ def : InstRW<[LSU, LSU, FXU, FXU, FXU, Lat10, GroupAlone],
def : InstRW<[FXU], (instregex "LRV(G)?R$")>;
def : InstRW<[FXU, LSU, Lat5], (instregex "LRV(G|H)?$")>;
def : InstRW<[FXU, LSU, Lat5], (instregex "STRV(G|H)?$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCIN$")>;
//===----------------------------------------------------------------------===//
// Load address instructions
@@ -285,7 +294,7 @@ def : InstRW<[FXU], (instregex "IILL(64)?$")>;
// Addition
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat5], (instregex "A(Y|SI)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?(Y|SI)?$")>;
def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "AH(Y)?$")>;
def : InstRW<[FXU], (instregex "AIH$")>;
def : InstRW<[FXU], (instregex "AFI(Mux)?$")>;
@@ -294,15 +303,14 @@ def : InstRW<[FXU], (instregex "AGHI(K)?$")>;
def : InstRW<[FXU], (instregex "AGR(K)?$")>;
def : InstRW<[FXU], (instregex "AHI(K)?$")>;
def : InstRW<[FXU], (instregex "AHIMux(K)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "AL(Y)?$")>;
def : InstRW<[FXU], (instregex "AL(FI|HSIK)$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "ALG(F)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "ALGF$")>;
def : InstRW<[FXU], (instregex "ALGHSIK$")>;
def : InstRW<[FXU], (instregex "ALGF(I|R)$")>;
def : InstRW<[FXU], (instregex "ALGR(K)?$")>;
def : InstRW<[FXU], (instregex "ALR(K)?$")>;
def : InstRW<[FXU], (instregex "AR(K)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "AG(SI)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?G(SI)?$")>;
// Logical addition with carry
def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "ALC(G)?$")>;
@@ -395,11 +403,17 @@ def : InstRW<[FXU, Lat9, GroupAlone], (instregex "MLGR$")>;
def : InstRW<[FXU, Lat5], (instregex "MGHI$")>;
def : InstRW<[FXU, Lat5], (instregex "MHI$")>;
def : InstRW<[FXU, LSU, Lat9], (instregex "MH(Y)?$")>;
+def : InstRW<[FXU, Lat7, GroupAlone], (instregex "M(L)?R$")>;
+def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>;
//===----------------------------------------------------------------------===//
// Division and remainder
//===----------------------------------------------------------------------===//
+def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
+ (instregex "DR$")>;
+def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
+ (instregex "D$")>;
def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
(instregex "DSG(F)?R$")>;
def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, Lat30, GroupAlone],
@@ -416,7 +430,8 @@ def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
def : InstRW<[FXU], (instregex "SLL(G|K)?$")>;
def : InstRW<[FXU], (instregex "SRL(G|K)?$")>;
def : InstRW<[FXU], (instregex "SRA(G|K)?$")>;
-def : InstRW<[FXU, Lat2], (instregex "SLA(K)?$")>;
+def : InstRW<[FXU, Lat2], (instregex "SLA(G|K)?$")>;
+def : InstRW<[FXU, FXU, FXU, FXU, Lat8], (instregex "S(L|R)D(A|L)$")>;
// Rotate
def : InstRW<[FXU, LSU, Lat6], (instregex "RLL(G)?$")>;
@@ -465,7 +480,7 @@ def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "CGFR$")>;
// Compare logical character
def : InstRW<[LSU, LSU, FXU, Lat9, GroupAlone], (instregex "CLC$")>;
-
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLCL(E|U)?$")>;
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLST$")>;
// Test under mask
@@ -476,6 +491,9 @@ def : InstRW<[FXU], (instregex "TMHL(64)?$")>;
def : InstRW<[FXU], (instregex "TMLH(64)?$")>;
def : InstRW<[FXU], (instregex "TMLL(64)?$")>;
+// Compare logical characters under mask
+def : InstRW<[FXU, LSU, Lat5], (instregex "CLM(H|Y)?$")>;
+
//===----------------------------------------------------------------------===//
// Prefetch
//===----------------------------------------------------------------------===//
@@ -520,6 +538,42 @@ def : InstRW<[FXU, FXU, LSU, LSU, Lat6, GroupAlone], (instregex "STPQ$")>;
def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>;
//===----------------------------------------------------------------------===//
+// Translate and convert
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "TR(T|TR)?(E|EOpt)?$")>;
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "TR(T|O)(T|O)(Opt)?$")>;
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>;
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(CUUTF|CUTFU)(Opt)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Message-security assist
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "KM(C|F|O|CTR)?$")>;
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(KIMD|KLMD|KMAC|PCC)$")>;
+
+//===----------------------------------------------------------------------===//
+// Decimal arithmetic
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVB(Y|G)?$")>;
+def : InstRW<[FXU, DFU, FXU, Lat30, GroupAlone], (instregex "CVD(Y|G)?$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z|O)$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK(A|U)?$")>;
+
+def : InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat15, GroupAlone],
+ (instregex "(A|S|ZA)P$")>;
+def : InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat30, GroupAlone],
+ (instregex "(M|D)P$")>;
+def : InstRW<[FXU, FXU, DFU2, LSU, LSU, Lat15, GroupAlone],
+ (instregex "SRP$")>;
+def : InstRW<[DFU2, LSU, LSU, LSU, LSU, Lat11, GroupAlone], (instregex "CP$")>;
+def : InstRW<[DFU2, LSU, LSU, Lat3, GroupAlone], (instregex "TP$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>;
+
+//===----------------------------------------------------------------------===//
// Access registers
//===----------------------------------------------------------------------===//
@@ -571,13 +625,30 @@ def : InstRW<[FXU], (instregex "ZEXT128_(32|64)$")>;
// String instructions
def : InstRW<[FXU, LSU, Lat30], (instregex "SRST$")>;
+def : InstRW<[LSU, Lat30], (instregex "SRSTU$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>;
+
+// Various complex instructions
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CFC$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UPT$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CKSM$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CMPSC$")>;
// Move with key
def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVCK$")>;
+// Monitor call
+def : InstRW<[FXU], (instregex "MC$")>;
+
+// Extract CPU attribute
+def : InstRW<[FXU, Lat30], (instregex "ECAG$")>;
+
// Extract CPU Time
def : InstRW<[FXU, Lat5, LSU], (instregex "ECTG$")>;
+// Extract PSW
+def : InstRW<[FXU, Lat30], (instregex "EPSW$")>;
+
// Execute
def : InstRW<[LSU, GroupAlone], (instregex "EX(RL)?$")>;
@@ -740,14 +811,17 @@ def : InstRW<[FPU, LSU, Lat30], (instregex "D(E|D)B$")>;
def : InstRW<[FPU, Lat30], (instregex "D(E|D)BR$")>;
def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "DXBR$")>;
+// Divide to integer
+def : InstRW<[FPU, Lat30, GroupAlone], (instregex "DI(E|D)BR$")>;
+
//===----------------------------------------------------------------------===//
// FP: Comparisons
//===----------------------------------------------------------------------===//
// Compare
-def : InstRW<[FPU, LSU, Lat12], (instregex "C(E|D)B$")>;
-def : InstRW<[FPU], (instregex "C(E|D)BR$")>;
-def : InstRW<[FPU, FPU, Lat30], (instregex "CXBR$")>;
+def : InstRW<[FPU, LSU, Lat12], (instregex "(K|C)(E|D)B$")>;
+def : InstRW<[FPU], (instregex "(K|C)(E|D)BR$")>;
+def : InstRW<[FPU, FPU, Lat30], (instregex "(K|C)XBR$")>;
// Test Data Class
def : InstRW<[FPU, LSU, Lat15], (instregex "TC(E|D)B$")>;
diff --git a/lib/Target/SystemZ/SystemZScheduleZEC12.td b/lib/Target/SystemZ/SystemZScheduleZEC12.td
index 8ab6c826f1ed..1bdb8779dc72 100644
--- a/lib/Target/SystemZ/SystemZScheduleZEC12.td
+++ b/lib/Target/SystemZ/SystemZScheduleZEC12.td
@@ -59,6 +59,7 @@ def : WriteRes<Lat30, []> { let Latency = 30; let NumMicroOps = 0;}
def ZEC12_FXUnit : ProcResource<2>;
def ZEC12_LSUnit : ProcResource<2>;
def ZEC12_FPUnit : ProcResource<1>;
+def ZEC12_DFUnit : ProcResource<1>;
def ZEC12_VBUnit : ProcResource<1>;
// Subtarget specific definitions of scheduling resources.
@@ -67,6 +68,8 @@ def : WriteRes<LSU, [ZEC12_LSUnit]> { let Latency = 4; }
def : WriteRes<LSU_lat1, [ZEC12_LSUnit]> { let Latency = 1; }
def : WriteRes<FPU, [ZEC12_FPUnit]> { let Latency = 8; }
def : WriteRes<FPU2, [ZEC12_FPUnit, ZEC12_FPUnit]> { let Latency = 9; }
+def : WriteRes<DFU, [ZEC12_DFUnit]> { let Latency = 2; }
+def : WriteRes<DFU2, [ZEC12_DFUnit, ZEC12_DFUnit]> { let Latency = 3; }
def : WriteRes<VBU, [ZEC12_VBUnit]>; // Virtual Branching Unit
// -------------------------- INSTRUCTIONS ---------------------------------- //
@@ -155,6 +158,7 @@ def : InstRW<[FXU, LSU, Lat5], (instregex "MVI(Y)?$")>;
// Move character
def : InstRW<[LSU, LSU, LSU, FXU, Lat8, GroupAlone], (instregex "MVC$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCL(E|U)?$")>;
// Pseudo -> reg move
def : InstRW<[FXU], (instregex "COPY(_TO_REGCLASS)?$")>;
@@ -236,6 +240,7 @@ def : InstRW<[FXU, LSU, Lat5], (instregex "LLG(F|T)?AT$")>;
def : InstRW<[FXU, LSU, Lat5], (instregex "STC(H|Y|Mux)?$")>;
def : InstRW<[FXU, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "STCM(H|Y)?$")>;
//===----------------------------------------------------------------------===//
// Multi-register moves
@@ -245,6 +250,9 @@ def : InstRW<[FXU, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>;
def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone],
(instregex "LM(H|Y|G)?$")>;
+// Load multiple disjoint
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "LMD$")>;
+
// Store multiple (estimated average of 3 ops)
def : InstRW<[LSU, LSU, FXU, FXU, FXU, Lat10, GroupAlone],
(instregex "STM(H|Y|G)?$")>;
@@ -256,6 +264,7 @@ def : InstRW<[LSU, LSU, FXU, FXU, FXU, Lat10, GroupAlone],
def : InstRW<[FXU], (instregex "LRV(G)?R$")>;
def : InstRW<[FXU, LSU, Lat5], (instregex "LRV(G|H)?$")>;
def : InstRW<[FXU, LSU, Lat5], (instregex "STRV(G|H)?$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCIN$")>;
//===----------------------------------------------------------------------===//
// Load address instructions
@@ -295,7 +304,7 @@ def : InstRW<[FXU], (instregex "IILL(64)?$")>;
// Addition
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat5], (instregex "A(Y|SI)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?(Y|SI)?$")>;
def : InstRW<[FXU, LSU, Lat6], (instregex "AH(Y)?$")>;
def : InstRW<[FXU], (instregex "AIH$")>;
def : InstRW<[FXU], (instregex "AFI(Mux)?$")>;
@@ -304,15 +313,14 @@ def : InstRW<[FXU], (instregex "AGHI(K)?$")>;
def : InstRW<[FXU], (instregex "AGR(K)?$")>;
def : InstRW<[FXU], (instregex "AHI(K)?$")>;
def : InstRW<[FXU], (instregex "AHIMux(K)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "AL(Y)?$")>;
def : InstRW<[FXU], (instregex "AL(FI|HSIK)$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "ALG(F)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "ALGF$")>;
def : InstRW<[FXU], (instregex "ALGHSIK$")>;
def : InstRW<[FXU], (instregex "ALGF(I|R)$")>;
def : InstRW<[FXU], (instregex "ALGR(K)?$")>;
def : InstRW<[FXU], (instregex "ALR(K)?$")>;
def : InstRW<[FXU], (instregex "AR(K)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "AG(SI)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?G(SI)?$")>;
// Logical addition with carry
def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "ALC(G)?$")>;
@@ -405,11 +413,17 @@ def : InstRW<[FXU, Lat9, GroupAlone], (instregex "MLGR$")>;
def : InstRW<[FXU, Lat5], (instregex "MGHI$")>;
def : InstRW<[FXU, Lat5], (instregex "MHI$")>;
def : InstRW<[FXU, LSU, Lat9], (instregex "MH(Y)?$")>;
+def : InstRW<[FXU, Lat7, GroupAlone], (instregex "M(L)?R$")>;
+def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>;
//===----------------------------------------------------------------------===//
// Division and remainder
//===----------------------------------------------------------------------===//
+def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
+ (instregex "DR$")>;
+def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
+ (instregex "D$")>;
def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
(instregex "DSG(F)?R$")>;
def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, Lat30, GroupAlone],
@@ -426,7 +440,8 @@ def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
def : InstRW<[FXU], (instregex "SLL(G|K)?$")>;
def : InstRW<[FXU], (instregex "SRL(G|K)?$")>;
def : InstRW<[FXU], (instregex "SRA(G|K)?$")>;
-def : InstRW<[FXU], (instregex "SLA(K)?$")>;
+def : InstRW<[FXU], (instregex "SLA(G|K)?$")>;
+def : InstRW<[FXU, FXU, FXU, FXU, Lat8], (instregex "S(L|R)D(A|L)$")>;
// Rotate
def : InstRW<[FXU, LSU, Lat6], (instregex "RLL(G)?$")>;
@@ -475,7 +490,7 @@ def : InstRW<[FXU, Lat2], (instregex "CGFR$")>;
// Compare logical character
def : InstRW<[FXU, LSU, LSU, Lat9, GroupAlone], (instregex "CLC$")>;
-
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLCL(E|U)?$")>;
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLST$")>;
// Test under mask
@@ -486,6 +501,9 @@ def : InstRW<[FXU], (instregex "TMHL(64)?$")>;
def : InstRW<[FXU], (instregex "TMLH(64)?$")>;
def : InstRW<[FXU], (instregex "TMLL(64)?$")>;
+// Compare logical characters under mask
+def : InstRW<[FXU, LSU, Lat5], (instregex "CLM(H|Y)?$")>;
+
//===----------------------------------------------------------------------===//
// Prefetch and execution hint
//===----------------------------------------------------------------------===//
@@ -532,6 +550,42 @@ def : InstRW<[FXU, FXU, LSU, LSU, Lat6, GroupAlone], (instregex "STPQ$")>;
def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>;
//===----------------------------------------------------------------------===//
+// Translate and convert
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "TR(T|TR)?(E|EOpt)?$")>;
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "TR(T|O)(T|O)(Opt)?$")>;
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>;
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(CUUTF|CUTFU)(Opt)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Message-security assist
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "KM(C|F|O|CTR)?$")>;
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(KIMD|KLMD|KMAC|PCC)$")>;
+
+//===----------------------------------------------------------------------===//
+// Decimal arithmetic
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVB(Y|G)?$")>;
+def : InstRW<[FXU, DFU, FXU, Lat30, GroupAlone], (instregex "CVD(Y|G)?$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z|O)$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK(A|U)?$")>;
+
+def : InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat15, GroupAlone],
+ (instregex "(A|S|ZA)P$")>;
+def : InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat30, GroupAlone],
+ (instregex "(M|D)P$")>;
+def : InstRW<[FXU, FXU, DFU2, LSU, LSU, Lat15, GroupAlone],
+ (instregex "SRP$")>;
+def : InstRW<[DFU2, LSU, LSU, LSU, LSU, Lat11, GroupAlone], (instregex "CP$")>;
+def : InstRW<[DFU2, LSU, LSU, Lat3, GroupAlone], (instregex "TP$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>;
+
+//===----------------------------------------------------------------------===//
// Access registers
//===----------------------------------------------------------------------===//
@@ -609,13 +663,30 @@ def : InstRW<[FXU], (instregex "ZEXT128_(32|64)$")>;
// String instructions
def : InstRW<[FXU, LSU, Lat30], (instregex "SRST$")>;
+def : InstRW<[LSU, Lat30], (instregex "SRSTU$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>;
+
+// Various complex instructions
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CFC$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UPT$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CKSM$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CMPSC$")>;
// Move with key
def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVCK$")>;
+// Monitor call
+def : InstRW<[FXU], (instregex "MC$")>;
+
+// Extract CPU attribute
+def : InstRW<[FXU, Lat30], (instregex "ECAG$")>;
+
// Extract CPU Time
def : InstRW<[FXU, Lat5, LSU], (instregex "ECTG$")>;
+// Extract PSW
+def : InstRW<[FXU, Lat30], (instregex "EPSW$")>;
+
// Execute
def : InstRW<[LSU, GroupAlone], (instregex "EX(RL)?$")>;
@@ -778,14 +849,17 @@ def : InstRW<[FPU, LSU, Lat30], (instregex "D(E|D)B$")>;
def : InstRW<[FPU, Lat30], (instregex "D(E|D)BR$")>;
def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "DXBR$")>;
+// Divide to integer
+def : InstRW<[FPU, Lat30, GroupAlone], (instregex "DI(E|D)BR$")>;
+
//===----------------------------------------------------------------------===//
// FP: Comparisons
//===----------------------------------------------------------------------===//
// Compare
-def : InstRW<[FPU, LSU, Lat12], (instregex "C(E|D)B$")>;
-def : InstRW<[FPU], (instregex "C(E|D)BR$")>;
-def : InstRW<[FPU, FPU, Lat30], (instregex "CXBR$")>;
+def : InstRW<[FPU, LSU, Lat12], (instregex "(K|C)(E|D)B$")>;
+def : InstRW<[FPU], (instregex "(K|C)(E|D)BR$")>;
+def : InstRW<[FPU, FPU, Lat30], (instregex "(K|C)XBR$")>;
// Test Data Class
def : InstRW<[FPU, LSU, Lat15], (instregex "TC(E|D)B$")>;
diff --git a/lib/Target/SystemZ/SystemZSubtarget.cpp b/lib/Target/SystemZ/SystemZSubtarget.cpp
index ce07ea3318a5..022679a7bc18 100644
--- a/lib/Target/SystemZ/SystemZSubtarget.cpp
+++ b/lib/Target/SystemZ/SystemZSubtarget.cpp
@@ -37,12 +37,13 @@ SystemZSubtarget::SystemZSubtarget(const Triple &TT, const std::string &CPU,
const TargetMachine &TM)
: SystemZGenSubtargetInfo(TT, CPU, FS), HasDistinctOps(false),
HasLoadStoreOnCond(false), HasHighWord(false), HasFPExtension(false),
- HasPopulationCount(false), HasFastSerialization(false),
- HasInterlockedAccess1(false), HasMiscellaneousExtensions(false),
+ HasPopulationCount(false), HasMessageSecurityAssist4(false),
+ HasFastSerialization(false), HasInterlockedAccess1(false),
+ HasMiscellaneousExtensions(false),
HasExecutionHint(false), HasLoadAndTrap(false),
HasTransactionalExecution(false), HasProcessorAssist(false),
HasVector(false), HasLoadStoreOnCond2(false),
- HasLoadAndZeroRightmostByte(false),
+ HasLoadAndZeroRightmostByte(false), HasMessageSecurityAssist5(false),
TargetTriple(TT), InstrInfo(initializeSubtargetDependencies(CPU, FS)),
TLInfo(TM, *this), TSInfo(), FrameLowering() {}
diff --git a/lib/Target/SystemZ/SystemZSubtarget.h b/lib/Target/SystemZ/SystemZSubtarget.h
index cdb61327a16a..770dd7cd939f 100644
--- a/lib/Target/SystemZ/SystemZSubtarget.h
+++ b/lib/Target/SystemZ/SystemZSubtarget.h
@@ -39,6 +39,7 @@ protected:
bool HasHighWord;
bool HasFPExtension;
bool HasPopulationCount;
+ bool HasMessageSecurityAssist4;
bool HasFastSerialization;
bool HasInterlockedAccess1;
bool HasMiscellaneousExtensions;
@@ -49,6 +50,7 @@ protected:
bool HasVector;
bool HasLoadStoreOnCond2;
bool HasLoadAndZeroRightmostByte;
+ bool HasMessageSecurityAssist5;
private:
Triple TargetTriple;
@@ -104,6 +106,10 @@ public:
// Return true if the target has the population-count facility.
bool hasPopulationCount() const { return HasPopulationCount; }
+ // Return true if the target has the message-security-assist
+ // extension facility 4.
+ bool hasMessageSecurityAssist4() const { return HasMessageSecurityAssist4; }
+
// Return true if the target has the fast-serialization facility.
bool hasFastSerialization() const { return HasFastSerialization; }
@@ -132,6 +138,10 @@ public:
return HasLoadAndZeroRightmostByte;
}
+ // Return true if the target has the message-security-assist
+ // extension facility 5.
+ bool hasMessageSecurityAssist5() const { return HasMessageSecurityAssist5; }
+
// Return true if the target has the vector facility.
bool hasVector() const { return HasVector; }
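
Both new getters follow the file's existing facility-flag pattern: a protected bool set from the CPU feature string, exposed through a const accessor that TableGen predicates and C++ lowering code can query. An illustrative, hypothetical query site, with only the getter names taken from the patch:

    // Hypothetical guard: only select MSA-4/MSA-5 instructions (e.g. the
    // KM* variants and PPNO scheduled above) when the facility is present.
    if (Subtarget.hasMessageSecurityAssist4() ||
        Subtarget.hasMessageSecurityAssist5()) {
      // ... facility-dependent selection ...
    }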
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrCall.td b/lib/Target/WebAssembly/WebAssemblyInstrCall.td
index 73d1d4be293b..6b45839c14b0 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrCall.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrCall.td
@@ -19,8 +19,8 @@ let Defs = [ARGUMENTS] in {
// Call sequence markers. These have an immediate which represents the amount of
// stack space to allocate or free, which is used for varargs lowering.
let Uses = [SP32, SP64], Defs = [SP32, SP64], isCodeGenOnly = 1 in {
-def ADJCALLSTACKDOWN : I<(outs), (ins i32imm:$amt),
- [(WebAssemblycallseq_start timm:$amt)]>;
+def ADJCALLSTACKDOWN : I<(outs), (ins i32imm:$amt, i32imm:$amt2),
+ [(WebAssemblycallseq_start timm:$amt, timm:$amt2)]>;
def ADJCALLSTACKUP : I<(outs), (ins i32imm:$amt, i32imm:$amt2),
[(WebAssemblycallseq_end timm:$amt, timm:$amt2)]>;
} // isCodeGenOnly = 1
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index a601b575f579..fa2146f7db84 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -25,7 +25,8 @@ def HasSIMD128 : Predicate<"Subtarget->hasSIMD128()">,
// WebAssembly-specific DAG Node Types.
//===----------------------------------------------------------------------===//
-def SDT_WebAssemblyCallSeqStart : SDCallSeqStart<[SDTCisVT<0, iPTR>]>;
+def SDT_WebAssemblyCallSeqStart : SDCallSeqStart<[SDTCisVT<0, iPTR>,
+ SDTCisVT<1, iPTR>]>;
def SDT_WebAssemblyCallSeqEnd :
SDCallSeqEnd<[SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
def SDT_WebAssemblyCall0 : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 784c3a6557ff..3a421fe77392 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -235,6 +235,8 @@ def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true",
"LEA instruction needs inputs at AG stage">;
def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true",
"LEA instruction with certain arguments is slow">;
+def FeatureSlow3OpsLEA : SubtargetFeature<"slow-3ops-lea", "Slow3OpsLEA", "true",
+ "LEA instruction with 3 ops or certain registers is slow">;
def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
"INC and DEC instructions are slower than ADD and SUB">;
def FeatureSoftFloat
@@ -480,6 +482,7 @@ def SNBFeatures : ProcessorFeatures<[], [
FeatureXSAVE,
FeatureXSAVEOPT,
FeatureLAHFSAHF,
+ FeatureSlow3OpsLEA,
FeatureFastScalarFSQRT,
FeatureFastSHLDRotate
]>;
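
FeatureSlow3OpsLEA is consumed by the X86FixupLEAs changes later in this patch; the pass now runs whenever any LEA tuning flag is set, per the gating line from the runOnMachineFunction hunk below:

    OptLEA = ST.LEAusesAG() || ST.slowLEA() || ST.slow3OpsLEA();

Because the feature string is "slow-3ops-lea", the new path can also be exercised in isolation with -mattr=+slow-3ops-lea.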
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index ebd179e786da..fc3b4836c178 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -180,44 +180,6 @@ private:
} // end anonymous namespace.
-static std::pair<X86::CondCode, bool>
-getX86ConditionCode(CmpInst::Predicate Predicate) {
- X86::CondCode CC = X86::COND_INVALID;
- bool NeedSwap = false;
- switch (Predicate) {
- default: break;
- // Floating-point Predicates
- case CmpInst::FCMP_UEQ: CC = X86::COND_E; break;
- case CmpInst::FCMP_OLT: NeedSwap = true; LLVM_FALLTHROUGH;
- case CmpInst::FCMP_OGT: CC = X86::COND_A; break;
- case CmpInst::FCMP_OLE: NeedSwap = true; LLVM_FALLTHROUGH;
- case CmpInst::FCMP_OGE: CC = X86::COND_AE; break;
- case CmpInst::FCMP_UGT: NeedSwap = true; LLVM_FALLTHROUGH;
- case CmpInst::FCMP_ULT: CC = X86::COND_B; break;
- case CmpInst::FCMP_UGE: NeedSwap = true; LLVM_FALLTHROUGH;
- case CmpInst::FCMP_ULE: CC = X86::COND_BE; break;
- case CmpInst::FCMP_ONE: CC = X86::COND_NE; break;
- case CmpInst::FCMP_UNO: CC = X86::COND_P; break;
- case CmpInst::FCMP_ORD: CC = X86::COND_NP; break;
- case CmpInst::FCMP_OEQ: LLVM_FALLTHROUGH;
- case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break;
-
- // Integer Predicates
- case CmpInst::ICMP_EQ: CC = X86::COND_E; break;
- case CmpInst::ICMP_NE: CC = X86::COND_NE; break;
- case CmpInst::ICMP_UGT: CC = X86::COND_A; break;
- case CmpInst::ICMP_UGE: CC = X86::COND_AE; break;
- case CmpInst::ICMP_ULT: CC = X86::COND_B; break;
- case CmpInst::ICMP_ULE: CC = X86::COND_BE; break;
- case CmpInst::ICMP_SGT: CC = X86::COND_G; break;
- case CmpInst::ICMP_SGE: CC = X86::COND_GE; break;
- case CmpInst::ICMP_SLT: CC = X86::COND_L; break;
- case CmpInst::ICMP_SLE: CC = X86::COND_LE; break;
- }
-
- return std::make_pair(CC, NeedSwap);
-}
-
static std::pair<unsigned, bool>
getX86SSEConditionCode(CmpInst::Predicate Predicate) {
unsigned CC;
@@ -1559,7 +1521,7 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) {
X86::CondCode CC;
bool SwapArgs;
- std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate);
+ std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate);
assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
unsigned Opc = X86::getSETFromCond(CC);
@@ -1697,7 +1659,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
bool SwapArgs;
unsigned BranchOpc;
- std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate);
+ std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate);
assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
BranchOpc = X86::GetCondBranchFromCond(CC);
@@ -2070,7 +2032,7 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
}
bool NeedSwap;
- std::tie(CC, NeedSwap) = getX86ConditionCode(Predicate);
+ std::tie(CC, NeedSwap) = X86::getX86ConditionCode(Predicate);
assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
const Value *CmpLHS = CI->getOperand(0);
@@ -2319,7 +2281,7 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
const auto *CI = dyn_cast<CmpInst>(Cond);
if (CI && (CI->getParent() == I->getParent())) {
bool NeedSwap;
- std::tie(CC, NeedSwap) = getX86ConditionCode(CI->getPredicate());
+ std::tie(CC, NeedSwap) = X86::getX86ConditionCode(CI->getPredicate());
if (CC > X86::LAST_VALID_COND)
return false;
@@ -3293,7 +3255,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
// Issue CALLSEQ_START
unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
- .addImm(NumBytes).addImm(0);
+ .addImm(NumBytes).addImm(0).addImm(0);
// Walk the register/memloc assignments, inserting copies/loads.
const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
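
The condition-code table is deleted here rather than rewritten: every caller above now uses X86::getX86ConditionCode, so the helper presumably moved next to the other X86-namespace condition utilities (such as X86::getSETFromCond, already used in these hunks). The call-site shape is unchanged:

    // Pattern used by the hunks above after the move.
    X86::CondCode CC;
    bool NeedSwap;
    std::tie(CC, NeedSwap) = X86::getX86ConditionCode(Predicate);
    if (CC > X86::LAST_VALID_COND)
      return false; // no direct lowering for this predicate (OEQ/UNE)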
diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp
index 2cd4c1a3e7b3..9f649dad8bc0 100644
--- a/lib/Target/X86/X86FixupLEAs.cpp
+++ b/lib/Target/X86/X86FixupLEAs.cpp
@@ -27,20 +27,26 @@
#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;
-#define DEBUG_TYPE "x86-fixup-LEAs"
+namespace llvm {
+void initializeFixupLEAPassPass(PassRegistry &);
+}
+
+#define FIXUPLEA_DESC "X86 LEA Fixup"
+#define FIXUPLEA_NAME "x86-fixup-LEAs"
+
+#define DEBUG_TYPE FIXUPLEA_NAME
STATISTIC(NumLEAs, "Number of LEA instructions created");
namespace {
class FixupLEAPass : public MachineFunctionPass {
enum RegUsageState { RU_NotUsed, RU_Write, RU_Read };
- static char ID;
+
/// \brief Loop over all of the instructions in the basic block
/// replacing applicable instructions with LEA instructions,
/// where appropriate.
bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI);
- StringRef getPassName() const override { return "X86 LEA Fixup"; }
/// \brief Given a machine register, look for the instruction
/// which writes it in the current basic block. If found,
@@ -62,6 +68,22 @@ class FixupLEAPass : public MachineFunctionPass {
void processInstructionForSLM(MachineBasicBlock::iterator &I,
MachineFunction::iterator MFI);
+
+ /// \brief Given a LEA instruction which is unprofitable
+ /// on SNB+ try to replace it with other instructions.
+ /// According to Intel's Optimization Reference Manual:
+ /// " For LEA instructions with three source operands and some specific
+ /// situations, instruction latency has increased to 3 cycles, and must
+ /// dispatch via port 1:
+ /// - LEA that has all three source operands: base, index, and offset
+ /// - LEA that uses base and index registers where the base is EBP, RBP,
+ /// or R13
+ /// - LEA that uses RIP relative addressing mode
+ /// - LEA that uses 16-bit addressing mode "
+ /// This function currently handles the first 2 cases only.
+ MachineInstr *processInstrForSlow3OpLEA(MachineInstr &MI,
+ MachineFunction::iterator MFI);
+
/// \brief Look for LEAs that add 1 to reg or subtract 1 from reg
/// and convert them to INC or DEC respectively.
bool fixupIncDec(MachineBasicBlock::iterator &I,
@@ -85,7 +107,13 @@ class FixupLEAPass : public MachineFunctionPass {
MachineBasicBlock::iterator &MBBI) const;
public:
- FixupLEAPass() : MachineFunctionPass(ID) {}
+ static char ID;
+
+ StringRef getPassName() const override { return FIXUPLEA_DESC; }
+
+ FixupLEAPass() : MachineFunctionPass(ID) {
+ initializeFixupLEAPassPass(*PassRegistry::getPassRegistry());
+ }
/// \brief Loop over all of the basic blocks,
/// replacing instructions by equivalent LEA instructions
@@ -104,9 +132,12 @@ private:
bool OptIncDec;
bool OptLEA;
};
-char FixupLEAPass::ID = 0;
}
+char FixupLEAPass::ID = 0;
+
+INITIALIZE_PASS(FixupLEAPass, FIXUPLEA_NAME, FIXUPLEA_DESC, false, false)
+
MachineInstr *
FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
MachineBasicBlock::iterator &MBBI) const {
@@ -168,7 +199,7 @@ bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) {
MF = &Func;
const X86Subtarget &ST = Func.getSubtarget<X86Subtarget>();
OptIncDec = !ST.slowIncDec() || Func.getFunction()->optForMinSize();
- OptLEA = ST.LEAusesAG() || ST.slowLEA();
+ OptLEA = ST.LEAusesAG() || ST.slowLEA() || ST.slow3OpsLEA();
if (!OptLEA && !OptIncDec)
return false;
@@ -242,9 +273,64 @@ FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I,
return MachineBasicBlock::iterator();
}
-static inline bool isLEA(const int opcode) {
- return opcode == X86::LEA16r || opcode == X86::LEA32r ||
- opcode == X86::LEA64r || opcode == X86::LEA64_32r;
+static inline bool isLEA(const int Opcode) {
+ return Opcode == X86::LEA16r || Opcode == X86::LEA32r ||
+ Opcode == X86::LEA64r || Opcode == X86::LEA64_32r;
+}
+
+static inline bool isInefficientLEAReg(unsigned int Reg) {
+ return Reg == X86::EBP || Reg == X86::RBP || Reg == X86::R13;
+}
+
+static inline bool isRegOperand(const MachineOperand &Op) {
+ return Op.isReg() && Op.getReg() != X86::NoRegister;
+}
+/// hasInefficientLEABaseReg - LEA that uses base and index registers
+/// where the base is EBP, RBP, or R13
+static inline bool hasInefficientLEABaseReg(const MachineOperand &Base,
+ const MachineOperand &Index) {
+ return Base.isReg() && isInefficientLEAReg(Base.getReg()) &&
+ isRegOperand(Index);
+}
+
+static inline bool hasLEAOffset(const MachineOperand &Offset) {
+ return (Offset.isImm() && Offset.getImm() != 0) || Offset.isGlobal();
+}
+
+// LEA instruction that has all three operands: offset, base and index
+static inline bool isThreeOperandsLEA(const MachineOperand &Base,
+ const MachineOperand &Index,
+ const MachineOperand &Offset) {
+ return isRegOperand(Base) && isRegOperand(Index) && hasLEAOffset(Offset);
+}
+
+static inline int getADDrrFromLEA(int LEAOpcode) {
+ switch (LEAOpcode) {
+ default:
+ llvm_unreachable("Unexpected LEA instruction");
+ case X86::LEA16r:
+ return X86::ADD16rr;
+ case X86::LEA32r:
+ return X86::ADD32rr;
+ case X86::LEA64_32r:
+ case X86::LEA64r:
+ return X86::ADD64rr;
+ }
+}
+
+static inline int getADDriFromLEA(int LEAOpcode, const MachineOperand &Offset) {
+ bool IsInt8 = Offset.isImm() && isInt<8>(Offset.getImm());
+ switch (LEAOpcode) {
+ default:
+ llvm_unreachable("Unexpected LEA instruction");
+ case X86::LEA16r:
+ return IsInt8 ? X86::ADD16ri8 : X86::ADD16ri;
+ case X86::LEA32r:
+ case X86::LEA64_32r:
+ return IsInt8 ? X86::ADD32ri8 : X86::ADD32ri;
+ case X86::LEA64r:
+ return IsInt8 ? X86::ADD64ri8 : X86::ADD64ri32;
+ }
}
/// isLEASimpleIncOrDec - Does this LEA have one of these forms:
@@ -337,8 +423,8 @@ void FixupLEAPass::seekLEAFixup(MachineOperand &p,
void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
MachineFunction::iterator MFI) {
MachineInstr &MI = *I;
- const int opcode = MI.getOpcode();
- if (!isLEA(opcode))
+ const int Opcode = MI.getOpcode();
+ if (!isLEA(Opcode))
return;
if (MI.getOperand(5).getReg() != 0 || !MI.getOperand(4).isImm() ||
!TII->isSafeToClobberEFLAGS(*MFI, I))
@@ -350,53 +436,142 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
return;
if (MI.getOperand(2).getImm() > 1)
return;
- int addrr_opcode, addri_opcode;
- switch (opcode) {
- default:
- llvm_unreachable("Unexpected LEA instruction");
- case X86::LEA16r:
- addrr_opcode = X86::ADD16rr;
- addri_opcode = X86::ADD16ri;
- break;
- case X86::LEA32r:
- addrr_opcode = X86::ADD32rr;
- addri_opcode = X86::ADD32ri;
- break;
- case X86::LEA64_32r:
- case X86::LEA64r:
- addrr_opcode = X86::ADD64rr;
- addri_opcode = X86::ADD64ri32;
- break;
- }
DEBUG(dbgs() << "FixLEA: Candidate to replace:"; I->dump(););
DEBUG(dbgs() << "FixLEA: Replaced by: ";);
MachineInstr *NewMI = nullptr;
- const MachineOperand &Dst = MI.getOperand(0);
// Make ADD instruction for two registers writing to LEA's destination
if (SrcR1 != 0 && SrcR2 != 0) {
- const MachineOperand &Src1 = MI.getOperand(SrcR1 == DstR ? 1 : 3);
- const MachineOperand &Src2 = MI.getOperand(SrcR1 == DstR ? 3 : 1);
- NewMI = BuildMI(*MF, MI.getDebugLoc(), TII->get(addrr_opcode))
- .add(Dst)
- .add(Src1)
- .add(Src2);
- MFI->insert(I, NewMI);
+ const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(Opcode));
+ const MachineOperand &Src = MI.getOperand(SrcR1 == DstR ? 3 : 1);
+ NewMI =
+ BuildMI(*MFI, I, MI.getDebugLoc(), ADDrr, DstR).addReg(DstR).add(Src);
DEBUG(NewMI->dump(););
}
// Make ADD instruction for immediate
if (MI.getOperand(4).getImm() != 0) {
+ const MCInstrDesc &ADDri =
+ TII->get(getADDriFromLEA(Opcode, MI.getOperand(4)));
const MachineOperand &SrcR = MI.getOperand(SrcR1 == DstR ? 1 : 3);
- NewMI = BuildMI(*MF, MI.getDebugLoc(), TII->get(addri_opcode))
- .add(Dst)
+ NewMI = BuildMI(*MFI, I, MI.getDebugLoc(), ADDri, DstR)
.add(SrcR)
.addImm(MI.getOperand(4).getImm());
- MFI->insert(I, NewMI);
DEBUG(NewMI->dump(););
}
if (NewMI) {
MFI->erase(I);
- I = static_cast<MachineBasicBlock::iterator>(NewMI);
+ I = NewMI;
+ }
+}
+
+MachineInstr *
+FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI,
+ MachineFunction::iterator MFI) {
+
+ const int LEAOpcode = MI.getOpcode();
+ if (!isLEA(LEAOpcode))
+ return nullptr;
+
+ const MachineOperand &Dst = MI.getOperand(0);
+ const MachineOperand &Base = MI.getOperand(1);
+ const MachineOperand &Scale = MI.getOperand(2);
+ const MachineOperand &Index = MI.getOperand(3);
+ const MachineOperand &Offset = MI.getOperand(4);
+ const MachineOperand &Segment = MI.getOperand(5);
+
+ if (!(isThreeOperandsLEA(Base, Index, Offset) ||
+ hasInefficientLEABaseReg(Base, Index)) ||
+ !TII->isSafeToClobberEFLAGS(*MFI, MI) ||
+ Segment.getReg() != X86::NoRegister)
+ return nullptr;
+
+ unsigned int DstR = Dst.getReg();
+ unsigned int BaseR = Base.getReg();
+ unsigned int IndexR = Index.getReg();
+ unsigned SSDstR =
+ (LEAOpcode == X86::LEA64_32r) ? getX86SubSuperRegister(DstR, 64) : DstR;
+ bool IsScale1 = Scale.getImm() == 1;
+ bool IsInefficientBase = isInefficientLEAReg(BaseR);
+ bool IsInefficientIndex = isInefficientLEAReg(IndexR);
+
+ // Skip these cases since it takes more than 2 instructions
+ // to replace the LEA instruction.
+ if (IsInefficientBase && SSDstR == BaseR && !IsScale1)
+ return nullptr;
+ if (LEAOpcode == X86::LEA64_32r && IsInefficientBase &&
+ (IsInefficientIndex || !IsScale1))
+ return nullptr;
+
+ const DebugLoc DL = MI.getDebugLoc();
+ const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(LEAOpcode));
+ const MCInstrDesc &ADDri = TII->get(getADDriFromLEA(LEAOpcode, Offset));
+
+ DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MI.dump(););
+ DEBUG(dbgs() << "FixLEA: Replaced by: ";);
+
+ // First try to replace LEA with one or two (for the 3-op LEA case)
+ // add instructions:
+  // 1. lea (%base,%index,1), %base => add %index,%base
+  // 2. lea (%base,%index,1), %index => add %base,%index
+ if (IsScale1 && (DstR == BaseR || DstR == IndexR)) {
+ const MachineOperand &Src = DstR == BaseR ? Index : Base;
+ MachineInstr *NewMI =
+ BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Src);
+ DEBUG(NewMI->dump(););
+ // Create ADD instruction for the Offset in case of 3-Ops LEA.
+ if (hasLEAOffset(Offset)) {
+ NewMI = BuildMI(*MFI, MI, DL, ADDri, DstR).addReg(DstR).add(Offset);
+ DEBUG(NewMI->dump(););
+ }
+ return NewMI;
+ }
+ // If the base is inefficient try switching the index and base operands,
+ // otherwise just break the 3-Ops LEA inst into 2-Ops LEA + ADD instruction:
+ // lea offset(%base,%index,scale),%dst =>
+  // lea (%base,%index,scale),%dst; add offset,%dst
+ if (!IsInefficientBase || (!IsInefficientIndex && IsScale1)) {
+ MachineInstr *NewMI = BuildMI(*MFI, MI, DL, TII->get(LEAOpcode))
+ .add(Dst)
+ .add(IsInefficientBase ? Index : Base)
+ .add(Scale)
+ .add(IsInefficientBase ? Base : Index)
+ .addImm(0)
+ .add(Segment);
+ DEBUG(NewMI->dump(););
+ // Create ADD instruction for the Offset in case of 3-Ops LEA.
+ if (hasLEAOffset(Offset)) {
+ NewMI = BuildMI(*MFI, MI, DL, ADDri, DstR).addReg(DstR).add(Offset);
+ DEBUG(NewMI->dump(););
+ }
+ return NewMI;
+ }
+ // Handle the rest of the cases with inefficient base register:
+ assert(SSDstR != BaseR && "SSDstR == BaseR should be handled already!");
+ assert(IsInefficientBase && "efficient base should be handled already!");
+
+ // lea (%base,%index,1), %dst => mov %base,%dst; add %index,%dst
+ if (IsScale1 && !hasLEAOffset(Offset)) {
+ TII->copyPhysReg(*MFI, MI, DL, DstR, BaseR, Base.isKill());
+ DEBUG(MI.getPrevNode()->dump(););
+
+ MachineInstr *NewMI =
+ BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Index);
+ DEBUG(NewMI->dump(););
+ return NewMI;
}
+ // lea offset(%base,%index,scale), %dst =>
+ // lea offset( ,%index,scale), %dst; add %base,%dst
+ MachineInstr *NewMI = BuildMI(*MFI, MI, DL, TII->get(LEAOpcode))
+ .add(Dst)
+ .addReg(0)
+ .add(Scale)
+ .add(Index)
+ .add(Offset)
+ .add(Segment);
+ DEBUG(NewMI->dump(););
+
+ NewMI = BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Base);
+ DEBUG(NewMI->dump(););
+ return NewMI;
}
bool FixupLEAPass::processBasicBlock(MachineFunction &MF,
@@ -410,8 +585,16 @@ bool FixupLEAPass::processBasicBlock(MachineFunction &MF,
if (OptLEA) {
if (MF.getSubtarget<X86Subtarget>().isSLM())
processInstructionForSLM(I, MFI);
- else
- processInstruction(I, MFI);
+
+ else {
+ if (MF.getSubtarget<X86Subtarget>().slow3OpsLEA()) {
+ if (auto *NewMI = processInstrForSlow3OpLEA(*I, MFI)) {
+ MFI->erase(I);
+ I = NewMI;
+ }
+ } else
+ processInstruction(I, MFI);
+ }
}
}
return false;
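
Taken together, processInstrForSlow3OpLEA turns one slow LEA into at most two cheap instructions. Worked examples under the SNB+ cost model quoted in the header comment (register choices are illustrative only):

    // Destination aliases the index, scale 1: two ADDs (first branch).
    //   lea 0x8(%rax,%rcx,1), %rcx   =>  add %rax,%rcx ; add $0x8,%rcx
    // Inefficient base, efficient index, scale 1: swap base and index.
    //   lea 0x8(%rbp,%rdx,1), %rcx   =>  lea (%rdx,%rbp,1),%rcx ; add $0x8,%rcx
    // Both registers inefficient, scale 1, no offset: copy then add.
    //   lea (%rbp,%r13,1), %rax      =>  mov %rbp,%rax ; add %r13,%rax
    // Fallback: drop the base from the LEA and add it back.
    //   lea 0x10(%r13,%rsi,4), %rdi  =>  lea 0x10(,%rsi,4),%rdi ; add %r13,%rdi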
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 12a10bf3072f..c899f0fd5100 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -1178,8 +1178,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
break;
- if (ConstantSDNode
- *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1))) {
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
unsigned Val = CN->getZExtValue();
// Note that we handle x<<1 as (,x,2) rather than (x,x) here so
// that the base operand remains free for further matching. If
@@ -1187,15 +1186,14 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
// in MatchAddress turns (,x,2) into (x,x), which is cheaper.
if (Val == 1 || Val == 2 || Val == 3) {
AM.Scale = 1 << Val;
- SDValue ShVal = N.getNode()->getOperand(0);
+ SDValue ShVal = N.getOperand(0);
// Okay, we know that we have a scale by now. However, if the scaled
// value is an add of something and a constant, we can fold the
// constant into the disp field here.
if (CurDAG->isBaseWithConstantOffset(ShVal)) {
- AM.IndexReg = ShVal.getNode()->getOperand(0);
- ConstantSDNode *AddVal =
- cast<ConstantSDNode>(ShVal.getNode()->getOperand(1));
+ AM.IndexReg = ShVal.getOperand(0);
+ ConstantSDNode *AddVal = cast<ConstantSDNode>(ShVal.getOperand(1));
uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val;
if (!foldOffsetIntoAddress(Disp, AM))
return false;
@@ -1245,28 +1243,27 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
if (AM.BaseType == X86ISelAddressMode::RegBase &&
AM.Base_Reg.getNode() == nullptr &&
AM.IndexReg.getNode() == nullptr) {
- if (ConstantSDNode
- *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1)))
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
CN->getZExtValue() == 9) {
AM.Scale = unsigned(CN->getZExtValue())-1;
- SDValue MulVal = N.getNode()->getOperand(0);
+ SDValue MulVal = N.getOperand(0);
SDValue Reg;
// Okay, we know that we have a scale by now. However, if the scaled
// value is an add of something and a constant, we can fold the
// constant into the disp field here.
if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
- isa<ConstantSDNode>(MulVal.getNode()->getOperand(1))) {
- Reg = MulVal.getNode()->getOperand(0);
+ isa<ConstantSDNode>(MulVal.getOperand(1))) {
+ Reg = MulVal.getOperand(0);
ConstantSDNode *AddVal =
- cast<ConstantSDNode>(MulVal.getNode()->getOperand(1));
+ cast<ConstantSDNode>(MulVal.getOperand(1));
uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
if (foldOffsetIntoAddress(Disp, AM))
- Reg = N.getNode()->getOperand(0);
+ Reg = N.getOperand(0);
} else {
- Reg = N.getNode()->getOperand(0);
+ Reg = N.getOperand(0);
}
AM.IndexReg = AM.Base_Reg = Reg;
@@ -1289,7 +1286,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
// Test if the LHS of the sub can be folded.
X86ISelAddressMode Backup = AM;
- if (matchAddressRecursively(N.getNode()->getOperand(0), AM, Depth+1)) {
+ if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
AM = Backup;
break;
}
@@ -1300,7 +1297,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
}
int Cost = 0;
- SDValue RHS = Handle.getValue().getNode()->getOperand(1);
+ SDValue RHS = Handle.getValue().getOperand(1);
// If the RHS involves a register with multiple uses, this
// transformation incurs an extra mov, due to the neg instruction
// clobbering its operand.
@@ -1309,7 +1306,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
(RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
- RHS.getNode()->getOperand(0).getValueType() == MVT::i32))
+ RHS.getOperand(0).getValueType() == MVT::i32))
++Cost;
// If the base is a register with multiple uses, this
// transformation may save a mov.
@@ -2524,7 +2521,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
N0.getNode()->hasOneUse() &&
N0.getValueType() != MVT::i8 &&
X86::isZeroNode(N1)) {
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getNode()->getOperand(1));
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
if (!C) break;
// For example, convert "testl %eax, $8" to "testb %al, $8"
@@ -2532,7 +2529,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
(!(C->getZExtValue() & 0x80) ||
hasNoSignedComparisonUses(Node))) {
SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl, MVT::i8);
- SDValue Reg = N0.getNode()->getOperand(0);
+ SDValue Reg = N0.getOperand(0);
// On x86-32, only the ABCD registers have 8-bit subregisters.
if (!Subtarget->is64Bit()) {
@@ -2568,7 +2565,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
// Shift the immediate right by 8 bits.
SDValue ShiftedImm = CurDAG->getTargetConstant(C->getZExtValue() >> 8,
dl, MVT::i8);
- SDValue Reg = N0.getNode()->getOperand(0);
+ SDValue Reg = N0.getOperand(0);
// Put the value in an ABCD register.
const TargetRegisterClass *TRC;
@@ -2605,7 +2602,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
hasNoSignedComparisonUses(Node))) {
SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl,
MVT::i16);
- SDValue Reg = N0.getNode()->getOperand(0);
+ SDValue Reg = N0.getOperand(0);
// Extract the 16-bit subregister.
SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl,
@@ -2628,7 +2625,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
hasNoSignedComparisonUses(Node))) {
SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl,
MVT::i32);
- SDValue Reg = N0.getNode()->getOperand(0);
+ SDValue Reg = N0.getOperand(0);
// Extract the 32-bit subregister.
SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_32bit, dl,
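
The remaining ISelDAGToDAG hunks are mechanical: SDValue::getOperand forwards to the underlying node, so both spellings below yield the same operand, and the patch simply drops the redundant getNode():

    SDValue Before = N.getNode()->getOperand(1);
    SDValue After  = N.getOperand(1);   // identical SDValue, less noise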
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 9ee2234595f9..11c08292518a 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -40,6 +40,7 @@
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
@@ -79,6 +80,17 @@ static cl::opt<int> ExperimentalPrefLoopAlignment(
" of the loop header PC will be 0)."),
cl::Hidden);
+/// Call this when the user attempts to do something unsupported, like
+/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
+/// report_fatal_error, so calling code should attempt to recover without
+/// crashing.
+static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
+ const char *Msg) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ DAG.getContext()->diagnose(
+ DiagnosticInfoUnsupported(*MF.getFunction(), Msg, dl.getDebugLoc()));
+}
+
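
Editorial aside on the mechanism above: DiagnosticInfoUnsupported is routed through the LLVMContext's diagnostic handler rather than aborting, which is what makes the FP0 recovery paths below possible. A minimal handler sketch, assuming the handler-registration API of this era (it has since changed shape); the names here are illustrative, not part of the patch:

#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

// Print the diagnostic instead of crashing; compilation can then fail cleanly.
static void reportUnsupported(const DiagnosticInfo &DI, void * /*Ctx*/) {
  DiagnosticPrinterRawOStream DP(errs());
  errs() << "error: ";
  DI.print(DP);
  errs() << "\n";
}
// Registration (era-specific API): Ctx.setDiagnosticHandler(reportUnsupported);
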
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
const X86Subtarget &STI)
: TargetLowering(TM), Subtarget(STI) {
@@ -1381,7 +1393,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
- setOperationAction(ISD::VSELECT, VT, Legal);
+ setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
@@ -1445,8 +1457,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
- setOperationAction(ISD::VSELECT, MVT::v32i16, Legal);
- setOperationAction(ISD::VSELECT, MVT::v64i8, Legal);
setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
@@ -1479,7 +1489,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
- setOperationAction(ISD::VSELECT, VT, Legal);
+ setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::ABS, VT, Legal);
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
@@ -2207,15 +2217,17 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
// or SSE or MMX vectors.
if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
- (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
- report_fatal_error("SSE register return with SSE disabled");
+ (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
+ errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
+ VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
+ } else if (ValVT == MVT::f64 &&
+ (Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
+ // Likewise we can't return F64 values with SSE1 only. gcc does so, but
+ // llvm-gcc has never done it right and no one has noticed, so this
+ // should be OK for now.
+ errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
+ VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
}
- // Likewise we can't return F64 values with SSE1 only. gcc does so, but
- // llvm-gcc has never done it right and no one has noticed, so this
- // should be OK for now.
- if (ValVT == MVT::f64 &&
- (Subtarget.is64Bit() && !Subtarget.hasSSE2()))
- report_fatal_error("SSE2 register return with SSE2 disabled");
// Returns in ST0/ST1 are handled specially: these are pushed as operands to
// the RET instruction and handled by the FP Stackifier.
@@ -2528,7 +2540,8 @@ SDValue X86TargetLowering::LowerCallResult(
// If this is x86-64, and we disabled SSE, we can't return FP values
if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
- report_fatal_error("SSE register return with SSE disabled");
+ errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
+ VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
}
// If we prefer to use the value in xmm registers, copy it out as f80 and
@@ -3415,8 +3428,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
}
if (!IsSibcall)
- Chain = DAG.getCALLSEQ_START(
- Chain, DAG.getIntPtrConstant(NumBytesToPush, dl, true), dl);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
+ NumBytes - NumBytesToPush, dl);
SDValue RetAddrFrIdx;
// Load return address for tail calls.
@@ -6912,9 +6925,9 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
// for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
if (IsSplat)
- return DAG.getNode(ISD::SELECT, dl, VT, Op.getOperand(SplatIdx),
- DAG.getConstant(1, dl, VT),
- DAG.getConstant(0, dl, VT));
+ return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx),
+ DAG.getConstant(1, dl, VT),
+ DAG.getConstant(0, dl, VT));
// insert elements one by one
SDValue DstVec;
@@ -8386,9 +8399,9 @@ static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
Subtarget, DAG, DL);
SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
- return DAG.getNode(ISD::VSELECT, DL, VT, VMask,
- DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
- ZeroVector);
+ return DAG.getSelect(DL, VT, VMask,
+ DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
+ ZeroVector);
}
static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
@@ -8748,8 +8761,9 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
V1 = DAG.getBitcast(BlendVT, V1);
V2 = DAG.getBitcast(BlendVT, V2);
return DAG.getBitcast(
- VT, DAG.getNode(ISD::VSELECT, DL, BlendVT,
- DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2));
+ VT,
+ DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
+ V1, V2));
}
case MVT::v16f32:
case MVT::v8f64:
@@ -13817,6 +13831,11 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
return SDValue();
+  // If this VSELECT has a vector of i1 as a mask, it will be directly matched
+ // with patterns on the mask registers on AVX-512.
+ if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1)
+ return Op;
+
// Try to lower this to a blend-style vector shuffle. This can handle all
// constant condition cases.
if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
@@ -13826,10 +13845,30 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
if (!Subtarget.hasSSE41())
return SDValue();
+ SDLoc dl(Op);
+ MVT VT = Op.getSimpleValueType();
+
+ // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
+ // into an i1 condition so that we can use the mask-based 512-bit blend
+ // instructions.
+ if (VT.getSizeInBits() == 512) {
+ SDValue Cond = Op.getOperand(0);
+ // The vNi1 condition case should be handled above as it can be trivially
+ // lowered.
+ assert(Cond.getValueType().getScalarSizeInBits() ==
+ VT.getScalarSizeInBits() &&
+ "Should have a size-matched integer condition!");
+ // Build a mask by testing the condition against itself (tests for zero).
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+ SDValue Mask = DAG.getNode(X86ISD::TESTM, dl, MaskVT, Cond, Cond);
+ // Now return a new VSELECT using the mask.
+ return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2));
+ }
+
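
A condensed sketch of the 512-bit branch above, mirroring the hunk rather than adding behavior: the integer condition is known to be lane-wise 0 or -1, so testing it against itself with TESTM yields the vXi1 mask the AVX-512 blend patterns expect:

// Sketch of the transform performed above (names from this patch's context).
static SDValue lowerWideVSelect(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
                                SDValue Cond, SDValue T, SDValue F) {
  MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
  // TESTM sets lane i iff (Cond[i] & Cond[i]) != 0, i.e. iff Cond[i] != 0.
  SDValue Mask = DAG.getNode(X86ISD::TESTM, dl, MaskVT, Cond, Cond);
  return DAG.getSelect(dl, VT, Mask, T, F);
}
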
// Only some types will be legal on some subtargets. If we can emit a legal
  // VSELECT-matching blend, return Op, but if we need to expand, return
// a null value.
- switch (Op.getSimpleValueType().SimpleTy) {
+ switch (VT.SimpleTy) {
default:
// Most of the vector types have blends past SSE4.1.
return Op;
@@ -14725,7 +14764,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
// location.
SDValue Chain = DAG.getEntryNode();
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, DL, true), DL);
+ Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
SDValue Args[] = { Chain, Offset };
Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
@@ -15348,8 +15387,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
// Get a pointer to FF if the sign bit was set, or to 0 otherwise.
SDValue Zero = DAG.getIntPtrConstant(0, dl);
SDValue Four = DAG.getIntPtrConstant(4, dl);
- SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
- Zero, Four);
+ SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
// Load the value out, extending it from f32 to f80.
@@ -15621,7 +15659,7 @@ static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
SDValue Zero =
DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);
- SDValue SelectedVal = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero);
+ SDValue SelectedVal = DAG.getSelect(DL, ExtVT, In, One, Zero);
if (VT == ExtVT)
return SelectedVal;
return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal);
@@ -16713,7 +16751,7 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
if (BitWidth > AndBitWidth) {
KnownBits Known;
DAG.computeKnownBits(Op0, Known);
- if (Known.Zero.countLeadingOnes() < BitWidth - AndBitWidth)
+ if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
return SDValue();
}
LHS = Op1;
@@ -17455,7 +17493,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
VCmp = DAG.getBitcast(VCmpVT, VCmp);
- SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2);
+ SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
VSel, DAG.getIntPtrConstant(0, DL));
@@ -17483,9 +17521,8 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
Op2Scalar = Op2.getOperand(0);
if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
- SDValue newSelect = DAG.getNode(ISD::SELECT, DL,
- Op1Scalar.getValueType(),
- Cond, Op1Scalar, Op2Scalar);
+ SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
+ Op1Scalar, Op2Scalar);
if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
return DAG.getBitcast(VT, newSelect);
SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
@@ -17500,8 +17537,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
- SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::v8i1,
- Cond, Op1, Op2);
+ SDValue newSelect = DAG.getSelect(DL, MVT::v8i1, Cond, Op1, Op2);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
}
@@ -17770,7 +17806,7 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
} else {
SDValue NegOne = getOnesVector(ExtVT, DAG, dl);
SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl);
- V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
+ V = DAG.getSelect(dl, ExtVT, In, NegOne, Zero);
if (ExtVT == VT)
return V;
}
@@ -18572,7 +18608,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
// Chain the dynamic stack allocation so that it doesn't modify the stack
// pointer when other instructions are using the stack.
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl);
+ Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
bool Is64Bit = Subtarget.is64Bit();
MVT SPTy = getPointerTy(DAG.getDataLayout());
@@ -19021,8 +19057,10 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
SDValue PreservedSrc,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- if (isAllOnesConstant(Mask))
- return Op;
+
+ if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
+ if (MaskConst->getZExtValue() & 0x1)
+ return Op;
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
@@ -19081,7 +19119,7 @@ static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
// registration, or the .set_setframe offset.
MCSymbol *OffsetSym =
MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
- GlobalValue::getRealLinkageName(Fn->getName()));
+ GlobalValue::dropLLVMManglingEscape(Fn->getName()));
SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
SDValue ParentFrameOffset =
DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
@@ -19683,12 +19721,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
DAG.getIntPtrConstant(0, dl));
return DAG.getBitcast(Op.getValueType(), Res);
}
- case CONVERT_MASK_TO_VEC: {
- SDValue Mask = Op.getOperand(1);
- MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
- SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
- return DAG.getNode(IntrData->Opc0, dl, VT, VMask);
- }
case BRCST_SUBVEC_TO_VEC: {
SDValue Src = Op.getOperand(1);
SDValue Passthru = Op.getOperand(2);
@@ -19932,7 +19964,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
SDValue Op1 = Op.getOperand(1);
auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
- GlobalValue::getRealLinkageName(Fn->getName()));
+ GlobalValue::dropLLVMManglingEscape(Fn->getName()));
// Generate a simple absolute symbol reference. This intrinsic is only
// supported on 32-bit Windows, which isn't PIC.
@@ -21741,6 +21773,14 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
SDValue Ex = DAG.getBitcast(ExVT, R);
+ // ashr(R, 63) === cmp_slt(R, 0)
+ if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
+ assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
+ "Unsupported PCMPGT op");
+ return DAG.getNode(X86ISD::PCMPGT, dl, VT,
+ getZeroVector(VT, Subtarget, DAG, dl), R);
+ }
+
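
Why the ShiftAmt == 63 fold is sound: an arithmetic right shift by 63 broadcasts an i64 lane's sign bit, producing 0 or -1, which is exactly what PCMPGT with zero on the left produces. A scalar sanity check, assuming arithmetic >> on signed values (true on the targets involved):

#include <cassert>
#include <cstdint>

int main() {
  const int64_t Vals[] = {INT64_MIN, -1, 0, 1, INT64_MAX};
  for (int64_t X : Vals)
    assert((X >> 63) == (0 > X ? -1 : 0)); // ashr(X, 63) == cmp_sgt(0, X)
  return 0;
}
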
if (ShiftAmt >= 32) {
// Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
SDValue Upper =
@@ -21839,10 +21879,19 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
}
// Special case in 32-bit mode, where i64 is expanded into high and low parts.
+ // TODO: Replace constant extraction with getTargetConstantBitsFromNode.
if (!Subtarget.is64Bit() && !Subtarget.hasXOP() &&
(VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
(Subtarget.hasAVX512() && VT == MVT::v8i64))) {
+    // AVX1 targets may be extracting a 128-bit vector from a 256-bit constant.
+ unsigned SubVectorScale = 1;
+ if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+ SubVectorScale =
+ Amt.getOperand(0).getValueSizeInBits() / Amt.getValueSizeInBits();
+ Amt = Amt.getOperand(0);
+ }
+
// Peek through any splat that was introduced for i64 shift vectorization.
int SplatIndex = -1;
if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
@@ -21859,7 +21908,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
Amt = Amt.getOperand(0);
unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
- VT.getVectorNumElements();
+ (SubVectorScale * VT.getVectorNumElements());
unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
uint64_t ShiftAmt = 0;
unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
@@ -22233,23 +22282,21 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
V1 = DAG.getBitcast(VT, V1);
Sel = DAG.getBitcast(VT, Sel);
Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel);
- return DAG.getBitcast(SelVT,
- DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1));
+ return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
} else if (Subtarget.hasSSE41()) {
// On SSE41 targets we make use of the fact that VSELECT lowers
// to PBLENDVB which selects bytes based just on the sign bit.
V0 = DAG.getBitcast(VT, V0);
V1 = DAG.getBitcast(VT, V1);
Sel = DAG.getBitcast(VT, Sel);
- return DAG.getBitcast(SelVT,
- DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1));
+ return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
}
// On pre-SSE41 targets we test for the sign bit by comparing to
// zero - a negative value will set all bits of the lanes to true
// and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
- return DAG.getNode(ISD::VSELECT, dl, SelVT, C, V0, V1);
+ return DAG.getSelect(dl, SelVT, C, V0, V1);
};
// Turn 'a' into a mask suitable for VSELECT: a = a << 5;
@@ -22371,15 +22418,14 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
V0 = DAG.getBitcast(ExtVT, V0);
V1 = DAG.getBitcast(ExtVT, V1);
Sel = DAG.getBitcast(ExtVT, Sel);
- return DAG.getBitcast(
- VT, DAG.getNode(ISD::VSELECT, dl, ExtVT, Sel, V0, V1));
+ return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
}
// On pre-SSE41 targets we splat the sign bit - a negative value will
// set all bits of the lanes to true and VSELECT uses that in
// its OR(AND(V0,C),AND(V1,~C)) lowering.
SDValue C =
DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
- return DAG.getNode(ISD::VSELECT, dl, VT, C, V0, V1);
+ return DAG.getSelect(dl, VT, C, V0, V1);
};
// Turn 'a' into a mask suitable for VSELECT: a = a << 12;
@@ -23296,9 +23342,8 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
SDValue Callee =
DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
- Type *RetTy = isF64
- ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
- : (Type*)VectorType::get(ArgTy, 4);
+ Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
+ : (Type *)VectorType::get(ArgTy, 4);
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl)
@@ -25779,7 +25824,7 @@ X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
// Emit CALLSEQ_START right before the instruction.
unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
MachineInstrBuilder CallseqStart =
- BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0);
+ BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
// Emit CALLSEQ_END right after the instruction.
@@ -26517,7 +26562,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case TargetOpcode::STACKMAP:
case TargetOpcode::PATCHPOINT:
return emitPatchPoint(MI, BB);
-
+
case TargetOpcode::PATCHABLE_EVENT_CALL:
// Do nothing here, handle in xray instrumentation pass.
return BB;
@@ -29532,7 +29577,7 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
DAG.getAllOnesConstant(DL, CondVT));
// Vselect cond, op1, op2 = Vselect not(cond), op2, op1
- return DAG.getNode(ISD::VSELECT, DL, VT, CondNew, RHS, LHS);
+ return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
}
// To use the condition operand as a bitwise mask, it must have elements that
@@ -30015,7 +30060,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
Cond.getOperand(0), Cond.getOperand(1), NewCC);
- return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
+ return DAG.getSelect(DL, VT, Cond, LHS, RHS);
}
}
}
@@ -31561,20 +31606,22 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
// (sub (xor X, M), M)
static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- assert(N->getOpcode() == ISD::OR);
+ assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
- if (!((VT == MVT::v2i64) || (VT == MVT::v4i64 && Subtarget.hasInt256())))
+ if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
+ (VT.is256BitVector() && Subtarget.hasInt256())))
return SDValue();
- assert(Subtarget.hasSSE2() && "Unexpected i64 vector without SSE2!");
- // Canonicalize pandn to RHS
- if (N0.getOpcode() == X86ISD::ANDNP)
+ // Canonicalize AND to LHS.
+ if (N1.getOpcode() == ISD::AND)
std::swap(N0, N1);
+  // TODO: Attempt to match against AND(XOR(-1,X),Y) as well, since waiting
+  // for the ANDNP combine allows other combines to happen that prevent
+  // matching.
if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
return SDValue();
@@ -31596,20 +31643,10 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
Y = peekThroughBitcasts(Y);
EVT MaskVT = Mask.getValueType();
-
- // Validate that the Mask operand is a vector sra node.
- // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
- // there is no psrai.b
unsigned EltBits = MaskVT.getScalarSizeInBits();
- unsigned SraAmt = ~0;
- if (Mask.getOpcode() == ISD::SRA) {
- if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
- if (auto *AmtConst = AmtBV->getConstantSplatNode())
- SraAmt = AmtConst->getZExtValue();
- } else if (Mask.getOpcode() == X86ISD::VSRAI)
- SraAmt = Mask.getConstantOperandVal(1);
- if ((SraAmt + 1) != EltBits)
+ // TODO: Attempt to handle floating point cases as well?
+ if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
return SDValue();
SDLoc DL(N);
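
The replaced check insisted the mask literally be an sra by EltBits-1; ComputeNumSignBits(Mask) == EltBits asks the same question semantically (every bit of each element equals the sign bit, so each lane is 0 or -1) while accepting any mask provenance. A scalar analogue of the blend identity this relies on, as a hedged illustration:

#include <cassert>
#include <cstdint>

// With M all-zeros or all-ones, the OR/AND blend selects X or Y wholesale.
static int32_t blend(int32_t M, int32_t X, int32_t Y) {
  return (X & M) | (Y & ~M);
}

int main() {
  assert(blend(-1, 7, 9) == 7); // mask of all sign bits set: pick X
  assert(blend(0, 7, 9) == 9);  // mask clear: pick Y
  return 0;
}
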
@@ -31630,7 +31667,8 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
// (add (xor X, M), (and M, 1))
// And further to:
// (sub (xor X, M), M)
- if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
+ if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT &&
+ DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) {
auto IsNegV = [](SDNode *N, SDValue V) {
return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
@@ -31642,9 +31680,6 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
V = Y;
if (V) {
- if (EltBits != 8 && EltBits != 16 && EltBits != 32)
- return SDValue();
-
SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
SDValue SubOp2 = Mask;
@@ -31661,8 +31696,8 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
if (V == Y)
std::swap(SubOp1, SubOp2);
- return DAG.getBitcast(VT,
- DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2));
+ SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
+ return DAG.getBitcast(VT, Res);
}
}
@@ -31675,7 +31710,7 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
X = DAG.getBitcast(BlendVT, X);
Y = DAG.getBitcast(BlendVT, Y);
Mask = DAG.getBitcast(BlendVT, Mask);
- Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
+ Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
return DAG.getBitcast(VT, Mask);
}
@@ -33655,8 +33690,7 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
// If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
// are NaN, the NaN value of Op1 is the result.
- auto SelectOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT;
- return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax);
+ return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
}
/// Do target-specific dag combines on X86ISD::ANDNP nodes.
@@ -33949,7 +33983,7 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
if (InVT == MVT::i1) {
SDValue Zero = DAG.getConstant(0, DL, VT);
SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
- return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero);
+ return DAG.getSelect(DL, VT, N0, AllOnes, Zero);
}
return SDValue();
}
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index 3dc673e3c35a..d003d027ddb9 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -43,7 +43,8 @@ let hasSideEffects = 0, isNotDuplicable = 1, Uses = [ESP] in
// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
// sub / add which can clobber EFLAGS.
let Defs = [ESP, EFLAGS], Uses = [ESP] in {
-def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs),
+ (ins i32imm:$amt1, i32imm:$amt2, i32imm:$amt3),
"#ADJCALLSTACKDOWN",
[]>,
Requires<[NotLP64]>;
@@ -52,8 +53,8 @@ def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
[(X86callseq_end timm:$amt1, timm:$amt2)]>,
Requires<[NotLP64]>;
}
-def : Pat<(X86callseq_start timm:$amt1),
- (ADJCALLSTACKDOWN32 i32imm:$amt1, 0)>, Requires<[NotLP64]>;
+def : Pat<(X86callseq_start timm:$amt1, timm:$amt2),
+ (ADJCALLSTACKDOWN32 i32imm:$amt1, i32imm:$amt2, 0)>, Requires<[NotLP64]>;
// ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into
@@ -62,7 +63,8 @@ def : Pat<(X86callseq_start timm:$amt1),
// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
// sub / add which can clobber EFLAGS.
let Defs = [RSP, EFLAGS], Uses = [RSP] in {
-def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs),
+ (ins i32imm:$amt1, i32imm:$amt2, i32imm:$amt3),
"#ADJCALLSTACKDOWN",
[]>,
Requires<[IsLP64]>;
@@ -71,8 +73,8 @@ def ADJCALLSTACKUP64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
[(X86callseq_end timm:$amt1, timm:$amt2)]>,
Requires<[IsLP64]>;
}
-def : Pat<(X86callseq_start timm:$amt1),
- (ADJCALLSTACKDOWN64 i32imm:$amt1, 0)>, Requires<[IsLP64]>;
+def : Pat<(X86callseq_start timm:$amt1, timm:$amt2),
+ (ADJCALLSTACKDOWN64 i32imm:$amt1, i32imm:$amt2, 0)>, Requires<[IsLP64]>;
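
For orientation (a recap of the interface change these patterns track, not new behavior): X86callseq_start/CALLSEQ_START now carries two immediates, and the C++ call sites updated elsewhere in this patch pass them separately. A hedged usage sketch:

// Mirrors the getCALLSEQ_START call sites updated above; NumBytes is the
// caller's outgoing-argument size, the second immediate any out-of-line
// adjustment (0 at simple sites).
static SDValue startCallSequence(SelectionDAG &DAG, SDValue Chain,
                                 uint64_t NumBytes, const SDLoc &dl) {
  return DAG.getCALLSEQ_START(Chain, NumBytes, /*OutOfLineBytes=*/0, dl);
}
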
// x86-64 va_start lowering magic.
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 888daa275265..092ceb207ada 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -5729,6 +5729,44 @@ static X86::CondCode getSwappedCondition(X86::CondCode CC) {
}
}
+std::pair<X86::CondCode, bool>
+X86::getX86ConditionCode(CmpInst::Predicate Predicate) {
+ X86::CondCode CC = X86::COND_INVALID;
+ bool NeedSwap = false;
+ switch (Predicate) {
+ default: break;
+ // Floating-point Predicates
+ case CmpInst::FCMP_UEQ: CC = X86::COND_E; break;
+ case CmpInst::FCMP_OLT: NeedSwap = true; LLVM_FALLTHROUGH;
+ case CmpInst::FCMP_OGT: CC = X86::COND_A; break;
+ case CmpInst::FCMP_OLE: NeedSwap = true; LLVM_FALLTHROUGH;
+ case CmpInst::FCMP_OGE: CC = X86::COND_AE; break;
+ case CmpInst::FCMP_UGT: NeedSwap = true; LLVM_FALLTHROUGH;
+ case CmpInst::FCMP_ULT: CC = X86::COND_B; break;
+ case CmpInst::FCMP_UGE: NeedSwap = true; LLVM_FALLTHROUGH;
+ case CmpInst::FCMP_ULE: CC = X86::COND_BE; break;
+ case CmpInst::FCMP_ONE: CC = X86::COND_NE; break;
+ case CmpInst::FCMP_UNO: CC = X86::COND_P; break;
+ case CmpInst::FCMP_ORD: CC = X86::COND_NP; break;
+ case CmpInst::FCMP_OEQ: LLVM_FALLTHROUGH;
+ case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break;
+
+ // Integer Predicates
+ case CmpInst::ICMP_EQ: CC = X86::COND_E; break;
+ case CmpInst::ICMP_NE: CC = X86::COND_NE; break;
+ case CmpInst::ICMP_UGT: CC = X86::COND_A; break;
+ case CmpInst::ICMP_UGE: CC = X86::COND_AE; break;
+ case CmpInst::ICMP_ULT: CC = X86::COND_B; break;
+ case CmpInst::ICMP_ULE: CC = X86::COND_BE; break;
+ case CmpInst::ICMP_SGT: CC = X86::COND_G; break;
+ case CmpInst::ICMP_SGE: CC = X86::COND_GE; break;
+ case CmpInst::ICMP_SLT: CC = X86::COND_L; break;
+ case CmpInst::ICMP_SLE: CC = X86::COND_LE; break;
+ }
+
+ return std::make_pair(CC, NeedSwap);
+}
+
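
Typical use of the new helper, as in selectCmp later in this patch; the helper may request an operand swap so that ordered/unordered FP predicates map onto a single x86 condition code. Register names here are placeholders:

#include <tuple>
#include <utility>

// e.g. FCMP_OLT yields {COND_A, NeedSwap=true}: a < b (ordered) becomes b > a.
static X86::CondCode mapPredicate(CmpInst::Predicate Pred, unsigned &LHSReg,
                                  unsigned &RHSReg) {
  X86::CondCode CC;
  bool NeedSwap;
  std::tie(CC, NeedSwap) = X86::getX86ConditionCode(Pred);
  if (NeedSwap)
    std::swap(LHSReg, RHSReg);
  return CC;
}
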
/// Return a set opcode for the given condition and
/// whether it has memory operand.
unsigned X86::getSETFromCond(CondCode CC, bool HasMemoryOperand) {
@@ -7589,6 +7627,13 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
case X86::AVX2_SETALLONES:
return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
+ case X86::AVX1_SETALLONES: {
+ unsigned Reg = MIB->getOperand(0).getReg();
+ // VCMPPSYrri with an immediate 0xf should produce VCMPTRUEPS.
+ MIB->setDesc(get(X86::VCMPPSYrri));
+ MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xf);
+ return true;
+ }
case X86::AVX512_512_SETALLONES: {
unsigned Reg = MIB->getOperand(0).getReg();
MIB->setDesc(get(X86::VPTERNLOGDZrri));
@@ -8477,6 +8522,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
Alignment = 64;
break;
case X86::AVX2_SETALLONES:
+ case X86::AVX1_SETALLONES:
case X86::AVX_SET0:
case X86::AVX512_256_SET0:
Alignment = 32;
@@ -8522,6 +8568,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
case X86::V_SET0:
case X86::V_SETALLONES:
case X86::AVX2_SETALLONES:
+ case X86::AVX1_SETALLONES:
case X86::AVX_SET0:
case X86::AVX512_128_SET0:
case X86::AVX512_256_SET0:
@@ -8563,13 +8610,14 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES)
Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()),16);
else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0 ||
- Opc == X86::AVX512_256_SET0)
+ Opc == X86::AVX512_256_SET0 || Opc == X86::AVX1_SETALLONES)
Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 8);
else
Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4);
bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES ||
- Opc == X86::AVX512_512_SETALLONES);
+ Opc == X86::AVX512_512_SETALLONES ||
+ Opc == X86::AVX1_SETALLONES);
const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) :
Constant::getNullValue(Ty);
unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index 38567831b3a4..e64876073ccf 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -64,6 +64,10 @@ enum CondCode {
// Turn condition code into conditional branch opcode.
unsigned GetCondBranchFromCond(CondCode CC);
+/// \brief Return a pair of the condition code for the given predicate and
+/// whether the instruction operands should be swapped to match the condition
+/// code.
+std::pair<CondCode, bool> getX86ConditionCode(CmpInst::Predicate Predicate);
+
/// \brief Return a set opcode for the given condition and whether it has
/// a memory operand.
unsigned getSETFromCond(CondCode CC, bool HasMemoryOperand = false);
@@ -186,6 +190,8 @@ public:
/// setup..destroy sequence (e.g. by pushes, or inside the callee).
int64_t getFrameAdjustment(const MachineInstr &I) const {
assert(isFrameInstr(I));
+ if (isFrameSetup(I))
+ return I.getOperand(2).getImm();
return I.getOperand(1).getImm();
}
@@ -193,7 +199,10 @@ public:
/// instruction.
void setFrameAdjustment(MachineInstr &I, int64_t V) const {
assert(isFrameInstr(I));
- I.getOperand(1).setImm(V);
+ if (isFrameSetup(I))
+ I.getOperand(2).setImm(V);
+ else
+ I.getOperand(1).setImm(V);
}
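
Summarizing the operand layout these accessors now encode (per the ADJCALLSTACKDOWN changes elsewhere in this patch; the helper below is an illustrative restatement, not new API):

// ADJCALLSTACKDOWN{32,64} amt1, amt2, amt3 -> frame setup, adjustment in op 2
// ADJCALLSTACKUP{32,64}   amt1, amt2       -> frame destroy, adjustment in op 1
static int64_t frameAdjustmentOf(const X86InstrInfo &TII,
                                 const MachineInstr &I) {
  return TII.isFrameSetup(I) ? I.getOperand(2).getImm()
                             : I.getOperand(1).getImm();
}
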
/// getSPAdjust - This returns the stack pointer adjustment made by
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 902b0c2c04e3..4d7d8ece92d9 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -84,7 +84,8 @@ def SDTLockBinaryArithWithFlags : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
def SDTX86Ret : SDTypeProfile<0, -1, [SDTCisVT<0, i32>]>;
-def SDT_X86CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
+def SDT_X86CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>]>;
def SDT_X86CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>,
SDTCisVT<1, i32>]>;
@@ -2351,6 +2352,38 @@ let Predicates = [HasBMI2] in {
def : Pat<(and (loadi64 addr:$src), (add (shl 1, GR8:$lz), -1)),
(BZHI64rm addr:$src,
(INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
+
+ // x & (-1 >> (32 - y))
+ def : Pat<(and GR32:$src, (srl -1, (i8 (trunc (sub 32, GR32:$lz))))),
+ (BZHI32rr GR32:$src, GR32:$lz)>;
+ def : Pat<(and (loadi32 addr:$src), (srl -1, (i8 (trunc (sub 32, GR32:$lz))))),
+ (BZHI32rm addr:$src, GR32:$lz)>;
+
+ // x & (-1 >> (64 - y))
+ def : Pat<(and GR64:$src, (srl -1, (i8 (trunc (sub 64, GR32:$lz))))),
+ (BZHI64rr GR64:$src,
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>;
+ def : Pat<(and (loadi64 addr:$src), (srl -1, (i8 (trunc (sub 64, GR32:$lz))))),
+ (BZHI64rm addr:$src,
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>;
+
+ // x << (32 - y) >> (32 - y)
+ def : Pat<(srl (shl GR32:$src, (i8 (trunc (sub 32, GR32:$lz)))),
+ (i8 (trunc (sub 32, GR32:$lz)))),
+ (BZHI32rr GR32:$src, GR32:$lz)>;
+ def : Pat<(srl (shl (loadi32 addr:$src), (i8 (trunc (sub 32, GR32:$lz)))),
+ (i8 (trunc (sub 32, GR32:$lz)))),
+ (BZHI32rm addr:$src, GR32:$lz)>;
+
+ // x << (64 - y) >> (64 - y)
+ def : Pat<(srl (shl GR64:$src, (i8 (trunc (sub 64, GR32:$lz)))),
+ (i8 (trunc (sub 64, GR32:$lz)))),
+ (BZHI64rr GR64:$src,
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>;
+ def : Pat<(srl (shl (loadi64 addr:$src), (i8 (trunc (sub 64, GR32:$lz)))),
+ (i8 (trunc (sub 64, GR32:$lz)))),
+ (BZHI64rm addr:$src,
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>;
} // HasBMI2
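
Source-level shape of the idioms the new BZHI patterns match, as a hedged example: both forms keep the low Y bits of X, which BMI2 encodes as a single bzhi (assuming 1 <= Y <= 32, so no shift amount reaches the type width):

#include <cstdint>

uint32_t keepLowViaMask(uint32_t X, uint32_t Y) {  // x & (-1 >> (32 - y))
  return X & (0xFFFFFFFFu >> (32 - Y));
}

uint32_t keepLowViaShifts(uint32_t X, uint32_t Y) { // x << (32-y) >> (32-y)
  return (X << (32 - Y)) >> (32 - Y);
}
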
let Predicates = [HasBMI] in {
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 48da2fa607af..f73d85e7e01b 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -486,6 +486,10 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
isPseudo = 1, SchedRW = [WriteZero] in {
def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
[(set VR128:$dst, (v4i32 immAllOnesV))]>;
+ let Predicates = [HasAVX1Only, OptForMinSize] in {
+  def AVX1_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
+ [(set VR256:$dst, (v8i32 immAllOnesV))]>;
+ }
let Predicates = [HasAVX2] in
def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
[(set VR256:$dst, (v8i32 immAllOnesV))]>;
@@ -7755,14 +7759,12 @@ def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
[]>, Sched<[WriteFShuffleLd, ReadAfterLd]>, VEX_4V, VEX_L;
}
-
-// Without AVX2 we need to concat two v4i32 V_SETALLONES to create a 256-bit
-// all ones value.
-let Predicates = [HasAVX1Only] in
-def : Pat<(v8i32 immAllOnesV),
- (VINSERTF128rr
- (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), (V_SETALLONES), sub_xmm),
- (V_SETALLONES), 1)>;
+// To create a 256-bit all-ones value, we should produce VCMPTRUEPS
+// with a YMM register containing zero.
+// FIXME: Avoid producing vxorps to clear the fake inputs.
+let Predicates = [HasAVX1Only] in {
+def : Pat<(v8i32 immAllOnesV), (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>;
+}
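
An intrinsics-level view of the change, hedged: a 256-bit all-ones constant on an AVX1-only target previously lowered to a 128-bit pcmpeq glued together with vinsertf128; with this pattern it may instead become vxorps + vcmptrueps (compare predicate 0xf = "always true"):

#include <immintrin.h>

// On -mavx (no AVX2), this constant may now select to:
//   vxorps     %ymm0, %ymm0, %ymm0
//   vcmptrueps %ymm0, %ymm0, %ymm0
__m256i allOnes256() { return _mm256_set1_epi32(-1); }
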
multiclass vinsert_lowering<string InstrStr, ValueType From, ValueType To,
PatFrag memop_frag> {
diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp
index d65eb1de8d09..de58d719acb4 100644
--- a/lib/Target/X86/X86InstructionSelector.cpp
+++ b/lib/Target/X86/X86InstructionSelector.cpp
@@ -56,13 +56,9 @@ private:
bool selectImpl(MachineInstr &I) const;
// TODO: remove once supported by Tablegen-erated instruction selection.
- unsigned getFAddOp(LLT &Ty, const RegisterBank &RB) const;
- unsigned getFSubOp(LLT &Ty, const RegisterBank &RB) const;
unsigned getLoadStoreOp(LLT &Ty, const RegisterBank &RB, unsigned Opc,
uint64_t Alignment) const;
- bool selectBinaryOp(MachineInstr &I, MachineRegisterInfo &MRI,
- MachineFunction &MF) const;
bool selectLoadStoreOp(MachineInstr &I, MachineRegisterInfo &MRI,
MachineFunction &MF) const;
bool selectFrameIndexOrGep(MachineInstr &I, MachineRegisterInfo &MRI,
@@ -71,6 +67,10 @@ private:
MachineFunction &MF) const;
bool selectTrunc(MachineInstr &I, MachineRegisterInfo &MRI,
MachineFunction &MF) const;
+ bool selectZext(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+ bool selectCmp(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
const X86TargetMachine &TM;
const X86Subtarget &STI;
@@ -226,13 +226,11 @@ bool X86InstructionSelector::select(MachineInstr &I) const {
"Generic instruction has unexpected implicit operands\n");
if (selectImpl(I))
- return true;
+ return true;
DEBUG(dbgs() << " C++ instruction selection: "; I.print(dbgs()));
// TODO: This should be implemented by tblgen.
- if (selectBinaryOp(I, MRI, MF))
- return true;
if (selectLoadStoreOp(I, MRI, MF))
return true;
if (selectFrameIndexOrGep(I, MRI, MF))
@@ -241,109 +239,14 @@ bool X86InstructionSelector::select(MachineInstr &I) const {
return true;
if (selectTrunc(I, MRI, MF))
return true;
+ if (selectZext(I, MRI, MF))
+ return true;
+ if (selectCmp(I, MRI, MF))
+ return true;
return false;
}
-unsigned X86InstructionSelector::getFAddOp(LLT &Ty,
- const RegisterBank &RB) const {
-
- if (X86::VECRRegBankID != RB.getID())
- return TargetOpcode::G_FADD;
-
- if (Ty == LLT::scalar(32)) {
- if (STI.hasAVX512()) {
- return X86::VADDSSZrr;
- } else if (STI.hasAVX()) {
- return X86::VADDSSrr;
- } else if (STI.hasSSE1()) {
- return X86::ADDSSrr;
- }
- } else if (Ty == LLT::scalar(64)) {
- if (STI.hasAVX512()) {
- return X86::VADDSDZrr;
- } else if (STI.hasAVX()) {
- return X86::VADDSDrr;
- } else if (STI.hasSSE2()) {
- return X86::ADDSDrr;
- }
- } else if (Ty == LLT::vector(4, 32)) {
- if ((STI.hasAVX512()) && (STI.hasVLX())) {
- return X86::VADDPSZ128rr;
- } else if (STI.hasAVX()) {
- return X86::VADDPSrr;
- } else if (STI.hasSSE1()) {
- return X86::ADDPSrr;
- }
- }
-
- return TargetOpcode::G_FADD;
-}
-
-unsigned X86InstructionSelector::getFSubOp(LLT &Ty,
- const RegisterBank &RB) const {
-
- if (X86::VECRRegBankID != RB.getID())
- return TargetOpcode::G_FSUB;
-
- if (Ty == LLT::scalar(32)) {
- if (STI.hasAVX512()) {
- return X86::VSUBSSZrr;
- } else if (STI.hasAVX()) {
- return X86::VSUBSSrr;
- } else if (STI.hasSSE1()) {
- return X86::SUBSSrr;
- }
- } else if (Ty == LLT::scalar(64)) {
- if (STI.hasAVX512()) {
- return X86::VSUBSDZrr;
- } else if (STI.hasAVX()) {
- return X86::VSUBSDrr;
- } else if (STI.hasSSE2()) {
- return X86::SUBSDrr;
- }
- } else if (Ty == LLT::vector(4, 32)) {
- if ((STI.hasAVX512()) && (STI.hasVLX())) {
- return X86::VSUBPSZ128rr;
- } else if (STI.hasAVX()) {
- return X86::VSUBPSrr;
- } else if (STI.hasSSE1()) {
- return X86::SUBPSrr;
- }
- }
-
- return TargetOpcode::G_FSUB;
-}
-
-bool X86InstructionSelector::selectBinaryOp(MachineInstr &I,
- MachineRegisterInfo &MRI,
- MachineFunction &MF) const {
-
- const unsigned DefReg = I.getOperand(0).getReg();
- LLT Ty = MRI.getType(DefReg);
- const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
-
- unsigned NewOpc = I.getOpcode();
-
- switch (NewOpc) {
- case TargetOpcode::G_FADD:
- NewOpc = getFAddOp(Ty, RB);
- break;
- case TargetOpcode::G_FSUB:
- NewOpc = getFSubOp(Ty, RB);
- break;
- default:
- break;
- }
-
- if (NewOpc == I.getOpcode())
- return false;
-
- I.setDesc(TII.get(NewOpc));
-
- return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
-}
-
unsigned X86InstructionSelector::getLoadStoreOp(LLT &Ty, const RegisterBank &RB,
unsigned Opc,
uint64_t Alignment) const {
@@ -562,6 +465,105 @@ bool X86InstructionSelector::selectTrunc(MachineInstr &I,
return true;
}
+bool X86InstructionSelector::selectZext(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+ if (I.getOpcode() != TargetOpcode::G_ZEXT)
+ return false;
+
+ const unsigned DstReg = I.getOperand(0).getReg();
+ const unsigned SrcReg = I.getOperand(1).getReg();
+
+ const LLT DstTy = MRI.getType(DstReg);
+ const LLT SrcTy = MRI.getType(SrcReg);
+
+ if (SrcTy == LLT::scalar(1)) {
+
+ unsigned AndOpc;
+ if (DstTy == LLT::scalar(32))
+ AndOpc = X86::AND32ri8;
+ else if (DstTy == LLT::scalar(64))
+ AndOpc = X86::AND64ri8;
+ else
+ return false;
+
+ const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI);
+ unsigned DefReg =
+ MRI.createVirtualRegister(getRegClassForTypeOnBank(DstTy, RegBank));
+
+ BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII.get(TargetOpcode::SUBREG_TO_REG), DefReg)
+ .addImm(0)
+ .addReg(SrcReg)
+ .addImm(X86::sub_8bit);
+
+ MachineInstr &AndInst =
+ *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AndOpc), DstReg)
+ .addReg(DefReg)
+ .addImm(1);
+
+ constrainSelectedInstRegOperands(AndInst, TII, TRI, RBI);
+
+ I.eraseFromParent();
+ return true;
+ }
+
+ return false;
+}
+
+bool X86InstructionSelector::selectCmp(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+ if (I.getOpcode() != TargetOpcode::G_ICMP)
+ return false;
+
+ X86::CondCode CC;
+ bool SwapArgs;
+ std::tie(CC, SwapArgs) = X86::getX86ConditionCode(
+ (CmpInst::Predicate)I.getOperand(1).getPredicate());
+ unsigned OpSet = X86::getSETFromCond(CC);
+
+ unsigned LHS = I.getOperand(2).getReg();
+ unsigned RHS = I.getOperand(3).getReg();
+
+ if (SwapArgs)
+ std::swap(LHS, RHS);
+
+ unsigned OpCmp;
+ LLT Ty = MRI.getType(LHS);
+
+ switch (Ty.getSizeInBits()) {
+ default:
+ return false;
+ case 8:
+ OpCmp = X86::CMP8rr;
+ break;
+ case 16:
+ OpCmp = X86::CMP16rr;
+ break;
+ case 32:
+ OpCmp = X86::CMP32rr;
+ break;
+ case 64:
+ OpCmp = X86::CMP64rr;
+ break;
+ }
+
+ MachineInstr &CmpInst =
+ *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(OpCmp))
+ .addReg(LHS)
+ .addReg(RHS);
+
+ MachineInstr &SetInst = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII.get(OpSet), I.getOperand(0).getReg());
+
+ constrainSelectedInstRegOperands(CmpInst, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(SetInst, TII, TRI, RBI);
+
+ I.eraseFromParent();
+ return true;
+}
+
InstructionSelector *
llvm::createX86InstructionSelector(const X86TargetMachine &TM,
X86Subtarget &Subtarget,
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index 2a40399ba571..bc73bb1ae8c5 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -36,7 +36,7 @@ enum IntrinsicType : uint16_t {
TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
EXPAND_FROM_MEM,
TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
- FIXUPIMMS_MASKZ, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK, GATHER_AVX2, MASK_BINOP,
+ FIXUPIMMS_MASKZ, CONVERT_TO_MASK, GATHER_AVX2, MASK_BINOP,
};
struct IntrinsicData {
diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp
index 4f5e70414aa9..cf26238c0239 100644
--- a/lib/Target/X86/X86LegalizerInfo.cpp
+++ b/lib/Target/X86/X86LegalizerInfo.cpp
@@ -87,10 +87,16 @@ void X86LegalizerInfo::setLegalizerInfo32bit() {
setAction({G_ZEXT, s32}, Legal);
setAction({G_SEXT, s32}, Legal);
- for (auto Ty : {s8, s16}) {
+ for (auto Ty : {s1, s8, s16}) {
setAction({G_ZEXT, 1, Ty}, Legal);
setAction({G_SEXT, 1, Ty}, Legal);
}
+
+ // Comparison
+ setAction({G_ICMP, s1}, Legal);
+
+ for (auto Ty : {s8, s16, s32, p0})
+ setAction({G_ICMP, 1, Ty}, Legal);
}
void X86LegalizerInfo::setLegalizerInfo64bit() {
@@ -139,10 +145,16 @@ void X86LegalizerInfo::setLegalizerInfo64bit() {
setAction({G_SEXT, Ty}, Legal);
}
- for (auto Ty : {s8, s16, s32}) {
+ for (auto Ty : {s1, s8, s16, s32}) {
setAction({G_ZEXT, 1, Ty}, Legal);
setAction({G_SEXT, 1, Ty}, Legal);
}
+
+ // Comparison
+ setAction({G_ICMP, s1}, Legal);
+
+ for (auto Ty : {s8, s16, s32, s64, p0})
+ setAction({G_ICMP, 1, Ty}, Legal);
}
void X86LegalizerInfo::setLegalizerInfoSSE1() {
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index cf2ceef8013a..7e4cba1c8345 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -320,14 +320,14 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
case CallingConv::X86_RegCall:
if (Is64Bit) {
if (IsWin64) {
- return (HasSSE ? CSR_Win64_RegCall_SaveList :
+ return (HasSSE ? CSR_Win64_RegCall_SaveList :
CSR_Win64_RegCall_NoSSE_SaveList);
} else {
- return (HasSSE ? CSR_SysV64_RegCall_SaveList :
+ return (HasSSE ? CSR_SysV64_RegCall_SaveList :
CSR_SysV64_RegCall_NoSSE_SaveList);
}
} else {
- return (HasSSE ? CSR_32_RegCall_SaveList :
+ return (HasSSE ? CSR_32_RegCall_SaveList :
CSR_32_RegCall_NoSSE_SaveList);
}
case CallingConv::Cold:
@@ -435,15 +435,15 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
return CSR_64_HHVM_RegMask;
case CallingConv::X86_RegCall:
if (Is64Bit) {
- if (IsWin64) {
- return (HasSSE ? CSR_Win64_RegCall_RegMask :
+ if (IsWin64) {
+ return (HasSSE ? CSR_Win64_RegCall_RegMask :
CSR_Win64_RegCall_NoSSE_RegMask);
} else {
- return (HasSSE ? CSR_SysV64_RegCall_RegMask :
+ return (HasSSE ? CSR_SysV64_RegCall_RegMask :
CSR_SysV64_RegCall_NoSSE_RegMask);
}
} else {
- return (HasSSE ? CSR_32_RegCall_RegMask :
+ return (HasSSE ? CSR_32_RegCall_RegMask :
CSR_32_RegCall_NoSSE_RegMask);
}
case CallingConv::Cold:
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index de1514243aeb..02be95e2e556 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -253,6 +253,11 @@ protected:
/// True if the LEA instruction with certain arguments is slow
bool SlowLEA;
+ /// True if the LEA instruction has all three source operands: base, index,
+  /// and offset, or if the LEA instruction uses base and index registers where
+  /// the base is EBP, RBP, or R13.
+ bool Slow3OpsLEA;
+
/// True if INC and DEC instructions are slow when writing to flags
bool SlowIncDec;
@@ -490,6 +495,7 @@ public:
bool callRegIndirect() const { return CallRegIndirect; }
bool LEAusesAG() const { return LEAUsesAG; }
bool slowLEA() const { return SlowLEA; }
+ bool slow3OpsLEA() const { return Slow3OpsLEA; }
bool slowIncDec() const { return SlowIncDec; }
bool hasCDI() const { return HasCDI; }
bool hasPFI() const { return HasPFI; }
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 086f55dd60b5..c6a90725d89c 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -61,6 +61,7 @@ static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner",
namespace llvm {
void initializeWinEHStatePassPass(PassRegistry &);
+void initializeFixupLEAPassPass(PassRegistry &);
void initializeX86ExecutionDepsFixPass(PassRegistry &);
} // end namespace llvm
@@ -75,6 +76,7 @@ extern "C" void LLVMInitializeX86Target() {
initializeWinEHStatePassPass(PR);
initializeFixupBWInstPassPass(PR);
initializeEvexToVexInstPassPass(PR);
+ initializeFixupLEAPassPass(PR);
initializeX86ExecutionDepsFixPass(PR);
}
@@ -87,7 +89,7 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
if (TT.isOSFreeBSD())
return llvm::make_unique<X86FreeBSDTargetObjectFile>();
- if (TT.isOSLinux() || TT.isOSNaCl())
+ if (TT.isOSLinux() || TT.isOSNaCl() || TT.isOSIAMCU())
return llvm::make_unique<X86LinuxNaClTargetObjectFile>();
if (TT.isOSFuchsia())
return llvm::make_unique<X86FuchsiaTargetObjectFile>();
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index f3b619a2956a..80e18161a94b 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -247,35 +247,38 @@ int X86TTIImpl::getArithmeticInstrCost(
}
static const CostTblEntry SSE2UniformConstCostTable[] = {
- { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand.
- { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand.
- { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
-
- { ISD::SHL, MVT::v32i8, 4 }, // 2*(psllw + pand).
- { ISD::SRL, MVT::v32i8, 4 }, // 2*(psrlw + pand).
- { ISD::SRA, MVT::v32i8, 8 }, // 2*(psrlw, pand, pxor, psubb).
-
- { ISD::SDIV, MVT::v16i16, 12 }, // pmulhw sequence
- { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
- { ISD::UDIV, MVT::v16i16, 12 }, // pmulhuw sequence
- { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
- { ISD::SDIV, MVT::v8i32, 38 }, // pmuludq sequence
- { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
- { ISD::UDIV, MVT::v8i32, 30 }, // pmuludq sequence
- { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
+ { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand.
+ { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand.
+ { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
+
+ { ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split.
+ { ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split.
+ { ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.
+
+ { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
+ { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
+ { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
+ { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
+ { ISD::SDIV, MVT::v8i32, 38+2 }, // 2*pmuludq sequence + split.
+ { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
+ { ISD::UDIV, MVT::v8i32, 30+2 }, // 2*pmuludq sequence + split.
+ { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
};
if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
ST->hasSSE2()) {
// pmuldq sequence.
if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
- return LT.first * 30;
+ return LT.first * 32;
if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
return LT.first * 15;
- if (const auto *Entry = CostTableLookup(SSE2UniformConstCostTable, ISD,
- LT.second))
- return LT.first * Entry->Cost;
+ // XOP has faster vXi8 shifts.
+ if ((ISD != ISD::SHL && ISD != ISD::SRL && ISD != ISD::SRA) ||
+ !ST->hasXOP())
+ if (const auto *Entry =
+ CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
}
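
A worked reading of the "+2" entries, under the convention the table comments use: a 256-bit op on a 128-bit-wide subtarget costs the two half-ops plus a split/merge overhead of 2, and the result is still scaled by LT.first for any further legalization splits. For example, v32i8 SHL on SSE2 (illustrative arithmetic only):

#include <cassert>

int main() {
  const int HalfOpCost = 2;  // psllw + pand for one v16i8 half
  const int SplitCost = 2;   // extract/insert overhead for the 256-bit split
  const int Entry = 2 * HalfOpCost + SplitCost; // table value for v32i8 SHL
  assert(Entry == 6);        // total reported cost = LT.first * 6
  return 0;
}
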
static const CostTblEntry AVX2UniformCostTable[] = {
@@ -430,18 +433,18 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::SRL, MVT::v2i64, 2 },
{ ISD::SRA, MVT::v2i64, 2 },
// 256bit shifts require splitting if AVX2 didn't catch them above.
- { ISD::SHL, MVT::v32i8, 2 },
- { ISD::SRL, MVT::v32i8, 4 },
- { ISD::SRA, MVT::v32i8, 4 },
- { ISD::SHL, MVT::v16i16, 2 },
- { ISD::SRL, MVT::v16i16, 4 },
- { ISD::SRA, MVT::v16i16, 4 },
- { ISD::SHL, MVT::v8i32, 2 },
- { ISD::SRL, MVT::v8i32, 4 },
- { ISD::SRA, MVT::v8i32, 4 },
- { ISD::SHL, MVT::v4i64, 2 },
- { ISD::SRL, MVT::v4i64, 4 },
- { ISD::SRA, MVT::v4i64, 4 },
+ { ISD::SHL, MVT::v32i8, 2+2 },
+ { ISD::SRL, MVT::v32i8, 4+2 },
+ { ISD::SRA, MVT::v32i8, 4+2 },
+ { ISD::SHL, MVT::v16i16, 2+2 },
+ { ISD::SRL, MVT::v16i16, 4+2 },
+ { ISD::SRA, MVT::v16i16, 4+2 },
+ { ISD::SHL, MVT::v8i32, 2+2 },
+ { ISD::SRL, MVT::v8i32, 4+2 },
+ { ISD::SRA, MVT::v8i32, 4+2 },
+ { ISD::SHL, MVT::v4i64, 2+2 },
+ { ISD::SRL, MVT::v4i64, 4+2 },
+ { ISD::SRA, MVT::v4i64, 4+2 },
};
// Look for XOP lowering tricks.
@@ -451,23 +454,28 @@ int X86TTIImpl::getArithmeticInstrCost(
static const CostTblEntry SSE2UniformShiftCostTable[] = {
// Uniform splats are cheaper for the following instructions.
- { ISD::SHL, MVT::v16i16, 2 }, // psllw.
- { ISD::SHL, MVT::v8i32, 2 }, // pslld
- { ISD::SHL, MVT::v4i64, 2 }, // psllq.
-
- { ISD::SRL, MVT::v16i16, 2 }, // psrlw.
- { ISD::SRL, MVT::v8i32, 2 }, // psrld.
- { ISD::SRL, MVT::v4i64, 2 }, // psrlq.
-
- { ISD::SRA, MVT::v16i16, 2 }, // psraw.
- { ISD::SRA, MVT::v8i32, 2 }, // psrad.
- { ISD::SRA, MVT::v2i64, 4 }, // 2 x psrad + shuffle.
- { ISD::SRA, MVT::v4i64, 8 }, // 2 x psrad + shuffle.
+ { ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split.
+ { ISD::SHL, MVT::v8i32, 2+2 }, // 2*pslld + split.
+ { ISD::SHL, MVT::v4i64, 2+2 }, // 2*psllq + split.
+
+ { ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split.
+ { ISD::SRL, MVT::v8i32, 2+2 }, // 2*psrld + split.
+ { ISD::SRL, MVT::v4i64, 2+2 }, // 2*psrlq + split.
+
+ { ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split.
+ { ISD::SRA, MVT::v8i32, 2+2 }, // 2*psrad + split.
+ { ISD::SRA, MVT::v2i64, 4 }, // 2*psrad + shuffle.
+ { ISD::SRA, MVT::v4i64, 8+2 }, // 2*(2*psrad + shuffle) + split.
};
if (ST->hasSSE2() &&
((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
(Op2Info == TargetTransformInfo::OK_UniformValue))) {
+
+    // Handle AVX2 uniform v4i64 ISD::SRA; it's not worth a table.
+ if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
+ return LT.first * 4; // 2*psrad + shuffle.
+
if (const auto *Entry =
CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
return LT.first * Entry->Cost;
@@ -581,28 +589,28 @@ int X86TTIImpl::getArithmeticInstrCost(
return LT.first * Entry->Cost;
static const CostTblEntry SSE41CostTable[] = {
- { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence.
- { ISD::SHL, MVT::v32i8, 2*11 }, // pblendvb sequence.
- { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence.
- { ISD::SHL, MVT::v16i16, 2*14 }, // pblendvb sequence.
- { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld
- { ISD::SHL, MVT::v8i32, 2*4 }, // pslld/paddd/cvttps2dq/pmulld
-
- { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence.
- { ISD::SRL, MVT::v32i8, 2*12 }, // pblendvb sequence.
- { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence.
- { ISD::SRL, MVT::v16i16, 2*14 }, // pblendvb sequence.
- { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend.
- { ISD::SRL, MVT::v8i32, 2*11 }, // Shift each lane + blend.
-
- { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence.
- { ISD::SRA, MVT::v32i8, 2*24 }, // pblendvb sequence.
- { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence.
- { ISD::SRA, MVT::v16i16, 2*14 }, // pblendvb sequence.
- { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
- { ISD::SRA, MVT::v8i32, 2*12 }, // Shift each lane + blend.
-
- { ISD::MUL, MVT::v4i32, 1 } // pmulld
+ { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence.
+ { ISD::SHL, MVT::v32i8, 2*11+2 }, // pblendvb sequence + split.
+ { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence.
+ { ISD::SHL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
+ { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld
+ { ISD::SHL, MVT::v8i32, 2*4+2 }, // pslld/paddd/cvttps2dq/pmulld + split
+
+ { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence.
+ { ISD::SRL, MVT::v32i8, 2*12+2 }, // pblendvb sequence + split.
+ { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence.
+ { ISD::SRL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
+ { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend.
+ { ISD::SRL, MVT::v8i32, 2*11+2 }, // Shift each lane + blend + split.
+
+ { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence.
+ { ISD::SRA, MVT::v32i8, 2*24+2 }, // pblendvb sequence + split.
+ { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence.
+ { ISD::SRA, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
+ { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
+ { ISD::SRA, MVT::v8i32, 2*12+2 }, // Shift each lane + blend + split.
+
+ { ISD::MUL, MVT::v4i32, 1 } // pmulld
};
if (ST->hasSSE41())
@@ -612,33 +620,33 @@ int X86TTIImpl::getArithmeticInstrCost(
static const CostTblEntry SSE2CostTable[] = {
// We don't correctly identify costs of casts because they are marked as
// custom.
- { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence.
- { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence.
- { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
- { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence.
- { ISD::SHL, MVT::v4i64, 2*4 }, // splat+shuffle sequence.
-
- { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence.
- { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence.
- { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
- { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence.
- { ISD::SRL, MVT::v4i64, 2*4 }, // splat+shuffle sequence.
-
- { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence.
- { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence.
- { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend.
- { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence.
- { ISD::SRA, MVT::v4i64, 2*12 }, // srl/xor/sub sequence.
-
- { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence.
- { ISD::MUL, MVT::v8i16, 1 }, // pmullw
- { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle
- { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add
-
- { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/
- { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
- { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/
- { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/
+ { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence.
+ { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence.
+ { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
+ { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence.
+ { ISD::SHL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
+
+ { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence.
+ { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence.
+ { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
+ { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence.
+ { ISD::SRL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
+
+ { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence.
+ { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence.
+ { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend.
+ { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence.
+ { ISD::SRA, MVT::v4i64, 2*12+2 }, // srl/xor/sub sequence+split.
+
+ { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence.
+ { ISD::MUL, MVT::v8i16, 1 }, // pmullw
+ { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle
+ { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add
+
+ { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/
+ { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
+ { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/
+ { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/
// It is not a good idea to vectorize division. We have to scalarize it and
// in the process we will often end up having to spill regular
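
The 2*N+2 entries in the tables above follow a convention worth spelling out: a 256-bit type that must be split pays twice the 128-bit cost plus a fixed overhead of 2 for extracting and reinserting the halves. A minimal standalone sketch of that arithmetic (the helper name and the overhead constant are illustrative, taken from this table's convention):

static int splitCost(int HalfCost, int SplitOverhead = 2) {
  // cost(256-bit op) = 2 * cost(128-bit op) + split overhead.
  return 2 * HalfCost + SplitOverhead;
}

int main() {
  // v16i16 SRA: two v8i16 pblendvb sequences (14 each) plus the split
  // overhead of 2 gives the 2*14+2 = 30 entry above.
  return splitCost(14) == 30 ? 0 : 1;
}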
diff --git a/lib/Target/X86/X86WinEHState.cpp b/lib/Target/X86/X86WinEHState.cpp
index 500b26b3be17..3ee14a0ff7b1 100644
--- a/lib/Target/X86/X86WinEHState.cpp
+++ b/lib/Target/X86/X86WinEHState.cpp
@@ -398,7 +398,7 @@ Function *WinEHStatePass::generateLSDAInEAXThunk(Function *ParentFunc) {
/*isVarArg=*/false);
Function *Trampoline =
Function::Create(TrampolineTy, GlobalValue::InternalLinkage,
- Twine("__ehhandler$") + GlobalValue::getRealLinkageName(
+ Twine("__ehhandler$") + GlobalValue::dropLLVMManglingEscape(
ParentFunc->getName()),
TheModule);
BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", Trampoline);
diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp
index b8742683a0c8..1da189c5cd31 100644
--- a/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/lib/Target/XCore/XCoreISelLowering.cpp
@@ -409,7 +409,7 @@ static bool isWordAligned(SDValue Value, SelectionDAG &DAG)
{
KnownBits Known;
DAG.computeKnownBits(Value, Known);
- return Known.Zero.countTrailingOnes() >= 2;
+ return Known.countMinTrailingZeros() >= 2;
}
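
For readers unfamiliar with the KnownBits API this hunk switches to: countMinTrailingZeros() returns the number of low bits that are provably zero, so a result of 2 or more proves 4-byte alignment. A minimal standalone sketch with a toy stand-in type (not the real llvm::KnownBits):

#include <cstdint>

// Toy stand-in for llvm::KnownBits over a 32-bit value: Zero has a bit set
// where the value is known to be 0, One where it is known to be 1.
struct ToyKnownBits {
  uint32_t Zero = 0, One = 0;
  unsigned countMinTrailingZeros() const {
    // Count the run of low bits proven zero.
    unsigned N = 0;
    while (N < 32 && ((Zero >> N) & 1))
      ++N;
    return N;
  }
};

int main() {
  // An address with its two low bits proven clear is 4-byte aligned.
  ToyKnownBits Known;
  Known.Zero = 0x3; // bits 0 and 1 known zero
  return Known.countMinTrailingZeros() >= 2 ? 0 : 1;
}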
SDValue XCoreTargetLowering::
@@ -1131,8 +1131,7 @@ SDValue XCoreTargetLowering::LowerCCCCallTo(
unsigned NumBytes = RetCCInfo.getNextStackOffset();
auto PtrVT = getPointerTy(DAG.getDataLayout());
- Chain = DAG.getCALLSEQ_START(Chain,
- DAG.getConstant(NumBytes, dl, PtrVT, true), dl);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
SmallVector<SDValue, 12> MemOpChains;
diff --git a/lib/Target/XCore/XCoreInstrInfo.td b/lib/Target/XCore/XCoreInstrInfo.td
index f1d52d5a191f..b87ba6548962 100644
--- a/lib/Target/XCore/XCoreInstrInfo.td
+++ b/lib/Target/XCore/XCoreInstrInfo.td
@@ -73,9 +73,10 @@ def XCoreLdwsp : SDNode<"XCoreISD::LDWSP", SDT_XCoreLdwsp,
[SDNPHasChain, SDNPMayLoad]>;
// These are target-independent nodes, but have target-specific formats.
-def SDT_XCoreCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_XCoreCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32> ]>;
def SDT_XCoreCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>,
- SDTCisVT<1, i32> ]>;
+ SDTCisVT<1, i32> ]>;
def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_XCoreCallSeqStart,
[SDNPHasChain, SDNPOutGlue]>;
@@ -323,9 +324,9 @@ class F2R_np<bits<6> opc, string OpcStr> :
//===----------------------------------------------------------------------===//
let Defs = [SP], Uses = [SP] in {
-def ADJCALLSTACKDOWN : PseudoInstXCore<(outs), (ins i32imm:$amt),
- "# ADJCALLSTACKDOWN $amt",
- [(callseq_start timm:$amt)]>;
+def ADJCALLSTACKDOWN : PseudoInstXCore<(outs), (ins i32imm:$amt, i32imm:$amt2),
+ "# ADJCALLSTACKDOWN $amt, $amt2",
+ [(callseq_start timm:$amt, timm:$amt2)]>;
def ADJCALLSTACKUP : PseudoInstXCore<(outs), (ins i32imm:$amt1, i32imm:$amt2),
"# ADJCALLSTACKUP $amt1",
[(callseq_end timm:$amt1, timm:$amt2)]>;
diff --git a/lib/ToolDrivers/CMakeLists.txt b/lib/ToolDrivers/CMakeLists.txt
new file mode 100644
index 000000000000..ad458450fda3
--- /dev/null
+++ b/lib/ToolDrivers/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(llvm-lib)
diff --git a/lib/ToolDrivers/LLVMBuild.txt b/lib/ToolDrivers/LLVMBuild.txt
new file mode 100644
index 000000000000..7da9a5c01005
--- /dev/null
+++ b/lib/ToolDrivers/LLVMBuild.txt
@@ -0,0 +1,24 @@
+;===- ./lib/ToolDrivers/LLVMBuild.txt --------------------------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+; http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[common]
+subdirectories = llvm-lib
+
+[component_0]
+type = Group
+name = ToolDrivers
+parent = Libraries
diff --git a/lib/LibDriver/CMakeLists.txt b/lib/ToolDrivers/llvm-lib/CMakeLists.txt
index ab53a6843446..ab53a6843446 100644
--- a/lib/LibDriver/CMakeLists.txt
+++ b/lib/ToolDrivers/llvm-lib/CMakeLists.txt
diff --git a/lib/LibDriver/LLVMBuild.txt b/lib/ToolDrivers/llvm-lib/LLVMBuild.txt
index 799dc997c0bb..799dc997c0bb 100644
--- a/lib/LibDriver/LLVMBuild.txt
+++ b/lib/ToolDrivers/llvm-lib/LLVMBuild.txt
diff --git a/lib/LibDriver/LibDriver.cpp b/lib/ToolDrivers/llvm-lib/LibDriver.cpp
index c50629d71501..3bae3826d62e 100644
--- a/lib/LibDriver/LibDriver.cpp
+++ b/lib/ToolDrivers/llvm-lib/LibDriver.cpp
@@ -12,7 +12,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/LibDriver/LibDriver.h"
+#include "llvm/ToolDrivers/llvm-lib/LibDriver.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Object/ArchiveWriter.h"
#include "llvm/Option/Arg.h"
diff --git a/lib/LibDriver/Options.td b/lib/ToolDrivers/llvm-lib/Options.td
index 5a56ef7468d4..5a56ef7468d4 100644
--- a/lib/LibDriver/Options.td
+++ b/lib/ToolDrivers/llvm-lib/Options.td
diff --git a/lib/Transforms/Coroutines/CoroFrame.cpp b/lib/Transforms/Coroutines/CoroFrame.cpp
index 19e6789dfa74..4480220f2cd4 100644
--- a/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -177,7 +177,7 @@ SuspendCrossingInfo::SuspendCrossingInfo(Function &F, coro::Shape &Shape)
// consume. Note that crossing coro.save also requires a spill, as any code
// between coro.save and coro.suspend may resume the coroutine and all of the
// state needs to be saved by that time.
- auto markSuspendBlock = [&](IntrinsicInst* BarrierInst) {
+ auto markSuspendBlock = [&](IntrinsicInst *BarrierInst) {
BasicBlock *SuspendBlock = BarrierInst->getParent();
auto &B = getBlockData(SuspendBlock);
B.Suspend = true;
@@ -495,6 +495,78 @@ static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) {
return FramePtr;
}
+// Sets the unwind edge of an instruction to a particular successor.
+static void setUnwindEdgeTo(TerminatorInst *TI, BasicBlock *Succ) {
+ if (auto *II = dyn_cast<InvokeInst>(TI))
+ II->setUnwindDest(Succ);
+ else if (auto *CS = dyn_cast<CatchSwitchInst>(TI))
+ CS->setUnwindDest(Succ);
+ else if (auto *CR = dyn_cast<CleanupReturnInst>(TI))
+ CR->setUnwindDest(Succ);
+ else
+ llvm_unreachable("unexpected terminator instruction");
+}
+
+// Replaces all uses of OldPred with the NewPred block in all PHINodes in a
+// block.
+static void updatePhiNodes(BasicBlock *DestBB, BasicBlock *OldPred,
+ BasicBlock *NewPred,
+ PHINode *LandingPadReplacement) {
+ unsigned BBIdx = 0;
+ for (BasicBlock::iterator I = DestBB->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PN = cast<PHINode>(I);
+
+ // We manually update the LandingPadReplacement PHINode and it is the last
+ // PHI Node. So, if we find it, we are done.
+ if (LandingPadReplacement == PN)
+ break;
+
+ // Reuse the previous value of BBIdx if it lines up. In cases where we
+ // have multiple phi nodes with *lots* of predecessors, this is a speed
+ // win because we don't have to scan the PHI looking for TIBB. This
+    // happens because the BB list of PHI nodes is usually in the same
+ // order.
+ if (PN->getIncomingBlock(BBIdx) != OldPred)
+ BBIdx = PN->getBasicBlockIndex(OldPred);
+
+ assert(BBIdx != (unsigned)-1 && "Invalid PHI Index!");
+ PN->setIncomingBlock(BBIdx, NewPred);
+ }
+}
+
+// Uses SplitEdge unless the successor block is an EHPad, in which case it
+// does EH-specific handling.
+static BasicBlock *ehAwareSplitEdge(BasicBlock *BB, BasicBlock *Succ,
+ LandingPadInst *OriginalPad,
+ PHINode *LandingPadReplacement) {
+ auto *PadInst = Succ->getFirstNonPHI();
+ if (!LandingPadReplacement && !PadInst->isEHPad())
+ return SplitEdge(BB, Succ);
+
+ auto *NewBB = BasicBlock::Create(BB->getContext(), "", BB->getParent(), Succ);
+ setUnwindEdgeTo(BB->getTerminator(), NewBB);
+ updatePhiNodes(Succ, BB, NewBB, LandingPadReplacement);
+
+ if (LandingPadReplacement) {
+ auto *NewLP = OriginalPad->clone();
+ auto *Terminator = BranchInst::Create(Succ, NewBB);
+ NewLP->insertBefore(Terminator);
+ LandingPadReplacement->addIncoming(NewLP, NewBB);
+ return NewBB;
+ }
+ Value *ParentPad = nullptr;
+ if (auto *FuncletPad = dyn_cast<FuncletPadInst>(PadInst))
+ ParentPad = FuncletPad->getParentPad();
+ else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(PadInst))
+ ParentPad = CatchSwitch->getParentPad();
+ else
+ llvm_unreachable("handling for other EHPads not implemented yet");
+
+ auto *NewCleanupPad = CleanupPadInst::Create(ParentPad, {}, "", NewBB);
+ CleanupReturnInst::Create(NewCleanupPad, Succ, NewBB);
+ return NewBB;
+}
+
static void rewritePHIs(BasicBlock &BB) {
// For every incoming edge we will create a block holding all
// incoming values in a single PHI nodes.
@@ -502,7 +574,7 @@ static void rewritePHIs(BasicBlock &BB) {
// loop:
// %n.val = phi i32[%n, %entry], [%inc, %loop]
//
- // It will create:
+ // It will create:
//
// loop.from.entry:
// %n.loop.pre = phi i32 [%n, %entry]
@@ -517,9 +589,22 @@ static void rewritePHIs(BasicBlock &BB) {
// TODO: Simplify PHINodes in the basic block to remove duplicate
// predecessors.
+ LandingPadInst *LandingPad = nullptr;
+ PHINode *ReplPHI = nullptr;
+ if ((LandingPad = dyn_cast_or_null<LandingPadInst>(BB.getFirstNonPHI()))) {
+ // ehAwareSplitEdge will clone the LandingPad in all the edge blocks.
+ // We replace the original landing pad with a PHINode that will collect the
+ // results from all of them.
+ ReplPHI = PHINode::Create(LandingPad->getType(), 1, "", LandingPad);
+ ReplPHI->takeName(LandingPad);
+ LandingPad->replaceAllUsesWith(ReplPHI);
+ // We will erase the original landing pad at the end of this function after
+ // ehAwareSplitEdge cloned it in the transition blocks.
+ }
+
SmallVector<BasicBlock *, 8> Preds(pred_begin(&BB), pred_end(&BB));
for (BasicBlock *Pred : Preds) {
- auto *IncomingBB = SplitEdge(Pred, &BB);
+ auto *IncomingBB = ehAwareSplitEdge(Pred, &BB, LandingPad, ReplPHI);
IncomingBB->setName(BB.getName() + Twine(".from.") + Pred->getName());
auto *PN = cast<PHINode>(&BB.front());
do {
@@ -531,7 +616,14 @@ static void rewritePHIs(BasicBlock &BB) {
InputV->addIncoming(V, Pred);
PN->setIncomingValue(Index, InputV);
PN = dyn_cast<PHINode>(PN->getNextNode());
- } while (PN);
+ } while (PN != ReplPHI); // ReplPHI is either null or the PHI that replaced
+ // the landing pad.
+ }
+
+ if (LandingPad) {
+    // Calls to ehAwareSplitEdge cloned the original landing pad; it is
+    // no longer needed.
+ LandingPad->eraseFromParent();
}
}
diff --git a/lib/Transforms/IPO/FunctionImport.cpp b/lib/Transforms/IPO/FunctionImport.cpp
index 7ed07d63c627..231487923fad 100644
--- a/lib/Transforms/IPO/FunctionImport.cpp
+++ b/lib/Transforms/IPO/FunctionImport.cpp
@@ -610,8 +610,7 @@ void llvm::thinLTOInternalizeModule(Module &TheModule,
return true;
// Lookup the linkage recorded in the summaries during global analysis.
- const auto &GS = DefinedGlobals.find(GV.getGUID());
- GlobalValue::LinkageTypes Linkage;
+ auto GS = DefinedGlobals.find(GV.getGUID());
if (GS == DefinedGlobals.end()) {
// Must have been promoted (possibly conservatively). Find original
// name so that we can access the correct summary and see if it can
@@ -623,7 +622,7 @@ void llvm::thinLTOInternalizeModule(Module &TheModule,
std::string OrigId = GlobalValue::getGlobalIdentifier(
OrigName, GlobalValue::InternalLinkage,
TheModule.getSourceFileName());
- const auto &GS = DefinedGlobals.find(GlobalValue::getGUID(OrigId));
+ GS = DefinedGlobals.find(GlobalValue::getGUID(OrigId));
if (GS == DefinedGlobals.end()) {
// Also check the original non-promoted non-globalized name. In some
// cases a preempted weak value is linked in as a local copy because
@@ -631,15 +630,11 @@ void llvm::thinLTOInternalizeModule(Module &TheModule,
// In that case, since it was originally not a local value, it was
// recorded in the index using the original name.
// FIXME: This may not be needed once PR27866 is fixed.
- const auto &GS = DefinedGlobals.find(GlobalValue::getGUID(OrigName));
+ GS = DefinedGlobals.find(GlobalValue::getGUID(OrigName));
assert(GS != DefinedGlobals.end());
- Linkage = GS->second->linkage();
- } else {
- Linkage = GS->second->linkage();
}
- } else
- Linkage = GS->second->linkage();
- return !GlobalValue::isLocalLinkage(Linkage);
+ }
+ return !GlobalValue::isLocalLinkage(GS->second->linkage());
};
// FIXME: See if we can just internalize directly here via linkage changes
diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp
index 6c83c99ae3be..673d3af0ab52 100644
--- a/lib/Transforms/IPO/Inliner.cpp
+++ b/lib/Transforms/IPO/Inliner.cpp
@@ -502,7 +502,7 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG,
std::swap(CallSites[i--], CallSites[--FirstCallInSCC]);
InlinedArrayAllocasTy InlinedArrayAllocas;
- InlineFunctionInfo InlineInfo(&CG, &GetAssumptionCache);
+ InlineFunctionInfo InlineInfo(&CG, &GetAssumptionCache, PSI);
// Now that we have all of the call sites, loop over them and inline them if
// it looks profitable to do so.
@@ -872,7 +872,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
// Setup the data structure used to plumb customization into the
// `InlineFunction` routine.
InlineFunctionInfo IFI(
- /*cg=*/nullptr, &GetAssumptionCache,
+ /*cg=*/nullptr, &GetAssumptionCache, PSI,
&FAM.getResult<BlockFrequencyAnalysis>(*(CS.getCaller())),
&FAM.getResult<BlockFrequencyAnalysis>(Callee));
diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp
index 2db47b3b5622..8dff2fb3be8a 100644
--- a/lib/Transforms/IPO/PartialInlining.cpp
+++ b/lib/Transforms/IPO/PartialInlining.cpp
@@ -16,6 +16,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
@@ -42,6 +43,11 @@ STATISTIC(NumPartialInlined,
static cl::opt<bool>
DisablePartialInlining("disable-partial-inlining", cl::init(false),
cl::Hidden, cl::desc("Disable partial inlining"));
+// This is an option used by testing:
+static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
+ cl::init(false), cl::ZeroOrMore,
+ cl::ReallyHidden,
+ cl::desc("Skip Cost Analysis"));
static cl::opt<unsigned> MaxNumInlineBlocks(
"max-num-inline-blocks", cl::init(5), cl::Hidden,
@@ -53,6 +59,15 @@ static cl::opt<int> MaxNumPartialInlining(
"max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
cl::desc("Max number of partial inlining. The default is unlimited"));
+// Used only when PGO or user-annotated branch data is absent. It is the
+// minimum relative frequency used to weight the outline region. If BFI
+// produces a larger value, the BFI value will be used.
+static cl::opt<int>
+ OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
+ cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Relative frequency of outline region to "
+ "the entry block"));
+
namespace {
struct FunctionOutliningInfo {
@@ -84,8 +99,6 @@ struct PartialInlinerImpl {
bool run(Module &M);
Function *unswitchFunction(Function *F);
- std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
-
private:
int NumPartialInlining = 0;
std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
@@ -93,11 +106,84 @@ private:
Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI;
ProfileSummaryInfo *PSI;
- bool shouldPartialInline(CallSite CS, OptimizationRemarkEmitter &ORE);
+  // Return the frequency of the OutliningBB relative to F's entry point.
+  // The result is no larger than 1 and is represented as a BranchProbability.
+ // (Note that the outlined region's 'head' block can only have incoming
+ // edges from the guarding entry blocks).
+ BranchProbability getOutliningCallBBRelativeFreq(Function *F,
+ FunctionOutliningInfo *OI,
+ Function *DuplicateFunction,
+ BlockFrequencyInfo *BFI,
+ BasicBlock *OutliningCallBB);
+
+  // Return true if the callee of CS should be partially inlined, i.e. if
+  // partial inlining is expected to be profitable.
+ bool shouldPartialInline(CallSite CS, Function *F, FunctionOutliningInfo *OI,
+ BlockFrequencyInfo *CalleeBFI,
+ BasicBlock *OutliningCallBB,
+ int OutliningCallOverhead,
+ OptimizationRemarkEmitter &ORE);
+
+  // Try to inline DuplicateFunction (cloned from F, with a call to the
+  // OutlinedFunction) into its callers. Return true if any inlining
+  // succeeded.
+ bool tryPartialInline(Function *DuplicateFunction,
+                        Function *F, /* original function */
+ FunctionOutliningInfo *OI, Function *OutlinedFunction,
+ BlockFrequencyInfo *CalleeBFI);
+
+  // Compute the mapping from each use site of DuplicateFunction to the
+  // enclosing BB's profile count.
+ void computeCallsiteToProfCountMap(Function *DuplicateFunction,
+ DenseMap<User *, uint64_t> &SiteCountMap);
+
bool IsLimitReached() {
return (MaxNumPartialInlining != -1 &&
NumPartialInlining >= MaxNumPartialInlining);
}
+
+ CallSite getCallSite(User *U) {
+ CallSite CS;
+ if (CallInst *CI = dyn_cast<CallInst>(U))
+ CS = CallSite(CI);
+ else if (InvokeInst *II = dyn_cast<InvokeInst>(U))
+ CS = CallSite(II);
+ else
+ llvm_unreachable("All uses must be calls");
+ return CS;
+ }
+
+ CallSite getOneCallSiteTo(Function *F) {
+ User *User = *F->user_begin();
+ return getCallSite(User);
+ }
+
+ std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) {
+ CallSite CS = getOneCallSiteTo(F);
+ DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
+ BasicBlock *Block = CS.getParent();
+ return std::make_tuple(DLoc, Block);
+ }
+
+ // Returns the costs associated with function outlining:
+ // - The first value is the non-weighted runtime cost for making the call
+  //   to the outlined function 'OutlinedFunction', including the additional
+ // setup cost in the outlined function itself;
+ // - The second value is the estimated size of the new call sequence in
+ // basic block 'OutliningCallBB';
+ // - The third value is the estimated size of the original code from
+ // function 'F' that is extracted into the outlined function.
+ std::tuple<int, int, int>
+ computeOutliningCosts(Function *F, const FunctionOutliningInfo *OutliningInfo,
+ Function *OutlinedFunction,
+ BasicBlock *OutliningCallBB);
+ // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
+  // approximate both the size and the runtime cost (note that the current
+  // inline cost analysis does not clearly distinguish between the two either).
+ int computeBBInlineCost(BasicBlock *BB);
+
+ std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
+
};
struct PartialInlinerLegacyPass : public ModulePass {
@@ -157,7 +243,7 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {
return isa<ReturnInst>(TI);
};
- auto GetReturnBlock = [=](BasicBlock *Succ1, BasicBlock *Succ2) {
+ auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
if (IsReturnBlock(Succ1))
return std::make_tuple(Succ1, Succ2);
if (IsReturnBlock(Succ2))
@@ -167,7 +253,7 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {
};
// Detect a triangular shape:
- auto GetCommonSucc = [=](BasicBlock *Succ1, BasicBlock *Succ2) {
+ auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
if (IsSuccessor(Succ1, Succ2))
return std::make_tuple(Succ1, Succ2);
if (IsSuccessor(Succ2, Succ1))
@@ -223,7 +309,8 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {
// Do a sanity check of the entries: there should not
// be any successors (not in the entry set) other than
// {ReturnBlock, NonReturnBlock}.
- assert(OutliningInfo->Entries[0] == &F->front());
+ assert(OutliningInfo->Entries[0] == &F->front() &&
+ "Function Entry must be the first in Entries vector");
DenseSet<BasicBlock *> Entries;
for (BasicBlock *E : OutliningInfo->Entries)
Entries.insert(E);
@@ -289,10 +376,54 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {
return OutliningInfo;
}
-bool PartialInlinerImpl::shouldPartialInline(CallSite CS,
- OptimizationRemarkEmitter &ORE) {
- // TODO : more sharing with shouldInline in Inliner.cpp
+// Check if there is PGO data or user-annotated branch data:
+static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
+ if (F->getEntryCount())
+ return true;
+ // Now check if any of the entry block has MD_prof data:
+ for (auto *E : OI->Entries) {
+ BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
+ if (!BR || BR->isUnconditional())
+ continue;
+ uint64_t T, F;
+ if (BR->extractProfMetadata(T, F))
+ return true;
+ }
+ return false;
+}
+
+BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(
+ Function *F, FunctionOutliningInfo *OI, Function *DuplicateFunction,
+ BlockFrequencyInfo *BFI, BasicBlock *OutliningCallBB) {
+
+ auto EntryFreq =
+ BFI->getBlockFreq(&DuplicateFunction->getEntryBlock());
+ auto OutliningCallFreq = BFI->getBlockFreq(OutliningCallBB);
+
+ auto OutlineRegionRelFreq =
+ BranchProbability::getBranchProbability(OutliningCallFreq.getFrequency(),
+ EntryFreq.getFrequency());
+
+ if (hasProfileData(F, OI))
+ return OutlineRegionRelFreq;
+
+  // When profile data is not available, we need to be very conservative in
+  // estimating the overall savings, so make sure the outline region's
+  // relative frequency is not below the threshold specified by the option.
+  OutlineRegionRelFreq = std::max(
+      OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));
+
+ return OutlineRegionRelFreq;
+}
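+
A worked example of the clamp above, with assumed frequencies and plain integer arithmetic in place of BlockFrequency/BranchProbability:

#include <algorithm>
#include <cstdint>

int main() {
  // Assume BFI reports entry frequency 1000 and outlining-call frequency
  // 200, i.e. a 20% relative frequency.
  uint64_t EntryFreq = 1000, OutliningCallFreq = 200;
  unsigned RelFreqPercent =
      static_cast<unsigned>(OutliningCallFreq * 100 / EntryFreq); // 20

  // Without profile data, clamp up to the -outline-region-freq-percent
  // threshold (default 75) to stay conservative about the savings.
  unsigned Clamped = std::max(RelFreqPercent, 75u);
  return Clamped == 75 ? 0 : 1;
}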
+
+bool PartialInlinerImpl::shouldPartialInline(
+ CallSite CS, Function *F /* Original Callee */, FunctionOutliningInfo *OI,
+ BlockFrequencyInfo *CalleeBFI, BasicBlock *OutliningCallBB,
+ int NonWeightedOutliningRcost, OptimizationRemarkEmitter &ORE) {
using namespace ore;
+ if (SkipCostAnalysis)
+ return true;
+
Instruction *Call = CS.getInstruction();
Function *Callee = CS.getCalledFunction();
Function *Caller = CS.getCaller();
@@ -302,36 +433,170 @@ bool PartialInlinerImpl::shouldPartialInline(CallSite CS,
if (IC.isAlways()) {
ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
- << NV("Callee", Callee)
+ << NV("Callee", F)
<< " should always be fully inlined, not partially");
return false;
}
if (IC.isNever()) {
ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
- << NV("Callee", Callee) << " not partially inlined into "
+ << NV("Callee", F) << " not partially inlined into "
<< NV("Caller", Caller)
<< " because it should never be inlined (cost=never)");
return false;
}
if (!IC) {
- ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "TooCostly", Call)
- << NV("Callee", Callee) << " not partially inlined into "
+ ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
+ << NV("Callee", F) << " not partially inlined into "
<< NV("Caller", Caller) << " because too costly to inline (cost="
<< NV("Cost", IC.getCost()) << ", threshold="
<< NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
return false;
}
+ const DataLayout &DL = Caller->getParent()->getDataLayout();
+ // The savings of eliminating the call:
+ int NonWeightedSavings = getCallsiteCost(CS, DL);
+ BlockFrequency NormWeightedSavings(NonWeightedSavings);
+
+ auto RelativeFreq =
+ getOutliningCallBBRelativeFreq(F, OI, Callee, CalleeBFI, OutliningCallBB);
+ auto NormWeightedRcost =
+ BlockFrequency(NonWeightedOutliningRcost) * RelativeFreq;
+
+ // Weighted saving is smaller than weighted cost, return false
+ if (NormWeightedSavings < NormWeightedRcost) {
+ ORE.emit(
+ OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh", Call)
+ << NV("Callee", F) << " not partially inlined into "
+ << NV("Caller", Caller) << " runtime overhead (overhead="
+ << NV("Overhead", (unsigned)NormWeightedRcost.getFrequency())
+ << ", savings="
+ << NV("Savings", (unsigned)NormWeightedSavings.getFrequency()) << ")"
+ << " of making the outlined call is too high");
+
+ return false;
+ }
ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call)
- << NV("Callee", Callee) << " can be partially inlined into "
+ << NV("Callee", F) << " can be partially inlined into "
<< NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
<< " (threshold="
<< NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
return true;
}
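
To make the final comparison concrete, a small sketch with assumed numbers (toy arithmetic; the real code uses BlockFrequency scaled by a BranchProbability):

int main() {
  // Assumed inputs: eliminating the call saves 25 units; the outlined-call
  // sequence plus outlined-function setup costs 40 units, but the outlining
  // call block runs only 50% as often as the entry block.
  int NonWeightedSavings = 25;
  int NonWeightedRcost = 40;
  int WeightedRcost = NonWeightedRcost * 50 / 100; // 20

  // 25 >= 20, so partial inlining is considered profitable here.
  return NonWeightedSavings >= WeightedRcost ? 0 : 1;
}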
+// TODO: Ideally we should share Inliner's InlineCost Analysis code.
+// For now use a simplified version. The returned 'InlineCost' will be used
+// to estimate the size cost as well as the runtime cost of the BB.
+int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
+ int InlineCost = 0;
+ const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+ if (isa<DbgInfoIntrinsic>(I))
+ continue;
+
+ if (CallInst *CI = dyn_cast<CallInst>(I)) {
+ InlineCost += getCallsiteCost(CallSite(CI), DL);
+ continue;
+ }
+
+ if (InvokeInst *II = dyn_cast<InvokeInst>(I)) {
+ InlineCost += getCallsiteCost(CallSite(II), DL);
+ continue;
+ }
+
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
+ InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
+ continue;
+ }
+ InlineCost += InlineConstants::InstrCost;
+ }
+ return InlineCost;
+}
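+
A worked example of this block-cost model, assuming InlineConstants::InstrCost is 5 (the constant is hard-coded here only for illustration):

int main() {
  const int InstrCost = 5; // assumed value of InlineConstants::InstrCost

  // A block with one debug intrinsic (free), one switch with 3 cases,
  // and two ordinary instructions:
  int Cost = 0;
  Cost += 0;                   // dbg intrinsic: skipped
  Cost += (3 + 1) * InstrCost; // switch: (NumCases + 1) * InstrCost = 20
  Cost += 2 * InstrCost;       // two plain instructions = 10
  return Cost == 30 ? 0 : 1;
}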
+
+std::tuple<int, int, int> PartialInlinerImpl::computeOutliningCosts(
+ Function *F, const FunctionOutliningInfo *OI, Function *OutlinedFunction,
+ BasicBlock *OutliningCallBB) {
+ // First compute the cost of the outlined region 'OI' in the original
+ // function 'F':
+ int OutlinedRegionCost = 0;
+ for (BasicBlock &BB : *F) {
+ if (&BB != OI->ReturnBlock &&
+ // Assuming Entry set is small -- do a linear search here:
+ std::find(OI->Entries.begin(), OI->Entries.end(), &BB) ==
+ OI->Entries.end()) {
+ OutlinedRegionCost += computeBBInlineCost(&BB);
+ }
+ }
+
+ // Now compute the cost of the call sequence to the outlined function
+ // 'OutlinedFunction' in BB 'OutliningCallBB':
+ int OutliningFuncCallCost = computeBBInlineCost(OutliningCallBB);
+
+ // Now compute the cost of the extracted/outlined function itself:
+ int OutlinedFunctionCost = 0;
+ for (BasicBlock &BB : *OutlinedFunction) {
+ OutlinedFunctionCost += computeBBInlineCost(&BB);
+ }
+
+ assert(OutlinedFunctionCost >= OutlinedRegionCost &&
+ "Outlined function cost should be no less than the outlined region");
+ int OutliningRuntimeOverhead =
+ OutliningFuncCallCost + (OutlinedFunctionCost - OutlinedRegionCost);
+
+ return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead,
+ OutlinedRegionCost);
+}
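+
The overhead formula above, with assumed costs plugged in:

int main() {
  // Assume: calling the outlined function costs 10, the outlined function
  // body costs 50, and the region it replaced cost 40 in the original.
  int OutliningFuncCallCost = 10;
  int OutlinedFunctionCost = 50;
  int OutlinedRegionCost = 40;

  // Overhead = call sequence + extra setup the extraction introduced.
  int OutliningRuntimeOverhead =
      OutliningFuncCallCost + (OutlinedFunctionCost - OutlinedRegionCost);
  return OutliningRuntimeOverhead == 20 ? 0 : 1;
}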
+
+// Create the callsite-to-profile-count map, which is used to update the
+// original function's entry count after the function is partially inlined
+// into the callsite.
+void PartialInlinerImpl::computeCallsiteToProfCountMap(
+ Function *DuplicateFunction,
+ DenseMap<User *, uint64_t> &CallSiteToProfCountMap) {
+ std::vector<User *> Users(DuplicateFunction->user_begin(),
+ DuplicateFunction->user_end());
+ Function *CurrentCaller = nullptr;
+ BlockFrequencyInfo *CurrentCallerBFI = nullptr;
+
+ auto ComputeCurrBFI = [&,this](Function *Caller) {
+ // For the old pass manager:
+ if (!GetBFI) {
+ if (CurrentCallerBFI)
+ delete CurrentCallerBFI;
+ DominatorTree DT(*Caller);
+ LoopInfo LI(DT);
+ BranchProbabilityInfo BPI(*Caller, LI);
+ CurrentCallerBFI = new BlockFrequencyInfo(*Caller, BPI, LI);
+ } else {
+ // New pass manager:
+ CurrentCallerBFI = &(*GetBFI)(*Caller);
+ }
+ };
+
+ for (User *User : Users) {
+ CallSite CS = getCallSite(User);
+ Function *Caller = CS.getCaller();
+ if (CurrentCaller != Caller) {
+ CurrentCaller = Caller;
+ ComputeCurrBFI(Caller);
+ } else {
+ assert(CurrentCallerBFI && "CallerBFI is not set");
+ }
+ BasicBlock *CallBB = CS.getInstruction()->getParent();
+ auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
+ if (Count)
+ CallSiteToProfCountMap[User] = *Count;
+ else
+ CallSiteToProfCountMap[User] = 0;
+ }
+ if (!GetBFI) {
+ if (CurrentCallerBFI)
+ delete CurrentCallerBFI;
+ }
+}
+
Function *PartialInlinerImpl::unswitchFunction(Function *F) {
if (F->hasAddressTaken())
@@ -347,21 +612,21 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {
if (PSI->isFunctionEntryCold(F))
return nullptr;
- std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
- computeOutliningInfo(F);
+ if (F->user_begin() == F->user_end())
+ return nullptr;
+
+ std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
- if (!OutliningInfo)
+ if (!OI)
return nullptr;
// Clone the function, so that we can hack away on it.
ValueToValueMapTy VMap;
Function *DuplicateFunction = CloneFunction(F, VMap);
- BasicBlock *NewReturnBlock =
- cast<BasicBlock>(VMap[OutliningInfo->ReturnBlock]);
- BasicBlock *NewNonReturnBlock =
- cast<BasicBlock>(VMap[OutliningInfo->NonReturnBlock]);
+ BasicBlock *NewReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
+ BasicBlock *NewNonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
DenseSet<BasicBlock *> NewEntries;
- for (BasicBlock *BB : OutliningInfo->Entries) {
+ for (BasicBlock *BB : OI->Entries) {
NewEntries.insert(cast<BasicBlock>(VMap[BB]));
}
@@ -390,7 +655,7 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {
BasicBlock *PreReturn = NewReturnBlock;
// only split block when necessary:
PHINode *FirstPhi = getFirstPHI(PreReturn);
- unsigned NumPredsFromEntries = OutliningInfo->ReturnBlockPreds.size();
+ unsigned NumPredsFromEntries = OI->ReturnBlockPreds.size();
if (FirstPhi && FirstPhi->getNumIncomingValues() > NumPredsFromEntries + 1) {
NewReturnBlock = NewReturnBlock->splitBasicBlock(
@@ -408,14 +673,14 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {
Ins = NewReturnBlock->getFirstNonPHI();
RetPhi->addIncoming(&*I, PreReturn);
- for (BasicBlock *E : OutliningInfo->ReturnBlockPreds) {
+ for (BasicBlock *E : OI->ReturnBlockPreds) {
BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(NewE), NewE);
OldPhi->removeIncomingValue(NewE);
}
++I;
}
- for (auto E : OutliningInfo->ReturnBlockPreds) {
+ for (auto E : OI->ReturnBlockPreds) {
BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
NewE->getTerminator()->replaceUsesOfWith(PreReturn, NewReturnBlock);
}
@@ -423,7 +688,7 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {
// Returns true if the block is to be partially inlined into the caller
// (i.e. not extracted to the out-of-line function).
- auto ToBeInlined = [=](BasicBlock *BB) {
+ auto ToBeInlined = [&](BasicBlock *BB) {
return BB == NewReturnBlock || NewEntries.count(BB);
};
// Gather up the blocks that we're going to extract.
@@ -443,50 +708,113 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {
BlockFrequencyInfo BFI(*DuplicateFunction, BPI, LI);
// Extract the body of the if.
- Function *ExtractedFunction =
+ Function *OutlinedFunction =
CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false, &BFI, &BPI)
.extractCodeRegion();
- // Inline the top-level if test into all callers.
+ bool AnyInline =
+ tryPartialInline(DuplicateFunction, F, OI.get(), OutlinedFunction, &BFI);
+
+ // Ditch the duplicate, since we're done with it, and rewrite all remaining
+ // users (function pointers, etc.) back to the original function.
+ DuplicateFunction->replaceAllUsesWith(F);
+ DuplicateFunction->eraseFromParent();
+
+ if (AnyInline)
+ return OutlinedFunction;
+
+ // Remove the function that is speculatively created:
+ if (OutlinedFunction)
+ OutlinedFunction->eraseFromParent();
+
+ return nullptr;
+}
+
+bool PartialInlinerImpl::tryPartialInline(Function *DuplicateFunction,
+ Function *F,
+ FunctionOutliningInfo *OI,
+ Function *OutlinedFunction,
+ BlockFrequencyInfo *CalleeBFI) {
+ if (OutlinedFunction == nullptr)
+ return false;
+
+ int NonWeightedRcost;
+ int SizeCost;
+ int OutlinedRegionSizeCost;
+
+ auto OutliningCallBB =
+ getOneCallSiteTo(OutlinedFunction).getInstruction()->getParent();
+
+ std::tie(SizeCost, NonWeightedRcost, OutlinedRegionSizeCost) =
+ computeOutliningCosts(F, OI, OutlinedFunction, OutliningCallBB);
+
+  // If the call sequence to the outlined function is larger than the
+  // original outlined region, outlining does not increase the chances of
+  // inlining 'F' (the inliner uses the size increase to model the cost of
+  // inlining a callee).
+ if (!SkipCostAnalysis && OutlinedRegionSizeCost < SizeCost) {
+ OptimizationRemarkEmitter ORE(F);
+ DebugLoc DLoc;
+ BasicBlock *Block;
+ std::tie(DLoc, Block) = getOneDebugLoc(DuplicateFunction);
+ ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
+ DLoc, Block)
+ << ore::NV("Function", F)
+ << " not partially inlined into callers (Original Size = "
+ << ore::NV("OutlinedRegionOriginalSize", OutlinedRegionSizeCost)
+ << ", Size of call sequence to outlined function = "
+ << ore::NV("NewSize", SizeCost) << ")");
+ return false;
+ }
+
+ assert(F->user_begin() == F->user_end() &&
+ "F's users should all be replaced!");
std::vector<User *> Users(DuplicateFunction->user_begin(),
DuplicateFunction->user_end());
+ DenseMap<User *, uint64_t> CallSiteToProfCountMap;
+ if (F->getEntryCount())
+ computeCallsiteToProfCountMap(DuplicateFunction, CallSiteToProfCountMap);
+
+ auto CalleeEntryCount = F->getEntryCount();
+ uint64_t CalleeEntryCountV = (CalleeEntryCount ? *CalleeEntryCount : 0);
+ bool AnyInline = false;
for (User *User : Users) {
- CallSite CS;
- if (CallInst *CI = dyn_cast<CallInst>(User))
- CS = CallSite(CI);
- else if (InvokeInst *II = dyn_cast<InvokeInst>(User))
- CS = CallSite(II);
- else
- llvm_unreachable("All uses must be calls");
+ CallSite CS = getCallSite(User);
if (IsLimitReached())
continue;
OptimizationRemarkEmitter ORE(CS.getCaller());
- if (!shouldPartialInline(CS, ORE))
+
+ if (!shouldPartialInline(CS, F, OI, CalleeBFI, OutliningCallBB,
+ NonWeightedRcost, ORE))
continue;
- DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
- BasicBlock *Block = CS.getParent();
- ORE.emit(OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", DLoc, Block)
- << ore::NV("Callee", F) << " partially inlined into "
- << ore::NV("Caller", CS.getCaller()));
+ ORE.emit(
+ OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction())
+ << ore::NV("Callee", F) << " partially inlined into "
+ << ore::NV("Caller", CS.getCaller()));
- InlineFunctionInfo IFI(nullptr, GetAssumptionCache);
+ InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
InlineFunction(CS, IFI);
+
+ // Now update the entry count:
+ if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
+ uint64_t CallSiteCount = CallSiteToProfCountMap[User];
+ CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
+ }
+
+ AnyInline = true;
NumPartialInlining++;
- // update stats
+ // Update the stats
NumPartialInlined++;
}
- // Ditch the duplicate, since we're done with it, and rewrite all remaining
- // users (function pointers, etc.) back to the original function.
- DuplicateFunction->replaceAllUsesWith(F);
- DuplicateFunction->eraseFromParent();
-
+ if (AnyInline && CalleeEntryCount)
+ F->setEntryCount(CalleeEntryCountV);
- return ExtractedFunction;
+ return AnyInline;
}
bool PartialInlinerImpl::run(Module &M) {
diff --git a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
index d3a3c24ce7b4..659cb9df00a2 100644
--- a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
+++ b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
@@ -16,6 +16,7 @@
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/ModuleSummaryAnalysis.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TypeMetadataUtils.h"
#include "llvm/Bitcode/BitcodeWriter.h"
#include "llvm/IR/Constants.h"
@@ -178,7 +179,7 @@ void filterModule(
else
GO = new GlobalVariable(
*M, GA->getValueType(), false, GlobalValue::ExternalLinkage,
- (Constant *)nullptr, "", (GlobalVariable *)nullptr,
+ nullptr, "", nullptr,
GA->getThreadLocalMode(), GA->getType()->getAddressSpace());
GO->takeName(GA);
GA->replaceAllUsesWith(GO);
@@ -320,7 +321,8 @@ void splitAndWriteThinLTOBitcode(
// FIXME: Try to re-use BSI and PFI from the original module here.
- ModuleSummaryIndex Index = buildModuleSummaryIndex(M, nullptr, nullptr);
+ ProfileSummaryInfo PSI(M);
+ ModuleSummaryIndex Index = buildModuleSummaryIndex(M, nullptr, &PSI);
SmallVector<char, 0> Buffer;
diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 153a186d5ed4..0ca62b7ae40c 100644
--- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -847,92 +847,6 @@ Value *FAddCombine::createAddendVal(const FAddend &Opnd, bool &NeedNeg) {
return createFMul(OpndVal, Coeff.getValue(Instr->getType()));
}
-/// \brief Return true if we can prove that adding the two values of the
-/// knownbits will not overflow.
-/// Otherwise return false.
-static bool checkRippleForAdd(const KnownBits &LHSKnown,
- const KnownBits &RHSKnown) {
- // Addition of two 2's complement numbers having opposite signs will never
- // overflow.
- if ((LHSKnown.isNegative() && RHSKnown.isNonNegative()) ||
- (LHSKnown.isNonNegative() && RHSKnown.isNegative()))
- return true;
-
- // If either of the values is known to be non-negative, adding them can only
- // overflow if the second is also non-negative, so we can assume that.
- // Two non-negative numbers will only overflow if there is a carry to the
- // sign bit, so we can check if even when the values are as big as possible
- // there is no overflow to the sign bit.
- if (LHSKnown.isNonNegative() || RHSKnown.isNonNegative()) {
- APInt MaxLHS = ~LHSKnown.Zero;
- MaxLHS.clearSignBit();
- APInt MaxRHS = ~RHSKnown.Zero;
- MaxRHS.clearSignBit();
- APInt Result = std::move(MaxLHS) + std::move(MaxRHS);
- return Result.isSignBitClear();
- }
-
- // If either of the values is known to be negative, adding them can only
- // overflow if the second is also negative, so we can assume that.
- // Two negative number will only overflow if there is no carry to the sign
- // bit, so we can check if even when the values are as small as possible
- // there is overflow to the sign bit.
- if (LHSKnown.isNegative() || RHSKnown.isNegative()) {
- APInt MinLHS = LHSKnown.One;
- MinLHS.clearSignBit();
- APInt MinRHS = RHSKnown.One;
- MinRHS.clearSignBit();
- APInt Result = std::move(MinLHS) + std::move(MinRHS);
- return Result.isSignBitSet();
- }
-
- // If we reached here it means that we know nothing about the sign bits.
- // In this case we can't know if there will be an overflow, since by
- // changing the sign bits any two values can be made to overflow.
- return false;
-}
-
-/// Return true if we can prove that:
-/// (sext (add LHS, RHS)) === (add (sext LHS), (sext RHS))
-/// This basically requires proving that the add in the original type would not
-/// overflow to change the sign bit or have a carry out.
-bool InstCombiner::WillNotOverflowSignedAdd(Value *LHS, Value *RHS,
- Instruction &CxtI) {
- // There are different heuristics we can use for this. Here are some simple
- // ones.
-
- // If LHS and RHS each have at least two sign bits, the addition will look
- // like
- //
- // XX..... +
- // YY.....
- //
- // If the carry into the most significant position is 0, X and Y can't both
- // be 1 and therefore the carry out of the addition is also 0.
- //
- // If the carry into the most significant position is 1, X and Y can't both
- // be 0 and therefore the carry out of the addition is also 1.
- //
- // Since the carry into the most significant position is always equal to
- // the carry out of the addition, there is no signed overflow.
- if (ComputeNumSignBits(LHS, 0, &CxtI) > 1 &&
- ComputeNumSignBits(RHS, 0, &CxtI) > 1)
- return true;
-
- unsigned BitWidth = LHS->getType()->getScalarSizeInBits();
- KnownBits LHSKnown(BitWidth);
- computeKnownBits(LHS, LHSKnown, 0, &CxtI);
-
- KnownBits RHSKnown(BitWidth);
- computeKnownBits(RHS, RHSKnown, 0, &CxtI);
-
- // Check if carry bit of addition will not cause overflow.
- if (checkRippleForAdd(LHSKnown, RHSKnown))
- return true;
-
- return false;
-}
-
/// \brief Return true if we can prove that:
/// (sub LHS, RHS) === (sub nsw LHS, RHS)
/// This basically requires proving that the add in the original type would not
@@ -968,13 +882,9 @@ bool InstCombiner::WillNotOverflowSignedSub(Value *LHS, Value *RHS,
bool InstCombiner::WillNotOverflowUnsignedSub(Value *LHS, Value *RHS,
Instruction &CxtI) {
// If the LHS is negative and the RHS is non-negative, no unsigned wrap.
- bool LHSKnownNonNegative, LHSKnownNegative;
- bool RHSKnownNonNegative, RHSKnownNegative;
- ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, /*Depth=*/0,
- &CxtI);
- ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, /*Depth=*/0,
- &CxtI);
- if (LHSKnownNegative && RHSKnownNonNegative)
+ KnownBits LHSKnown = computeKnownBits(LHS, /*Depth=*/0, &CxtI);
+ KnownBits RHSKnown = computeKnownBits(RHS, /*Depth=*/0, &CxtI);
+ if (LHSKnown.isNegative() && RHSKnown.isNonNegative())
return true;
return false;
@@ -1041,6 +951,57 @@ static Value *checkForNegativeOperand(BinaryOperator &I,
return nullptr;
}
+static Instruction *foldAddWithConstant(BinaryOperator &Add,
+ InstCombiner::BuilderTy &Builder) {
+ Value *Op0 = Add.getOperand(0), *Op1 = Add.getOperand(1);
+ const APInt *C;
+ if (!match(Op1, m_APInt(C)))
+ return nullptr;
+
+ if (C->isSignMask()) {
+ // If wrapping is not allowed, then the addition must set the sign bit:
+ // X + (signmask) --> X | signmask
+ if (Add.hasNoSignedWrap() || Add.hasNoUnsignedWrap())
+ return BinaryOperator::CreateOr(Op0, Op1);
+
+ // If wrapping is allowed, then the addition flips the sign bit of LHS:
+ // X + (signmask) --> X ^ signmask
+ return BinaryOperator::CreateXor(Op0, Op1);
+ }
+
+ Value *X;
+ const APInt *C2;
+ Type *Ty = Add.getType();
+
+ // Is this add the last step in a convoluted sext?
+ // add(zext(xor i16 X, -32768), -32768) --> sext X
+ if (match(Op0, m_ZExt(m_Xor(m_Value(X), m_APInt(C2)))) &&
+ C2->isMinSignedValue() && C2->sext(Ty->getScalarSizeInBits()) == *C)
+ return CastInst::Create(Instruction::SExt, X, Ty);
+
+ // (add (zext (add nuw X, C2)), C) --> (zext (add nuw X, C2 + C))
+ // FIXME: This should check hasOneUse to not increase the instruction count?
+ if (C->isNegative() &&
+ match(Op0, m_ZExt(m_NUWAdd(m_Value(X), m_APInt(C2)))) &&
+ C->sge(-C2->sext(C->getBitWidth()))) {
+ Constant *NewC =
+ ConstantInt::get(X->getType(), *C2 + C->trunc(C2->getBitWidth()));
+ return new ZExtInst(Builder.CreateNUWAdd(X, NewC), Ty);
+ }
+
+ // Shifts and add used to flip and mask off the low bit:
+ // add (ashr (shl i32 X, 31), 31), 1 --> and (not X), 1
+ const APInt *C3;
+ if (*C == 1 && match(Op0, m_OneUse(m_AShr(m_Shl(m_Value(X), m_APInt(C2)),
+ m_APInt(C3)))) &&
+ C2 == C3 && *C2 == Ty->getScalarSizeInBits() - 1) {
+ Value *NotX = Builder.CreateNot(X);
+ return BinaryOperator::CreateAnd(NotX, ConstantInt::get(Ty, 1));
+ }
+
+ return nullptr;
+}
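+
The sign-mask folds at the top of this helper rest on two's-complement identities that can be checked exhaustively at a narrow width. A standalone i8 verification sketch (independent of InstCombine; the width is an assumption for the test):

#include <cstdint>

int main() {
  const uint8_t SignMask = 0x80;
  for (unsigned V = 0; V < 256; ++V) {
    uint8_t X = static_cast<uint8_t>(V);
    // With wrapping allowed, adding the sign mask just flips the sign bit.
    if (static_cast<uint8_t>(X + SignMask) != (X ^ SignMask))
      return 1;
    // With nuw (X + 0x80 must not wrap, so X < 0x80), the sign bit was
    // clear and the add simply sets it, which is an 'or'.
    if (X < SignMask &&
        static_cast<uint8_t>(X + SignMask) != (X | SignMask))
      return 1;
  }
  return 0;
}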
+
Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
bool Changed = SimplifyAssociativeOrCommutative(I);
Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
@@ -1056,41 +1017,11 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
if (Value *V = SimplifyUsingDistributiveLaws(I))
return replaceInstUsesWith(I, V);
- const APInt *RHSC;
- if (match(RHS, m_APInt(RHSC))) {
- if (RHSC->isSignMask()) {
- // If wrapping is not allowed, then the addition must set the sign bit:
- // X + (signmask) --> X | signmask
- if (I.hasNoSignedWrap() || I.hasNoUnsignedWrap())
- return BinaryOperator::CreateOr(LHS, RHS);
-
- // If wrapping is allowed, then the addition flips the sign bit of LHS:
- // X + (signmask) --> X ^ signmask
- return BinaryOperator::CreateXor(LHS, RHS);
- }
-
- // Is this add the last step in a convoluted sext?
- Value *X;
- const APInt *C;
- if (match(LHS, m_ZExt(m_Xor(m_Value(X), m_APInt(C)))) &&
- C->isMinSignedValue() &&
- C->sext(LHS->getType()->getScalarSizeInBits()) == *RHSC) {
- // add(zext(xor i16 X, -32768), -32768) --> sext X
- return CastInst::Create(Instruction::SExt, X, LHS->getType());
- }
-
- if (RHSC->isNegative() &&
- match(LHS, m_ZExt(m_NUWAdd(m_Value(X), m_APInt(C)))) &&
- RHSC->sge(-C->sext(RHSC->getBitWidth()))) {
- // (add (zext (add nuw X, C)), Val) -> (zext (add nuw X, C+Val))
- Constant *NewC =
- ConstantInt::get(X->getType(), *C + RHSC->trunc(C->getBitWidth()));
- return new ZExtInst(Builder->CreateNUWAdd(X, NewC), I.getType());
- }
- }
+ if (Instruction *X = foldAddWithConstant(I, *Builder))
+ return X;
- // FIXME: Use the match above instead of dyn_cast to allow these transforms
- // for splat vectors.
+ // FIXME: This should be moved into the above helper function to allow these
+ // transforms for splat vectors.
if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
// zext(bool) + C -> bool ? C + 1 : C
if (ZExtInst *ZI = dyn_cast<ZExtInst>(LHS))
@@ -1285,8 +1216,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
Constant *CI =
ConstantExpr::getTrunc(RHSC, LHSConv->getOperand(0)->getType());
if (ConstantExpr::getZExt(CI, I.getType()) == RHSC &&
- computeOverflowForUnsignedAdd(LHSConv->getOperand(0), CI, &I) ==
- OverflowResult::NeverOverflows) {
+ willNotOverflowUnsignedAdd(LHSConv->getOperand(0), CI, I)) {
// Insert the new, smaller add.
Value *NewAdd =
Builder->CreateNUWAdd(LHSConv->getOperand(0), CI, "addconv");
@@ -1303,9 +1233,8 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
if (LHSConv->getOperand(0)->getType() ==
RHSConv->getOperand(0)->getType() &&
(LHSConv->hasOneUse() || RHSConv->hasOneUse()) &&
- computeOverflowForUnsignedAdd(LHSConv->getOperand(0),
- RHSConv->getOperand(0),
- &I) == OverflowResult::NeverOverflows) {
+ willNotOverflowUnsignedAdd(LHSConv->getOperand(0),
+ RHSConv->getOperand(0), I)) {
// Insert the new integer add.
Value *NewAdd = Builder->CreateNUWAdd(
LHSConv->getOperand(0), RHSConv->getOperand(0), "addconv");
@@ -1347,15 +1276,13 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
}
// TODO(jingyue): Consider WillNotOverflowSignedAdd and
- // WillNotOverflowUnsignedAdd to reduce the number of invocations of
+ // willNotOverflowUnsignedAdd to reduce the number of invocations of
// computeKnownBits.
if (!I.hasNoSignedWrap() && WillNotOverflowSignedAdd(LHS, RHS, I)) {
Changed = true;
I.setHasNoSignedWrap(true);
}
- if (!I.hasNoUnsignedWrap() &&
- computeOverflowForUnsignedAdd(LHS, RHS, &I) ==
- OverflowResult::NeverOverflows) {
+ if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedAdd(LHS, RHS, I)) {
Changed = true;
I.setHasNoUnsignedWrap(true);
}
diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index b114801cc1c0..82dc88f1b3ad 100644
--- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -23,21 +23,6 @@ using namespace PatternMatch;
#define DEBUG_TYPE "instcombine"
-static inline Value *dyn_castNotVal(Value *V) {
- // If this is not(not(x)) don't return that this is a not: we want the two
- // not's to be folded first.
- if (BinaryOperator::isNot(V)) {
- Value *Operand = BinaryOperator::getNotArgument(V);
- if (!IsFreeToInvert(Operand, Operand->hasOneUse()))
- return Operand;
- }
-
- // Constants can be considered to be not'ed values...
- if (ConstantInt *C = dyn_cast<ConstantInt>(V))
- return ConstantInt::get(C->getType(), ~C->getValue());
- return nullptr;
-}
-
/// Similar to getICmpCode but for FCmpInst. This encodes a fcmp predicate into
/// a four bit mask.
static unsigned getFCmpCode(FCmpInst::Predicate CC) {
@@ -713,9 +698,8 @@ Value *InstCombiner::simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1,
}
// This simplification is only valid if the upper range is not negative.
- bool IsNegative, IsNotNegative;
- ComputeSignBit(RangeEnd, IsNotNegative, IsNegative, /*Depth=*/0, Cmp1);
- if (!IsNotNegative)
+ KnownBits Known = computeKnownBits(RangeEnd, /*Depth=*/0, Cmp1);
+ if (!Known.isNonNegative())
return nullptr;
if (Inverted)
@@ -1013,26 +997,22 @@ Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {
/// (~A & ~B) == (~(A | B))
/// (~A | ~B) == (~(A & B))
static Instruction *matchDeMorgansLaws(BinaryOperator &I,
- InstCombiner::BuilderTy *Builder) {
+ InstCombiner::BuilderTy &Builder) {
auto Opcode = I.getOpcode();
assert((Opcode == Instruction::And || Opcode == Instruction::Or) &&
"Trying to match De Morgan's Laws with something other than and/or");
+
// Flip the logic operation.
- if (Opcode == Instruction::And)
- Opcode = Instruction::Or;
- else
- Opcode = Instruction::And;
+ Opcode = (Opcode == Instruction::And) ? Instruction::Or : Instruction::And;
- Value *Op0 = I.getOperand(0);
- Value *Op1 = I.getOperand(1);
- // TODO: Use pattern matchers instead of dyn_cast.
- if (Value *Op0NotVal = dyn_castNotVal(Op0))
- if (Value *Op1NotVal = dyn_castNotVal(Op1))
- if (Op0->hasOneUse() && Op1->hasOneUse()) {
- Value *LogicOp = Builder->CreateBinOp(Opcode, Op0NotVal, Op1NotVal,
- I.getName() + ".demorgan");
- return BinaryOperator::CreateNot(LogicOp);
- }
+ Value *A, *B;
+ if (match(I.getOperand(0), m_OneUse(m_Not(m_Value(A)))) &&
+ match(I.getOperand(1), m_OneUse(m_Not(m_Value(B)))) &&
+ !IsFreeToInvert(A, A->hasOneUse()) &&
+ !IsFreeToInvert(B, B->hasOneUse())) {
+ Value *AndOr = Builder.CreateBinOp(Opcode, A, B, I.getName() + ".demorgan");
+ return BinaryOperator::CreateNot(AndOr);
+ }
return nullptr;
}
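
A quick exhaustive check of the De Morgan identities this matcher implements, at an assumed i8 width (standalone verification, not part of the pass):

#include <cstdint>

int main() {
  for (unsigned A = 0; A < 256; ++A)
    for (unsigned B = 0; B < 256; ++B) {
      uint8_t X = static_cast<uint8_t>(A), Y = static_cast<uint8_t>(B);
      // (~X & ~Y) == ~(X | Y)  and  (~X | ~Y) == ~(X & Y)
      if (static_cast<uint8_t>(~X & ~Y) != static_cast<uint8_t>(~(X | Y)))
        return 1;
      if (static_cast<uint8_t>(~X | ~Y) != static_cast<uint8_t>(~(X & Y)))
        return 1;
    }
  return 0;
}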
@@ -1376,7 +1356,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I))
return FoldedLogic;
- if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder))
+ if (Instruction *DeMorgan = matchDeMorgansLaws(I, *Builder))
return DeMorgan;
{
@@ -2005,18 +1985,6 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
if (Value *V = SimplifyBSwap(I))
return replaceInstUsesWith(I, V);
- if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
- ConstantInt *C1 = nullptr; Value *X = nullptr;
- // (X ^ C1) | C2 --> (X | C2) ^ (C1&~C2)
- if (match(Op0, m_Xor(m_Value(X), m_ConstantInt(C1))) &&
- Op0->hasOneUse()) {
- Value *Or = Builder->CreateOr(X, RHS);
- Or->takeName(Op0);
- return BinaryOperator::CreateXor(Or,
- Builder->getInt(C1->getValue() & ~RHS->getValue()));
- }
- }
-
if (isa<Constant>(Op1))
if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I))
return FoldedLogic;
@@ -2167,7 +2135,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
if (match(Op0, m_And(m_Or(m_Specific(Op1), m_Value(C)), m_Value(A))))
return BinaryOperator::CreateOr(Op1, Builder->CreateAnd(A, C));
- if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder))
+ if (Instruction *DeMorgan = matchDeMorgansLaws(I, *Builder))
return DeMorgan;
// Canonicalize xor to the RHS.
@@ -2399,27 +2367,44 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
}
// Is this a 'not' (~) fed by a binary operator?
- BinaryOperator *NotOp;
- if (match(&I, m_Not(m_BinOp(NotOp)))) {
- if (NotOp->getOpcode() == Instruction::And ||
- NotOp->getOpcode() == Instruction::Or) {
+ BinaryOperator *NotVal;
+ if (match(&I, m_Not(m_BinOp(NotVal)))) {
+ if (NotVal->getOpcode() == Instruction::And ||
+ NotVal->getOpcode() == Instruction::Or) {
// Apply DeMorgan's Law when inverts are free:
// ~(X & Y) --> (~X | ~Y)
// ~(X | Y) --> (~X & ~Y)
- if (IsFreeToInvert(NotOp->getOperand(0),
- NotOp->getOperand(0)->hasOneUse()) &&
- IsFreeToInvert(NotOp->getOperand(1),
- NotOp->getOperand(1)->hasOneUse())) {
- Value *NotX = Builder->CreateNot(NotOp->getOperand(0), "notlhs");
- Value *NotY = Builder->CreateNot(NotOp->getOperand(1), "notrhs");
- if (NotOp->getOpcode() == Instruction::And)
+ if (IsFreeToInvert(NotVal->getOperand(0),
+ NotVal->getOperand(0)->hasOneUse()) &&
+ IsFreeToInvert(NotVal->getOperand(1),
+ NotVal->getOperand(1)->hasOneUse())) {
+ Value *NotX = Builder->CreateNot(NotVal->getOperand(0), "notlhs");
+ Value *NotY = Builder->CreateNot(NotVal->getOperand(1), "notrhs");
+ if (NotVal->getOpcode() == Instruction::And)
return BinaryOperator::CreateOr(NotX, NotY);
return BinaryOperator::CreateAnd(NotX, NotY);
}
- } else if (NotOp->getOpcode() == Instruction::AShr) {
- // ~(~X >>s Y) --> (X >>s Y)
- if (Value *Op0NotVal = dyn_castNotVal(NotOp->getOperand(0)))
- return BinaryOperator::CreateAShr(Op0NotVal, NotOp->getOperand(1));
+ }
+
+ // ~(~X >>s Y) --> (X >>s Y)
+ if (match(NotVal, m_AShr(m_Not(m_Value(X)), m_Value(Y))))
+ return BinaryOperator::CreateAShr(X, Y);
+
+ // If we are inverting a right-shifted constant, we may be able to eliminate
+ // the 'not' by inverting the constant and using the opposite shift type.
+ // Canonicalization rules ensure that only a negative constant uses 'ashr',
+ // but we must check that in case that transform has not fired yet.
+ const APInt *C;
+ if (match(NotVal, m_AShr(m_APInt(C), m_Value(Y))) && C->isNegative()) {
+ // ~(C >>s Y) --> ~C >>u Y (when inverting the replicated sign bits)
+ Constant *NotC = ConstantInt::get(I.getType(), ~(*C));
+ return BinaryOperator::CreateLShr(NotC, Y);
+ }
+
+ if (match(NotVal, m_LShr(m_APInt(C), m_Value(Y))) && C->isNonNegative()) {
+ // ~(C >>u Y) --> ~C >>s Y (when inverting the replicated sign bits)
+ Constant *NotC = ConstantInt::get(I.getType(), ~(*C));
+ return BinaryOperator::CreateAShr(NotC, Y);
}
}
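
The two shift folds above depend on the shifted-in bits being exactly inverted when the constant's sign bit is inverted. A standalone i8 check under the stated sign preconditions (assuming arithmetic right shift for signed types, as on all mainstream targets):

#include <cstdint>

int main() {
  for (unsigned V = 0; V < 256; ++V)
    for (unsigned Y = 0; Y < 8; ++Y) {
      uint8_t C = static_cast<uint8_t>(V);
      uint8_t AShr = static_cast<uint8_t>(static_cast<int8_t>(C) >> Y);
      uint8_t LShr = static_cast<uint8_t>(C >> Y);
      uint8_t NotC = static_cast<uint8_t>(~C);
      if (C & 0x80) {
        // C negative: ~(C >>s Y) == ~C >>u Y
        if (static_cast<uint8_t>(~AShr) != static_cast<uint8_t>(NotC >> Y))
          return 1;
      } else {
        // C non-negative: ~(C >>u Y) == ~C >>s Y
        if (static_cast<uint8_t>(~LShr) !=
            static_cast<uint8_t>(static_cast<int8_t>(NotC) >> Y))
          return 1;
      }
    }
  return 0;
}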
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 6989d67f0060..face7abcc95f 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1384,10 +1384,10 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) {
// Create a mask for bits above (ctlz) or below (cttz) the first known one.
bool IsTZ = II.getIntrinsicID() == Intrinsic::cttz;
- unsigned PossibleZeros = IsTZ ? Known.One.countTrailingZeros()
- : Known.One.countLeadingZeros();
- unsigned DefiniteZeros = IsTZ ? Known.Zero.countTrailingOnes()
- : Known.Zero.countLeadingOnes();
+ unsigned PossibleZeros = IsTZ ? Known.countMaxTrailingZeros()
+ : Known.countMaxLeadingZeros();
+ unsigned DefiniteZeros = IsTZ ? Known.countMinTrailingZeros()
+ : Known.countMinLeadingZeros();
// If all bits above (ctlz) or below (cttz) the first known one are known
// zero, this value is constant.
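
A small example of what the renamed accessors compute, using a toy 8-bit stand-in for KnownBits (the real class lives in llvm/Support/KnownBits.h):

#include <cstdint>

int main() {
  // Known.One = 0b00001000, Known.Zero = 0b00000111: bit 3 is known one and
  // all bits below it are known zero.
  uint8_t One = 0x08, Zero = 0x07;

  unsigned MinTZ = 0; // trailing bits proven zero
  while (MinTZ < 8 && ((Zero >> MinTZ) & 1))
    ++MinTZ;

  unsigned MaxTZ = 0; // trailing bits not proven one
  while (MaxTZ < 8 && !((One >> MaxTZ) & 1))
    ++MaxTZ;

  // MinTZ == MaxTZ == 3, so cttz of this value folds to the constant 3.
  return (MinTZ == 3 && MaxTZ == 3) ? 0 : 1;
}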
diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 312d9baae43a..001a4bcf16f3 100644
--- a/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -559,6 +559,9 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
return new ICmpInst(ICmpInst::ICMP_NE, Src, Zero);
}
+  // FIXME: Maybe combine the next two transforms to handle the no-cast case
+  // more efficiently. Support vector types. Clean up the code by using
+  // m_OneUse.
+
// Transform trunc(lshr (zext A), Cst) to eliminate one type conversion.
Value *A = nullptr; ConstantInt *Cst = nullptr;
if (Src->hasOneUse() &&
@@ -588,15 +591,20 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
// the sign bit of the original value; performing ashr instead of lshr
// generates bits of the same value as the sign bit.
if (Src->hasOneUse() &&
- match(Src, m_LShr(m_SExt(m_Value(A)), m_ConstantInt(Cst))) &&
- cast<Instruction>(Src)->getOperand(0)->hasOneUse()) {
+ match(Src, m_LShr(m_SExt(m_Value(A)), m_ConstantInt(Cst)))) {
+ Value *SExt = cast<Instruction>(Src)->getOperand(0);
+ const unsigned SExtSize = SExt->getType()->getPrimitiveSizeInBits();
const unsigned ASize = A->getType()->getPrimitiveSizeInBits();
+ unsigned ShiftAmt = Cst->getZExtValue();
// This optimization can be only performed when zero bits generated by
// the original lshr aren't pulled into the value after truncation, so we
- // can only shift by values smaller than the size of destination type (in
- // bits).
- if (Cst->getValue().ult(ASize)) {
- Value *Shift = Builder->CreateAShr(A, Cst->getZExtValue());
+ // can only shift by values no larger than the number of extension bits.
+ // FIXME: Instead of bailing when the shift is too large, use and to clear
+ // the extra bits.
+ if (SExt->hasOneUse() && ShiftAmt <= SExtSize - ASize) {
+ // If shifting by the size of the original value in bits or more, it is
+    // being filled with the sign bit, so shift by ASize-1 to avoid UB.
+ Value *Shift = Builder->CreateAShr(A, std::min(ShiftAmt, ASize-1));
Shift->takeName(Src);
return CastInst::CreateIntegerCast(Shift, CI.getType(), true);
}
@@ -1180,9 +1188,8 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) {
// If we know that the value being extended is positive, we can use a zext
// instead.
- bool KnownZero, KnownOne;
- ComputeSignBit(Src, KnownZero, KnownOne, 0, &CI);
- if (KnownZero) {
+ KnownBits Known = computeKnownBits(Src, 0, &CI);
+ if (Known.isNonNegative()) {
Value *ZExt = Builder->CreateZExt(Src, DestTy);
return replaceInstUsesWith(CI, ZExt);
}
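
A brute-force check of the trunc(lshr(sext A), C) rewrite above, for i8 sign-extended to i32 (plain C++, not part of the patch): as long as C stays within the 24 extension bits, truncating the wide logical shift equals an arithmetic shift on A with the amount clamped to 7.

#include <cassert>
#include <cstdint>

int main() {
  for (int A = -128; A <= 127; ++A) {
    int32_t Ext = (int8_t)A;                 // sext i8 -> i32
    for (unsigned C = 0; C <= 24; ++C) {     // C <= SExtSize - ASize
      int8_t ViaWide = (int8_t)((uint32_t)Ext >> C);  // trunc(lshr(sext A), C)
      int8_t ViaNarrow =
          (int8_t)((int8_t)A >> (C < 7 ? C : 7));     // ashr A, min(C, 7)
      assert(ViaWide == ViaNarrow);
    }
  }
}
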
diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 34ce235b3fe2..60ed4057cedd 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -2785,6 +2785,9 @@ Instruction *InstCombiner::foldICmpInstWithConstantNotInt(ICmpInst &I) {
}
/// Try to fold icmp (binop), X or icmp X, (binop).
+/// TODO: A large part of this logic is duplicated in InstSimplify's
+/// simplifyICmpWithBinOp(). We should be able to share that and avoid the code
+/// duplication.
Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
@@ -2794,7 +2797,7 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
if (!BO0 && !BO1)
return nullptr;
- CmpInst::Predicate Pred = I.getPredicate();
+ const CmpInst::Predicate Pred = I.getPredicate();
bool NoOp0WrapProblem = false, NoOp1WrapProblem = false;
if (BO0 && isa<OverflowingBinaryOperator>(BO0))
NoOp0WrapProblem =
@@ -3029,21 +3032,20 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
case Instruction::Sub:
case Instruction::Xor:
if (I.isEquality()) // a+x icmp eq/ne b+x --> a icmp b
- return new ICmpInst(I.getPredicate(), BO0->getOperand(0),
- BO1->getOperand(0));
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
// icmp u/s (a ^ signmask), (b ^ signmask) --> icmp s/u a, b
if (ConstantInt *CI = dyn_cast<ConstantInt>(BO0->getOperand(1))) {
if (CI->getValue().isSignMask()) {
- ICmpInst::Predicate Pred =
+ ICmpInst::Predicate NewPred =
I.isSigned() ? I.getUnsignedPredicate() : I.getSignedPredicate();
- return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+ return new ICmpInst(NewPred, BO0->getOperand(0), BO1->getOperand(0));
}
if (BO0->getOpcode() == Instruction::Xor && CI->isMaxValue(true)) {
- ICmpInst::Predicate Pred =
+ ICmpInst::Predicate NewPred =
I.isSigned() ? I.getUnsignedPredicate() : I.getSignedPredicate();
- Pred = I.getSwappedPredicate(Pred);
- return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+ NewPred = I.getSwappedPredicate(NewPred);
+ return new ICmpInst(NewPred, BO0->getOperand(0), BO1->getOperand(0));
}
}
break;
@@ -3062,21 +3064,27 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
AP.getBitWidth() - AP.countTrailingZeros()));
Value *And1 = Builder->CreateAnd(BO0->getOperand(0), Mask);
Value *And2 = Builder->CreateAnd(BO1->getOperand(0), Mask);
- return new ICmpInst(I.getPredicate(), And1, And2);
+ return new ICmpInst(Pred, And1, And2);
}
}
break;
+
case Instruction::UDiv:
case Instruction::LShr:
- if (I.isSigned())
+ if (I.isSigned() || !BO0->isExact() || !BO1->isExact())
break;
- LLVM_FALLTHROUGH;
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+
case Instruction::SDiv:
+ if (!I.isEquality() || !BO0->isExact() || !BO1->isExact())
+ break;
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+
case Instruction::AShr:
if (!BO0->isExact() || !BO1->isExact())
break;
- return new ICmpInst(I.getPredicate(), BO0->getOperand(0),
- BO1->getOperand(0));
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+
case Instruction::Shl: {
bool NUW = BO0->hasNoUnsignedWrap() && BO1->hasNoUnsignedWrap();
bool NSW = BO0->hasNoSignedWrap() && BO1->hasNoSignedWrap();
@@ -3084,8 +3092,7 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
break;
if (!NSW && I.isSigned())
break;
- return new ICmpInst(I.getPredicate(), BO0->getOperand(0),
- BO1->getOperand(0));
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
}
}
}
@@ -3096,7 +3103,7 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
auto BitwiseAnd =
m_CombineOr(m_And(m_Value(), LSubOne), m_And(LSubOne, m_Value()));
- if (match(BO0, BitwiseAnd) && I.getPredicate() == ICmpInst::ICMP_ULT) {
+ if (match(BO0, BitwiseAnd) && Pred == ICmpInst::ICMP_ULT) {
auto *Zero = Constant::getNullValue(BO0->getType());
return new ICmpInst(ICmpInst::ICMP_NE, Op1, Zero);
}
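
The exact-shift case split above rests on a simple fact: when no set bits are discarded ('exact'), unsigned comparisons of the shifted values agree with comparisons of the originals. A spot check in plain C++ (not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  // Multiples of 8, so "lshr exact x, 3" really is exact.
  for (uint32_t X = 0; X < 1024; X += 8)
    for (uint32_t Y = 0; Y < 1024; Y += 8) {
      assert(((X >> 3) <  (Y >> 3)) == (X <  Y)); // unsigned order preserved
      assert(((X >> 3) == (Y >> 3)) == (X == Y)); // equality preserved
    }
}
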
diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h
index 3be6419a129a..1424f61fe701 100644
--- a/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -30,6 +30,7 @@
#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"
#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
#include "llvm/Transforms/Utils/Local.h"
@@ -388,10 +389,21 @@ private:
bool DoTransform = true);
Instruction *transformSExtICmp(ICmpInst *ICI, Instruction &CI);
- bool WillNotOverflowSignedAdd(Value *LHS, Value *RHS, Instruction &CxtI);
+  bool WillNotOverflowSignedAdd(Value *LHS, Value *RHS, Instruction &CxtI) {
+    return computeOverflowForSignedAdd(LHS, RHS, &CxtI) ==
+           OverflowResult::NeverOverflows;
+  }
+  bool willNotOverflowUnsignedAdd(Value *LHS, Value *RHS, Instruction &CxtI) {
+    return computeOverflowForUnsignedAdd(LHS, RHS, &CxtI) ==
+           OverflowResult::NeverOverflows;
+  }
   bool WillNotOverflowSignedSub(Value *LHS, Value *RHS, Instruction &CxtI);
   bool WillNotOverflowUnsignedSub(Value *LHS, Value *RHS, Instruction &CxtI);
   bool WillNotOverflowSignedMul(Value *LHS, Value *RHS, Instruction &CxtI);
+  bool willNotOverflowUnsignedMul(Value *LHS, Value *RHS, Instruction &CxtI) {
+    return computeOverflowForUnsignedMul(LHS, RHS, &CxtI) ==
+           OverflowResult::NeverOverflows;
+  }
Value *EmitGEPOffset(User *GEP);
Instruction *scalarizePHI(ExtractElementInst &EI, PHINode *PN);
Value *EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask);
@@ -492,7 +504,11 @@ public:
void computeKnownBits(Value *V, KnownBits &Known,
unsigned Depth, Instruction *CxtI) const {
- return llvm::computeKnownBits(V, Known, DL, Depth, &AC, CxtI, &DT);
+ llvm::computeKnownBits(V, Known, DL, Depth, &AC, CxtI, &DT);
+ }
+ KnownBits computeKnownBits(Value *V, unsigned Depth,
+ Instruction *CxtI) const {
+ return llvm::computeKnownBits(V, DL, Depth, &AC, CxtI, &DT);
}
bool MaskedValueIsZero(Value *V, const APInt &Mask, unsigned Depth = 0,
@@ -503,11 +519,6 @@ public:
Instruction *CxtI = nullptr) const {
return llvm::ComputeNumSignBits(Op, DL, Depth, &AC, CxtI, &DT);
}
- void ComputeSignBit(Value *V, bool &KnownZero, bool &KnownOne,
- unsigned Depth = 0, Instruction *CxtI = nullptr) const {
- return llvm::ComputeSignBit(V, KnownZero, KnownOne, DL, Depth, &AC, CxtI,
- &DT);
- }
OverflowResult computeOverflowForUnsignedMul(Value *LHS, Value *RHS,
const Instruction *CxtI) {
return llvm::computeOverflowForUnsignedMul(LHS, RHS, DL, &AC, CxtI, &DT);
@@ -516,6 +527,11 @@ public:
const Instruction *CxtI) {
return llvm::computeOverflowForUnsignedAdd(LHS, RHS, DL, &AC, CxtI, &DT);
}
+ OverflowResult computeOverflowForSignedAdd(const Value *LHS,
+ const Value *RHS,
+ const Instruction *CxtI) const {
+ return llvm::computeOverflowForSignedAdd(LHS, RHS, DL, &AC, CxtI, &DT);
+ }
/// Maximum size of array considered when transforming.
uint64_t MaxArraySizeForCombine;
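
The new willNotOverflow* wrappers just collapse an OverflowResult into a bool. A toy version of the pattern (the real analysis lives in ValueTracking; this sketch uses the classic sufficient condition that both operands have their top bit clear):

#include <cassert>
#include <cstdint>

enum class OverflowResult { NeverOverflows, MayOverflow };

OverflowResult computeOverflowForUnsignedAddSketch(uint32_t A, uint32_t B) {
  // If the high bit of both operands is known zero, A + B < 2^32.
  if (!(A >> 31) && !(B >> 31))
    return OverflowResult::NeverOverflows;
  return OverflowResult::MayOverflow;
}

bool willNotOverflowUnsignedAddSketch(uint32_t A, uint32_t B) {
  return computeOverflowForUnsignedAddSketch(A, B) ==
         OverflowResult::NeverOverflows;
}

int main() {
  assert(willNotOverflowUnsignedAddSketch(0x7fffffffu, 0x7fffffffu));
  // 0x80000000 + 1 does not actually wrap, but the cheap check can only
  // answer "may overflow" here.
  assert(!willNotOverflowUnsignedAddSketch(0x80000000u, 1));
}
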
diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index 675553017838..a4d84ae81aa0 100644
--- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -885,10 +885,8 @@ static bool canReplaceGEPIdxWithZero(InstCombiner &IC, GetElementPtrInst *GEPI,
// first non-zero index.
auto IsAllNonNegative = [&]() {
for (unsigned i = Idx+1, e = GEPI->getNumOperands(); i != e; ++i) {
- bool KnownNonNegative, KnownNegative;
- IC.ComputeSignBit(GEPI->getOperand(i), KnownNonNegative,
- KnownNegative, 0, MemI);
- if (KnownNonNegative)
+ KnownBits Known = IC.computeKnownBits(GEPI->getOperand(i), 0, MemI);
+ if (Known.isNonNegative())
continue;
return false;
}
diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index face9d9237ae..2a35259f2103 100644
--- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -162,11 +162,9 @@ bool InstCombiner::WillNotOverflowSignedMul(Value *LHS, Value *RHS,
// product is exactly the minimum negative number.
// E.g. mul i16 with 17 sign bits: 0xff00 * 0xff80 = 0x8000
// For simplicity we just check if at least one side is not negative.
- bool LHSNonNegative, LHSNegative;
- bool RHSNonNegative, RHSNegative;
- ComputeSignBit(LHS, LHSNonNegative, LHSNegative, /*Depth=*/0, &CxtI);
- ComputeSignBit(RHS, RHSNonNegative, RHSNegative, /*Depth=*/0, &CxtI);
- if (LHSNonNegative || RHSNonNegative)
+ KnownBits LHSKnown = computeKnownBits(LHS, /*Depth=*/0, &CxtI);
+ KnownBits RHSKnown = computeKnownBits(RHS, /*Depth=*/0, &CxtI);
+ if (LHSKnown.isNonNegative() || RHSKnown.isNonNegative())
return true;
}
return false;
@@ -422,8 +420,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
Constant *CI =
ConstantExpr::getTrunc(Op1C, Op0Conv->getOperand(0)->getType());
if (ConstantExpr::getZExt(CI, I.getType()) == Op1C &&
- computeOverflowForUnsignedMul(Op0Conv->getOperand(0), CI, &I) ==
- OverflowResult::NeverOverflows) {
+ willNotOverflowUnsignedMul(Op0Conv->getOperand(0), CI, I)) {
// Insert the new, smaller mul.
Value *NewMul =
Builder->CreateNUWMul(Op0Conv->getOperand(0), CI, "mulconv");
@@ -440,9 +437,8 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
if (Op0Conv->getOperand(0)->getType() ==
Op1Conv->getOperand(0)->getType() &&
(Op0Conv->hasOneUse() || Op1Conv->hasOneUse()) &&
- computeOverflowForUnsignedMul(Op0Conv->getOperand(0),
- Op1Conv->getOperand(0),
- &I) == OverflowResult::NeverOverflows) {
+ willNotOverflowUnsignedMul(Op0Conv->getOperand(0),
+ Op1Conv->getOperand(0), I)) {
// Insert the new integer mul.
Value *NewMul = Builder->CreateNUWMul(
Op0Conv->getOperand(0), Op1Conv->getOperand(0), "mulconv");
@@ -456,9 +452,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
I.setHasNoSignedWrap(true);
}
- if (!I.hasNoUnsignedWrap() &&
- computeOverflowForUnsignedMul(Op0, Op1, &I) ==
- OverflowResult::NeverOverflows) {
+ if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedMul(Op0, Op1, I)) {
Changed = true;
I.setHasNoUnsignedWrap(true);
}
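
The corner case the sign-bit check in WillNotOverflowSignedMul guards against, reproduced in plain C++ (assuming the usual two's-complement narrowing; not part of the patch): two i16 operands with 17 sign bits between them whose product is exactly the minimum negative number.

#include <cassert>
#include <cstdint>

int main() {
  int16_t A = (int16_t)0xff00; // -256, 8 sign bits
  int16_t B = (int16_t)0xff80; // -128, 9 sign bits
  int32_t Wide = (int32_t)A * (int32_t)B;
  assert(Wide == 0x8000);              // 0xff00 * 0xff80 = 0x8000
  assert((int16_t)Wide == INT16_MIN);  // the i16 multiply overflows
  // If at least one operand is known non-negative this cannot happen,
  // which is exactly the cheap extra check the code performs.
}
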
diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 05b01774cd5e..4028a92771a4 100644
--- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -611,7 +611,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
SimplifyDemandedBits(I, 1, AllOnes, Known2, Depth + 1))
return I;
- unsigned Leaders = Known2.Zero.countLeadingOnes();
+ unsigned Leaders = Known2.countMinLeadingZeros();
Known.Zero = APInt::getHighBitsSet(BitWidth, Leaders) & DemandedMask;
break;
}
diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index 1792cb585f87..65b1148cb03b 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -2212,9 +2212,9 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) {
// Canonicalize fcmp_one -> fcmp_oeq
FCmpInst::Predicate FPred; Value *Y;
- if (match(&BI, m_Br(m_FCmp(FPred, m_Value(X), m_Value(Y)),
- TrueDest, FalseDest)) &&
- BI.getCondition()->hasOneUse())
+ if (match(&BI, m_Br(m_OneUse(m_FCmp(FPred, m_Value(X), m_Value(Y))),
+ TrueDest, FalseDest))) {
+ // TODO: Why are we only transforming these 3 predicates?
if (FPred == FCmpInst::FCMP_ONE || FPred == FCmpInst::FCMP_OLE ||
FPred == FCmpInst::FCMP_OGE) {
FCmpInst *Cond = cast<FCmpInst>(BI.getCondition());
@@ -2225,12 +2225,12 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) {
Worklist.Add(Cond);
return &BI;
}
+ }
// Canonicalize icmp_ne -> icmp_eq
ICmpInst::Predicate IPred;
- if (match(&BI, m_Br(m_ICmp(IPred, m_Value(X), m_Value(Y)),
- TrueDest, FalseDest)) &&
- BI.getCondition()->hasOneUse())
+ if (match(&BI, m_Br(m_OneUse(m_ICmp(IPred, m_Value(X), m_Value(Y))),
+ TrueDest, FalseDest))) {
if (IPred == ICmpInst::ICMP_NE || IPred == ICmpInst::ICMP_ULE ||
IPred == ICmpInst::ICMP_SLE || IPred == ICmpInst::ICMP_UGE ||
IPred == ICmpInst::ICMP_SGE) {
@@ -2241,6 +2241,7 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) {
Worklist.Add(Cond);
return &BI;
}
+ }
return nullptr;
}
@@ -2264,8 +2265,8 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) {
unsigned BitWidth = cast<IntegerType>(Cond->getType())->getBitWidth();
KnownBits Known(BitWidth);
computeKnownBits(Cond, Known, 0, &SI);
- unsigned LeadingKnownZeros = Known.Zero.countLeadingOnes();
- unsigned LeadingKnownOnes = Known.One.countLeadingOnes();
+ unsigned LeadingKnownZeros = Known.countMinLeadingZeros();
+ unsigned LeadingKnownOnes = Known.countMinLeadingOnes();
// Compute the number of leading bits we can ignore.
// TODO: A better way to determine this would use ComputeNumSignBits().
@@ -3141,7 +3142,7 @@ combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist,
// Lower dbg.declare intrinsics otherwise their value may be clobbered
// by instcombiner.
- bool DbgDeclaresChanged = LowerDbgDeclare(F);
+ bool MadeIRChange = LowerDbgDeclare(F);
// Iterate while there is work to do.
int Iteration = 0;
@@ -3150,18 +3151,17 @@ combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist,
DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on "
<< F.getName() << "\n");
- bool Changed = prepareICWorklistFromFunction(F, DL, &TLI, Worklist);
+ MadeIRChange |= prepareICWorklistFromFunction(F, DL, &TLI, Worklist);
InstCombiner IC(Worklist, &Builder, F.optForMinSize(), ExpensiveCombines,
AA, AC, TLI, DT, DL, LI);
IC.MaxArraySizeForCombine = MaxArraySize;
- Changed |= IC.run();
- if (!Changed)
+ if (!IC.run())
break;
}
- return DbgDeclaresChanged || Iteration > 1;
+ return MadeIRChange || Iteration > 1;
}
PreservedAnalyses InstCombinePass::run(Function &F,
diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index b034ccc46933..7eea44d6aca0 100644
--- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -613,7 +613,15 @@ public:
bool UseGlobalsGC = true)
: ModulePass(ID), CompileKernel(CompileKernel || ClEnableKasan),
Recover(Recover || ClRecover),
- UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC) {}
+ UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC),
+ // Not a typo: ClWithComdat is almost completely pointless without
+ // ClUseGlobalsGC (because then it only works on modules without
+ // globals, which are rare); it is a prerequisite for ClUseGlobalsGC;
+      // and both suffer from gold PR19002, for which the UseGlobalsGC
+      // constructor argument is designed as a workaround. Therefore, disable both
+ // ClWithComdat and ClUseGlobalsGC unless the frontend says it's ok to
+ // do globals-gc.
+ UseCtorComdat(UseGlobalsGC && ClWithComdat) {}
bool runOnModule(Module &M) override;
static char ID; // Pass identification, replacement for typeid
StringRef getPassName() const override { return "AddressSanitizerModule"; }
@@ -656,6 +664,7 @@ private:
bool CompileKernel;
bool Recover;
bool UseGlobalsGC;
+ bool UseCtorComdat;
Type *IntptrTy;
LLVMContext *C;
Triple TargetTriple;
@@ -1677,7 +1686,7 @@ AddressSanitizerModule::CreateMetadataGlobal(Module &M, Constant *Initializer,
: GlobalVariable::PrivateLinkage;
GlobalVariable *Metadata = new GlobalVariable(
M, Initializer->getType(), false, Linkage, Initializer,
- Twine("__asan_global_") + GlobalValue::getRealLinkageName(OriginalName));
+      Twine("__asan_global_") +
+          GlobalValue::dropLLVMManglingEscape(OriginalName));
Metadata->setSection(getGlobalMetadataSection());
return Metadata;
}
@@ -1782,7 +1791,7 @@ void AddressSanitizerModule::InstrumentGlobalsMachO(
// On recent Mach-O platforms, use a structure which binds the liveness of
// the global variable to the metadata struct. Keep the list of "Liveness" GV
// created to be added to llvm.compiler.used
- StructType *LivenessTy = StructType::get(IntptrTy, IntptrTy, nullptr);
+ StructType *LivenessTy = StructType::get(IntptrTy, IntptrTy);
SmallVector<GlobalValue *, 16> LivenessGlobals(ExtendedGlobals.size());
for (size_t i = 0; i < ExtendedGlobals.size(); i++) {
@@ -1793,9 +1802,9 @@ void AddressSanitizerModule::InstrumentGlobalsMachO(
// On recent Mach-O platforms, we emit the global metadata in a way that
// allows the linker to properly strip dead globals.
- auto LivenessBinder = ConstantStruct::get(
- LivenessTy, Initializer->getAggregateElement(0u),
- ConstantExpr::getPointerCast(Metadata, IntptrTy), nullptr);
+ auto LivenessBinder =
+ ConstantStruct::get(LivenessTy, Initializer->getAggregateElement(0u),
+ ConstantExpr::getPointerCast(Metadata, IntptrTy));
GlobalVariable *Liveness = new GlobalVariable(
M, LivenessTy, false, GlobalVariable::InternalLinkage, LivenessBinder,
Twine("__asan_binder_") + G->getName());
@@ -1893,7 +1902,7 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool
// We initialize an array of such structures and pass it to a run-time call.
StructType *GlobalStructTy =
StructType::get(IntptrTy, IntptrTy, IntptrTy, IntptrTy, IntptrTy,
- IntptrTy, IntptrTy, IntptrTy, nullptr);
+ IntptrTy, IntptrTy, IntptrTy);
SmallVector<GlobalVariable *, 16> NewGlobals(n);
SmallVector<Constant *, 16> Initializers(n);
@@ -1929,10 +1938,9 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool
assert(((RightRedzoneSize + SizeInBytes) % MinRZ) == 0);
Type *RightRedZoneTy = ArrayType::get(IRB.getInt8Ty(), RightRedzoneSize);
- StructType *NewTy = StructType::get(Ty, RightRedZoneTy, nullptr);
- Constant *NewInitializer =
- ConstantStruct::get(NewTy, G->getInitializer(),
- Constant::getNullValue(RightRedZoneTy), nullptr);
+ StructType *NewTy = StructType::get(Ty, RightRedZoneTy);
+ Constant *NewInitializer = ConstantStruct::get(
+ NewTy, G->getInitializer(), Constant::getNullValue(RightRedZoneTy));
// Create a new global variable with enough space for a redzone.
GlobalValue::LinkageTypes Linkage = G->getLinkage();
@@ -2013,7 +2021,7 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool
ConstantExpr::getPointerCast(Name, IntptrTy),
ConstantExpr::getPointerCast(ModuleName, IntptrTy),
ConstantInt::get(IntptrTy, MD.IsDynInit), SourceLoc,
- ConstantExpr::getPointerCast(ODRIndicator, IntptrTy), nullptr);
+ ConstantExpr::getPointerCast(ODRIndicator, IntptrTy));
if (ClInitializers && MD.IsDynInit) HasDynamicallyInitializedGlobals = true;
@@ -2073,7 +2081,7 @@ bool AddressSanitizerModule::runOnModule(Module &M) {
// Put the constructor and destructor in comdat if both
// (1) global instrumentation is not TU-specific
// (2) target is ELF.
- if (ClWithComdat && TargetTriple.isOSBinFormatELF() && CtorComdat) {
+ if (UseCtorComdat && TargetTriple.isOSBinFormatELF() && CtorComdat) {
AsanCtorFunction->setComdat(M.getOrInsertComdat(kAsanModuleCtorName));
appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndDtorPriority,
AsanCtorFunction);
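
The recurring StructType::get/ConstantStruct::get cleanups in this patch drop the old C-style nullptr sentinel in favor of a type-checked variadic signature. An illustrative sketch of the shape of that change (the Type and helper below are stand-ins, not LLVM's declarations):

#include <cassert>
#include <vector>

struct Type {};

// Variadic and type-checked: every argument must be a Type*, and no
// trailing nullptr is needed to mark the end of the list.
template <typename... Ts>
std::vector<Type *> structGetSketch(Ts *...Elements) {
  return {Elements...};
}

int main() {
  Type A, B;
  auto Elems = structGetSketch(&A, &B); // was: get(&A, &B, nullptr)
  assert(Elems.size() == 2);
}
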
diff --git a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index 8786781933ea..e2e3cbdbc295 100644
--- a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -388,7 +388,7 @@ FunctionType *DataFlowSanitizer::getArgsFunctionType(FunctionType *T) {
ArgTypes.push_back(ShadowPtrTy);
Type *RetType = T->getReturnType();
if (!RetType->isVoidTy())
- RetType = StructType::get(RetType, ShadowTy, (Type *)nullptr);
+ RetType = StructType::get(RetType, ShadowTy);
return FunctionType::get(RetType, ArgTypes, T->isVarArg());
}
@@ -476,16 +476,14 @@ bool DataFlowSanitizer::doInitialization(Module &M) {
GetArgTLS = ConstantExpr::getIntToPtr(
ConstantInt::get(IntptrTy, uintptr_t(GetArgTLSPtr)),
PointerType::getUnqual(
- FunctionType::get(PointerType::getUnqual(ArgTLSTy),
- (Type *)nullptr)));
+ FunctionType::get(PointerType::getUnqual(ArgTLSTy), false)));
}
if (GetRetvalTLSPtr) {
RetvalTLS = nullptr;
GetRetvalTLS = ConstantExpr::getIntToPtr(
ConstantInt::get(IntptrTy, uintptr_t(GetRetvalTLSPtr)),
PointerType::getUnqual(
- FunctionType::get(PointerType::getUnqual(ShadowTy),
- (Type *)nullptr)));
+ FunctionType::get(PointerType::getUnqual(ShadowTy), false)));
}
ColdCallWeights = MDBuilder(*Ctx).createBranchWeights(1, 1000);
diff --git a/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp b/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
index 7dea1dee756a..e89384c559fe 100644
--- a/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
+++ b/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
@@ -398,8 +398,8 @@ GlobalVariable *EfficiencySanitizer::createCacheFragInfoGV(
// u64 *ArrayCounter;
// };
auto *StructInfoTy =
- StructType::get(Int8PtrTy, Int32Ty, Int32Ty, Int32PtrTy, Int32PtrTy,
- Int8PtrPtrTy, Int64PtrTy, Int64PtrTy, nullptr);
+ StructType::get(Int8PtrTy, Int32Ty, Int32Ty, Int32PtrTy, Int32PtrTy,
+ Int8PtrPtrTy, Int64PtrTy, Int64PtrTy);
auto *StructInfoPtrTy = StructInfoTy->getPointerTo();
// This structure should be kept consistent with the CacheFragInfo struct
// in the runtime library.
@@ -408,8 +408,7 @@ GlobalVariable *EfficiencySanitizer::createCacheFragInfoGV(
// u32 NumStructs;
// StructInfo *Structs;
// };
- auto *CacheFragInfoTy =
- StructType::get(Int8PtrTy, Int32Ty, StructInfoPtrTy, nullptr);
+ auto *CacheFragInfoTy = StructType::get(Int8PtrTy, Int32Ty, StructInfoPtrTy);
std::vector<StructType *> Vec = M.getIdentifiedStructTypes();
unsigned NumStructs = 0;
@@ -457,24 +456,23 @@ GlobalVariable *EfficiencySanitizer::createCacheFragInfoGV(
ArrayCounterIdx[0] = ConstantInt::get(Int32Ty, 0);
ArrayCounterIdx[1] = ConstantInt::get(Int32Ty,
getArrayCounterIdx(StructTy));
- Initializers.push_back(
- ConstantStruct::get(
- StructInfoTy,
- ConstantExpr::getPointerCast(StructCounterName, Int8PtrTy),
- ConstantInt::get(Int32Ty,
- DL.getStructLayout(StructTy)->getSizeInBytes()),
- ConstantInt::get(Int32Ty, StructTy->getNumElements()),
- Offset == nullptr ? ConstantPointerNull::get(Int32PtrTy) :
- ConstantExpr::getPointerCast(Offset, Int32PtrTy),
- Size == nullptr ? ConstantPointerNull::get(Int32PtrTy) :
- ConstantExpr::getPointerCast(Size, Int32PtrTy),
- TypeName == nullptr ? ConstantPointerNull::get(Int8PtrPtrTy) :
- ConstantExpr::getPointerCast(TypeName, Int8PtrPtrTy),
- ConstantExpr::getGetElementPtr(CounterArrayTy, Counters,
- FieldCounterIdx),
- ConstantExpr::getGetElementPtr(CounterArrayTy, Counters,
- ArrayCounterIdx),
- nullptr));
+ Initializers.push_back(ConstantStruct::get(
+ StructInfoTy,
+ ConstantExpr::getPointerCast(StructCounterName, Int8PtrTy),
+ ConstantInt::get(Int32Ty,
+ DL.getStructLayout(StructTy)->getSizeInBytes()),
+ ConstantInt::get(Int32Ty, StructTy->getNumElements()),
+ Offset == nullptr ? ConstantPointerNull::get(Int32PtrTy)
+ : ConstantExpr::getPointerCast(Offset, Int32PtrTy),
+ Size == nullptr ? ConstantPointerNull::get(Int32PtrTy)
+ : ConstantExpr::getPointerCast(Size, Int32PtrTy),
+ TypeName == nullptr
+ ? ConstantPointerNull::get(Int8PtrPtrTy)
+ : ConstantExpr::getPointerCast(TypeName, Int8PtrPtrTy),
+ ConstantExpr::getGetElementPtr(CounterArrayTy, Counters,
+ FieldCounterIdx),
+ ConstantExpr::getGetElementPtr(CounterArrayTy, Counters,
+ ArrayCounterIdx)));
}
// Structs.
Constant *StructInfo;
@@ -491,11 +489,8 @@ GlobalVariable *EfficiencySanitizer::createCacheFragInfoGV(
auto *CacheFragInfoGV = new GlobalVariable(
M, CacheFragInfoTy, true, GlobalVariable::InternalLinkage,
- ConstantStruct::get(CacheFragInfoTy,
- UnitName,
- ConstantInt::get(Int32Ty, NumStructs),
- StructInfo,
- nullptr));
+ ConstantStruct::get(CacheFragInfoTy, UnitName,
+ ConstantInt::get(Int32Ty, NumStructs), StructInfo));
return CacheFragInfoGV;
}
diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 15333a5317dd..ff753c20a94a 100644
--- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -1576,13 +1576,16 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Value *CreateShadowCast(IRBuilder<> &IRB, Value *V, Type *dstTy,
bool Signed = false) {
Type *srcTy = V->getType();
+ size_t srcSizeInBits = VectorOrPrimitiveTypeSizeInBits(srcTy);
+ size_t dstSizeInBits = VectorOrPrimitiveTypeSizeInBits(dstTy);
+ if (srcSizeInBits > 1 && dstSizeInBits == 1)
+ return IRB.CreateICmpNE(V, getCleanShadow(V));
+
if (dstTy->isIntegerTy() && srcTy->isIntegerTy())
return IRB.CreateIntCast(V, dstTy, Signed);
if (dstTy->isVectorTy() && srcTy->isVectorTy() &&
dstTy->getVectorNumElements() == srcTy->getVectorNumElements())
return IRB.CreateIntCast(V, dstTy, Signed);
- size_t srcSizeInBits = VectorOrPrimitiveTypeSizeInBits(srcTy);
- size_t dstSizeInBits = VectorOrPrimitiveTypeSizeInBits(dstTy);
Value *V1 = IRB.CreateBitCast(V, Type::getIntNTy(*MS.C, srcSizeInBits));
Value *V2 =
IRB.CreateIntCast(V1, Type::getIntNTy(*MS.C, dstSizeInBits), Signed);
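
Why the new wide-to-i1 path above compares against zero instead of truncating, restated in plain C++ (not part of the patch): a value is poisoned if any shadow bit is set, and a plain trunc would keep only bit 0.

#include <cassert>
#include <cstdint>

int main() {
  auto ShadowToI1 = [](uint32_t Shadow) { return Shadow != 0; }; // icmp ne 0
  assert(ShadowToI1(0x00000100));   // poisoned even though bit 0 is clear
  assert((0x00000100u & 1) == 0);   // a trunc to i1 would drop the poison
  assert(!ShadowToI1(0));           // a clean shadow stays clean
}
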
diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index 3f1a77b49a44..ee493a8ec7e1 100644
--- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -442,9 +442,8 @@ static bool processAdd(BinaryOperator *AddOp, LazyValueInfo *LVI) {
bool Changed = false;
if (!NUW) {
- ConstantRange NUWRange =
- LRange.makeGuaranteedNoWrapRegion(BinaryOperator::Add, LRange,
- OBO::NoUnsignedWrap);
+ ConstantRange NUWRange = ConstantRange::makeGuaranteedNoWrapRegion(
+ BinaryOperator::Add, LRange, OBO::NoUnsignedWrap);
if (!NUWRange.isEmptySet()) {
bool NewNUW = NUWRange.contains(LazyRRange());
AddOp->setHasNoUnsignedWrap(NewNUW);
@@ -452,9 +451,8 @@ static bool processAdd(BinaryOperator *AddOp, LazyValueInfo *LVI) {
}
}
if (!NSW) {
- ConstantRange NSWRange =
- LRange.makeGuaranteedNoWrapRegion(BinaryOperator::Add, LRange,
- OBO::NoSignedWrap);
+ ConstantRange NSWRange = ConstantRange::makeGuaranteedNoWrapRegion(
+ BinaryOperator::Add, LRange, OBO::NoSignedWrap);
if (!NSWRange.isEmptySet()) {
bool NewNSW = NSWRange.contains(LazyRRange());
AddOp->setHasNoSignedWrap(NewNSW);
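
For an add of a constant C, the guaranteed no-unsigned-wrap region that makeGuaranteedNoWrapRegion describes is simply [0, UINT_MAX - C]; the pass then asks whether LVI's range for the other operand sits inside it. A boundary check of that simplified constant case (plain C++, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t C = 10;
  const uint32_t HiInclusive = UINT32_MAX - C;
  assert(HiInclusive + C == UINT32_MAX);       // last input that cannot wrap
  assert((uint32_t)(HiInclusive + 1 + C) < C); // one past it wraps to 0
}
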
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 48d5ae88cda9..6693a26e8890 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -144,6 +144,10 @@ private:
bool recognizePopcount();
void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
PHINode *CntPhi, Value *Var);
+ bool recognizeAndInsertCTLZ();
+ void transformLoopToCountable(BasicBlock *PreCondBB, Instruction *CntInst,
+ PHINode *CntPhi, Value *Var, const DebugLoc DL,
+ bool ZeroCheck, bool IsCntPhiUsedOutsideLoop);
/// @}
};
@@ -994,7 +998,7 @@ bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
}
bool LoopIdiomRecognize::runOnNoncountableLoop() {
- return recognizePopcount();
+ return recognizePopcount() || recognizeAndInsertCTLZ();
}
/// Check if the given conditional branch is based on the comparison between
@@ -1159,6 +1163,167 @@ static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
return true;
}
+/// Return true if the idiom is detected in the loop.
+///
+/// Additionally:
+///       1) \p CntInst is set to the instruction counting leading zeros (CTLZ),
+///          or nullptr if there is no such instruction.
+///       2) \p CntPhi is set to the corresponding phi node,
+///          or nullptr if there is no such node.
+///       3) \p Var is set to the value whose CTLZ could be used.
+///       4) \p DefX is set to the instruction calculating the loop exit condition.
+///
+/// The core idiom we are trying to detect is:
+/// \code
+/// if (x0 == 0)
+/// goto loop-exit // the precondition of the loop
+/// cnt0 = init-val;
+/// do {
+/// x = phi (x0, x.next); //PhiX
+/// cnt = phi(cnt0, cnt.next);
+///
+/// cnt.next = cnt + 1;
+/// ...
+/// x.next = x >> 1; // DefX
+/// ...
+/// } while(x.next != 0);
+///
+/// loop-exit:
+/// \endcode
+static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX,
+ Instruction *&CntInst, PHINode *&CntPhi,
+ Instruction *&DefX) {
+ BasicBlock *LoopEntry;
+ Value *VarX = nullptr;
+
+ DefX = nullptr;
+ PhiX = nullptr;
+ CntInst = nullptr;
+ CntPhi = nullptr;
+ LoopEntry = *(CurLoop->block_begin());
+
+ // step 1: Check if the loop-back branch is in desirable form.
+ if (Value *T = matchCondition(
+ dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
+ DefX = dyn_cast<Instruction>(T);
+ else
+ return false;
+
+ // step 2: detect instructions corresponding to "x.next = x >> 1"
+ if (!DefX || DefX->getOpcode() != Instruction::AShr)
+ return false;
+  ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
+  if (!Shft || !Shft->isOne())
+    return false;
+ VarX = DefX->getOperand(0);
+
+ // step 3: Check the recurrence of variable X
+ PhiX = dyn_cast<PHINode>(VarX);
+ if (!PhiX || (PhiX->getOperand(0) != DefX && PhiX->getOperand(1) != DefX))
+ return false;
+
+  // step 4: Find the instruction which counts the CTLZ: cnt.next = cnt + 1
+  //       TODO: We can skip this step. If the loop trip count is known (CTLZ),
+  //       then all uses of "cnt.next" could be optimized to the trip count
+  //       plus "cnt0". Currently this is not optimized.
+ // This step could be used to detect POPCNT instruction:
+ // cnt.next = cnt + (x.next & 1)
+ for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
+ IterE = LoopEntry->end();
+ Iter != IterE; Iter++) {
+ Instruction *Inst = &*Iter;
+ if (Inst->getOpcode() != Instruction::Add)
+ continue;
+
+ ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
+ if (!Inc || !Inc->isOne())
+ continue;
+
+ PHINode *Phi = dyn_cast<PHINode>(Inst->getOperand(0));
+ if (!Phi || Phi->getParent() != LoopEntry)
+ continue;
+
+ CntInst = Inst;
+ CntPhi = Phi;
+ break;
+ }
+ if (!CntInst)
+ return false;
+
+ return true;
+}
+
+/// Recognize a CTLZ idiom in a non-countable loop and convert the loop
+/// to a countable one (with a CTLZ-based trip count).
+/// Returns true if CTLZ is inserted as the new trip count; otherwise false.
+bool LoopIdiomRecognize::recognizeAndInsertCTLZ() {
+ // Give up if the loop has multiple blocks or multiple backedges.
+ if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
+ return false;
+
+ Instruction *CntInst, *DefX;
+ PHINode *CntPhi, *PhiX;
+ if (!detectCTLZIdiom(CurLoop, PhiX, CntInst, CntPhi, DefX))
+ return false;
+
+ bool IsCntPhiUsedOutsideLoop = false;
+ for (User *U : CntPhi->users())
+ if (!CurLoop->contains(dyn_cast<Instruction>(U))) {
+ IsCntPhiUsedOutsideLoop = true;
+ break;
+ }
+ bool IsCntInstUsedOutsideLoop = false;
+ for (User *U : CntInst->users())
+ if (!CurLoop->contains(dyn_cast<Instruction>(U))) {
+ IsCntInstUsedOutsideLoop = true;
+ break;
+ }
+ // If both CntInst and CntPhi are used outside the loop the profitability
+ // is questionable.
+ if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
+ return false;
+
+  // For some CPUs the result of the CTLZ(X) intrinsic is undefined
+  // when X is 0. If we cannot guarantee X != 0, we need to check for this
+  // when expanding the intrinsic.
+ bool ZeroCheck = false;
+  // It is safe to assume the preheader exists, as it was checked in the
+  // parent function runOnLoop.
+ BasicBlock *PH = CurLoop->getLoopPreheader();
+ Value *InitX = PhiX->getIncomingValueForBlock(PH);
+  // If we check X != 0 before entering the loop we don't need a zero
+  // check in the CTLZ intrinsic.
+ if (BasicBlock *PreCondBB = PH->getSinglePredecessor())
+ if (BranchInst *PreCondBr =
+ dyn_cast<BranchInst>(PreCondBB->getTerminator())) {
+ if (matchCondition(PreCondBr, PH) == InitX)
+ ZeroCheck = true;
+ }
+
+  // Check if the CTLZ intrinsic is profitable. Assume it is always profitable
+  // if we end up deleting the loop, i.e. its body is just these 6 instructions:
+ // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
+ // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
+ // %shr = ashr %n.addr.0, 1
+ // %tobool = icmp eq %shr, 0
+ // %inc = add nsw %i.0, 1
+ // br i1 %tobool
+
+ IRBuilder<> Builder(PH->getTerminator());
+ SmallVector<const Value *, 2> Ops =
+ {InitX, ZeroCheck ? Builder.getTrue() : Builder.getFalse()};
+ ArrayRef<const Value *> Args(Ops);
+ if (CurLoop->getHeader()->size() != 6 &&
+ TTI->getIntrinsicCost(Intrinsic::ctlz, InitX->getType(), Args) >
+ TargetTransformInfo::TCC_Basic)
+ return false;
+
+ const DebugLoc DL = DefX->getDebugLoc();
+ transformLoopToCountable(PH, CntInst, CntPhi, InitX, DL, ZeroCheck,
+ IsCntPhiUsedOutsideLoop);
+ return true;
+}
+
/// Recognizes a population count idiom in a non-countable loop.
///
/// If detected, transforms the relevant code to issue the popcount intrinsic
@@ -1222,6 +1387,134 @@ static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
return CI;
}
+static CallInst *createCTLZIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
+ const DebugLoc &DL, bool ZeroCheck) {
+ Value *Ops[] = {Val, ZeroCheck ? IRBuilder.getTrue() : IRBuilder.getFalse()};
+ Type *Tys[] = {Val->getType()};
+
+ Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
+ Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctlz, Tys);
+ CallInst *CI = IRBuilder.CreateCall(Func, Ops);
+ CI->setDebugLoc(DL);
+
+ return CI;
+}
+
+/// Transform the following loop:
+/// loop:
+/// CntPhi = PHI [Cnt0, CntInst]
+/// PhiX = PHI [InitX, DefX]
+/// CntInst = CntPhi + 1
+/// DefX = PhiX >> 1
+///        LOOP_BODY
+/// Br: loop if (DefX != 0)
+/// Use(CntPhi) or Use(CntInst)
+///
+/// Into:
+/// If CntPhi used outside the loop:
+/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
+/// Count = CountPrev + 1
+/// else
+/// Count = BitWidth(InitX) - CTLZ(InitX)
+/// loop:
+/// CntPhi = PHI [Cnt0, CntInst]
+/// PhiX = PHI [InitX, DefX]
+/// PhiCount = PHI [Count, Dec]
+/// CntInst = CntPhi + 1
+/// DefX = PhiX >> 1
+/// Dec = PhiCount - 1
+/// LOOP_BODY
+/// Br: loop if (Dec != 0)
+/// Use(CountPrev + Cnt0) // Use(CntPhi)
+/// or
+/// Use(Count + Cnt0) // Use(CntInst)
+///
+/// If LOOP_BODY is empty the loop will be deleted.
+/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
+void LoopIdiomRecognize::transformLoopToCountable(
+ BasicBlock *Preheader, Instruction *CntInst, PHINode *CntPhi, Value *InitX,
+ const DebugLoc DL, bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
+ BranchInst *PreheaderBr = dyn_cast<BranchInst>(Preheader->getTerminator());
+
+ // Step 1: Insert the CTLZ instruction at the end of the preheader block
+ // Count = BitWidth - CTLZ(InitX);
+ // If there are uses of CntPhi create:
+ // CountPrev = BitWidth - CTLZ(InitX >> 1);
+ IRBuilder<> Builder(PreheaderBr);
+ Builder.SetCurrentDebugLocation(DL);
+ Value *CTLZ, *Count, *CountPrev, *NewCount, *InitXNext;
+
+ if (IsCntPhiUsedOutsideLoop)
+ InitXNext = Builder.CreateAShr(InitX,
+ ConstantInt::get(InitX->getType(), 1));
+ else
+ InitXNext = InitX;
+ CTLZ = createCTLZIntrinsic(Builder, InitXNext, DL, ZeroCheck);
+ Count = Builder.CreateSub(
+ ConstantInt::get(CTLZ->getType(),
+ CTLZ->getType()->getIntegerBitWidth()),
+ CTLZ);
+ if (IsCntPhiUsedOutsideLoop) {
+ CountPrev = Count;
+ Count = Builder.CreateAdd(
+ CountPrev,
+ ConstantInt::get(CountPrev->getType(), 1));
+ }
+ if (IsCntPhiUsedOutsideLoop)
+ NewCount = Builder.CreateZExtOrTrunc(CountPrev,
+ cast<IntegerType>(CntInst->getType()));
+ else
+ NewCount = Builder.CreateZExtOrTrunc(Count,
+ cast<IntegerType>(CntInst->getType()));
+
+  // If the CTLZ counter's initial value is not zero, insert an Add instruction.
+ Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
+ ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
+ if (!InitConst || !InitConst->isZero())
+ NewCount = Builder.CreateAdd(NewCount, CntInitVal);
+
+ // Step 2: Insert new IV and loop condition:
+ // loop:
+ // ...
+ // PhiCount = PHI [Count, Dec]
+ // ...
+ // Dec = PhiCount - 1
+ // ...
+ // Br: loop if (Dec != 0)
+ BasicBlock *Body = *(CurLoop->block_begin());
+ auto *LbBr = dyn_cast<BranchInst>(Body->getTerminator());
+ ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
+ Type *Ty = Count->getType();
+
+ PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
+
+ Builder.SetInsertPoint(LbCond);
+ Instruction *TcDec = cast<Instruction>(
+ Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
+ "tcdec", false, true));
+
+ TcPhi->addIncoming(Count, Preheader);
+ TcPhi->addIncoming(TcDec, Body);
+
+ CmpInst::Predicate Pred =
+ (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
+ LbCond->setPredicate(Pred);
+ LbCond->setOperand(0, TcDec);
+ LbCond->setOperand(1, ConstantInt::get(Ty, 0));
+
+ // Step 3: All the references to the original counter outside
+ // the loop are replaced with the NewCount -- the value returned from
+ // __builtin_ctlz(x).
+ if (IsCntPhiUsedOutsideLoop)
+ CntPhi->replaceUsesOutsideBlock(NewCount, Body);
+ else
+ CntInst->replaceUsesOutsideBlock(NewCount, Body);
+
+  // Step 4: Forget the "non-computable" trip-count SCEV associated with the
+ // loop. The loop would otherwise not be deleted even if it becomes empty.
+ SE->forgetLoop(CurLoop);
+}
+
void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
Instruction *CntInst,
PHINode *CntPhi, Value *Var) {
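
The loop shape this new idiom recognizer rewrites, and the closed form it installs, in plain C++ (not part of the patch; __builtin_clz is the GCC/Clang builtin standing in for the llvm.ctlz intrinsic, the shift is simplified to unsigned, and X must be non-zero, matching the idiom's precondition):

#include <cassert>
#include <cstdint>

// Count iterations of "do { cnt++; x >>= 1; } while (x != 0);".
static unsigned countByLoop(uint32_t X) {
  unsigned Cnt = 0;
  do {
    ++Cnt;
    X >>= 1; // x.next = x >> 1
  } while (X != 0);
  return Cnt;
}

int main() {
  for (uint32_t X : {1u, 2u, 3u, 0x80u, 0xdeadbeefu, 0xffffffffu}) {
    unsigned ClosedForm = 32 - __builtin_clz(X); // Count = BitWidth - CTLZ(InitX)
    assert(countByLoop(X) == ClosedForm);
  }
}
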
diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp
index 3c9850b156ac..5e0a705782ea 100644
--- a/lib/Transforms/Scalar/NewGVN.cpp
+++ b/lib/Transforms/Scalar/NewGVN.cpp
@@ -283,7 +283,6 @@ public:
// Forward propagation info
const Expression *getDefiningExpr() const { return DefiningExpr; }
- void setDefiningExpr(const Expression *E) { DefiningExpr = E; }
// Value member set
bool empty() const { return Members.empty(); }
@@ -317,6 +316,9 @@ public:
--StoreCount;
}
+ // True if this class has no memory members.
+ bool definesNoMemory() const { return StoreCount == 0 && memory_empty(); }
+
// Return true if two congruence classes are equivalent to each other. This
// means
// that every field but the ID number and the dead field are equivalent.
@@ -401,9 +403,12 @@ class NewGVN {
MemorySSAWalker *MSSAWalker;
const DataLayout &DL;
std::unique_ptr<PredicateInfo> PredInfo;
- BumpPtrAllocator ExpressionAllocator;
- ArrayRecycler<Value *> ArgRecycler;
- TarjanSCC SCCFinder;
+
+ // These are the only two things the create* functions should have
+ // side-effects on due to allocating memory.
+ mutable BumpPtrAllocator ExpressionAllocator;
+ mutable ArrayRecycler<Value *> ArgRecycler;
+ mutable TarjanSCC SCCFinder;
const SimplifyQuery SQ;
// Number of function arguments, used by ranking
@@ -430,11 +435,12 @@ class NewGVN {
// In order to correctly ensure propagation, we must keep track of what
// comparisons we used, so that when the values of the comparisons change, we
// propagate the information to the places we used the comparison.
-  DenseMap<const Value *, SmallPtrSet<Instruction *, 2>> PredicateToUsers;
+  mutable DenseMap<const Value *, SmallPtrSet<Instruction *, 2>>
+      PredicateToUsers;
   // Mapping from MemoryAccess we used to the MemoryAccess we used it with. Has
   // the same reasoning as PredicateToUsers. When we skip MemoryAccesses for
// stores, we no longer can rely solely on the def-use chains of MemorySSA.
- DenseMap<const MemoryAccess *, SmallPtrSet<MemoryAccess *, 2>> MemoryToUsers;
+ mutable DenseMap<const MemoryAccess *, SmallPtrSet<MemoryAccess *, 2>>
+ MemoryToUsers;
// A table storing which memorydefs/phis represent a memory state provably
// equivalent to another memory state.
@@ -457,7 +463,7 @@ class NewGVN {
DenseMap<const MemoryPhi *, MemoryPhiState> MemoryPhiState;
enum PhiCycleState { PCS_Unknown, PCS_CycleFree, PCS_Cycle };
- DenseMap<const PHINode *, PhiCycleState> PhiCycleState;
+ mutable DenseMap<const PHINode *, PhiCycleState> PhiCycleState;
// Expression to class mapping.
using ExpressionClassMap = DenseMap<const Expression *, CongruenceClass *>;
ExpressionClassMap ExpressionToClass;
@@ -511,21 +517,24 @@ public:
private:
// Expression handling.
- const Expression *createExpression(Instruction *);
- const Expression *createBinaryExpression(unsigned, Type *, Value *, Value *);
- PHIExpression *createPHIExpression(Instruction *, bool &HasBackedge,
- bool &AllConstant);
- const VariableExpression *createVariableExpression(Value *);
- const ConstantExpression *createConstantExpression(Constant *);
- const Expression *createVariableOrConstant(Value *V);
- const UnknownExpression *createUnknownExpression(Instruction *);
+ const Expression *createExpression(Instruction *) const;
+ const Expression *createBinaryExpression(unsigned, Type *, Value *,
+ Value *) const;
+ PHIExpression *createPHIExpression(Instruction *, bool &HasBackEdge,
+ bool &AllConstant) const;
+ const VariableExpression *createVariableExpression(Value *) const;
+ const ConstantExpression *createConstantExpression(Constant *) const;
+ const Expression *createVariableOrConstant(Value *V) const;
+ const UnknownExpression *createUnknownExpression(Instruction *) const;
const StoreExpression *createStoreExpression(StoreInst *,
- const MemoryAccess *);
+ const MemoryAccess *) const;
LoadExpression *createLoadExpression(Type *, Value *, LoadInst *,
- const MemoryAccess *);
- const CallExpression *createCallExpression(CallInst *, const MemoryAccess *);
- const AggregateValueExpression *createAggregateValueExpression(Instruction *);
- bool setBasicExpressionInfo(Instruction *, BasicExpression *);
+ const MemoryAccess *) const;
+ const CallExpression *createCallExpression(CallInst *,
+ const MemoryAccess *) const;
+ const AggregateValueExpression *
+ createAggregateValueExpression(Instruction *) const;
+ bool setBasicExpressionInfo(Instruction *, BasicExpression *) const;
// Congruence class handling.
CongruenceClass *createCongruenceClass(Value *Leader, const Expression *E) {
@@ -560,17 +569,18 @@ private:
// Symbolic evaluation.
const Expression *checkSimplificationResults(Expression *, Instruction *,
- Value *);
- const Expression *performSymbolicEvaluation(Value *);
+ Value *) const;
+ const Expression *performSymbolicEvaluation(Value *) const;
const Expression *performSymbolicLoadCoercion(Type *, Value *, LoadInst *,
- Instruction *, MemoryAccess *);
- const Expression *performSymbolicLoadEvaluation(Instruction *);
- const Expression *performSymbolicStoreEvaluation(Instruction *);
- const Expression *performSymbolicCallEvaluation(Instruction *);
- const Expression *performSymbolicPHIEvaluation(Instruction *);
- const Expression *performSymbolicAggrValueEvaluation(Instruction *);
- const Expression *performSymbolicCmpEvaluation(Instruction *);
- const Expression *performSymbolicPredicateInfoEvaluation(Instruction *);
+ Instruction *,
+ MemoryAccess *) const;
+ const Expression *performSymbolicLoadEvaluation(Instruction *) const;
+ const Expression *performSymbolicStoreEvaluation(Instruction *) const;
+ const Expression *performSymbolicCallEvaluation(Instruction *) const;
+ const Expression *performSymbolicPHIEvaluation(Instruction *) const;
+ const Expression *performSymbolicAggrValueEvaluation(Instruction *) const;
+ const Expression *performSymbolicCmpEvaluation(Instruction *) const;
+ const Expression *performSymbolicPredicateInfoEvaluation(Instruction *) const;
// Congruence finding.
bool someEquivalentDominates(const Instruction *, const Instruction *) const;
@@ -620,8 +630,8 @@ private:
void markPredicateUsersTouched(Instruction *);
void markValueLeaderChangeTouched(CongruenceClass *CC);
void markMemoryLeaderChangeTouched(CongruenceClass *CC);
- void addPredicateUsers(const PredicateBase *, Instruction *);
- void addMemoryUsers(const MemoryAccess *To, MemoryAccess *U);
+ void addPredicateUsers(const PredicateBase *, Instruction *) const;
+ void addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) const;
// Main loop of value numbering
void iterateTouchedInstructions();
@@ -634,7 +644,7 @@ private:
void verifyIterationSettled(Function &F);
bool singleReachablePHIPath(const MemoryAccess *, const MemoryAccess *) const;
BasicBlock *getBlockForValue(Value *V) const;
- void deleteExpression(const Expression *E);
+ void deleteExpression(const Expression *E) const;
unsigned InstrToDFSNum(const Value *V) const {
assert(isa<Instruction>(V) && "This should not be used for MemoryAccesses");
return InstrDFS.lookup(V);
@@ -654,7 +664,7 @@ private:
? InstrToDFSNum(cast<MemoryUseOrDef>(MA)->getMemoryInst())
: InstrDFS.lookup(MA);
}
- bool isCycleFree(const PHINode *PN);
+ bool isCycleFree(const PHINode *PN) const;
template <class T, class Range> T *getMinDFSOfRange(const Range &) const;
// Debug counter info. When verifying, we have to reset the value numbering
// debug counter to the same state it started in to get the same results.
@@ -702,7 +712,7 @@ BasicBlock *NewGVN::getBlockForValue(Value *V) const {
// Delete a definitely dead expression, so it can be reused by the expression
// allocator. Some of these are not in creation functions, so we have to accept
// const versions.
-void NewGVN::deleteExpression(const Expression *E) {
+void NewGVN::deleteExpression(const Expression *E) const {
assert(isa<BasicExpression>(E));
auto *BE = cast<BasicExpression>(E);
const_cast<BasicExpression *>(BE)->deallocateOperands(ArgRecycler);
@@ -710,7 +720,7 @@ void NewGVN::deleteExpression(const Expression *E) {
}
PHIExpression *NewGVN::createPHIExpression(Instruction *I, bool &HasBackedge,
- bool &AllConstant) {
+ bool &AllConstant) const {
BasicBlock *PHIBlock = I->getParent();
auto *PN = cast<PHINode>(I);
auto *E =
@@ -722,30 +732,46 @@ PHIExpression *NewGVN::createPHIExpression(Instruction *I, bool &HasBackedge,
unsigned PHIRPO = RPOOrdering.lookup(DT->getNode(PHIBlock));
+ // NewGVN assumes the operands of a PHI node are in a consistent order across
+ // PHIs. LLVM doesn't seem to always guarantee this. While we need to fix
+ // this in LLVM at some point we don't want GVN to find wrong congruences.
+ // Therefore, here we sort uses in predecessor order.
+  // We're sorting the values by pointer. In theory this might be a source of
+  // non-determinism, but here we don't rely on the ordering for anything
+ // significant, e.g. we don't create new instructions based on it so we're
+ // fine.
+ SmallVector<const Use *, 4> PHIOperands;
+ for (const Use &U : PN->operands())
+ PHIOperands.push_back(&U);
+ std::sort(PHIOperands.begin(), PHIOperands.end(),
+ [&](const Use *U1, const Use *U2) {
+ return PN->getIncomingBlock(*U1) < PN->getIncomingBlock(*U2);
+ });
+
// Filter out unreachable phi operands.
- auto Filtered = make_filter_range(PN->operands(), [&](const Use &U) {
- return ReachableEdges.count({PN->getIncomingBlock(U), PHIBlock});
+ auto Filtered = make_filter_range(PHIOperands, [&](const Use *U) {
+ return ReachableEdges.count({PN->getIncomingBlock(*U), PHIBlock});
});
std::transform(Filtered.begin(), Filtered.end(), op_inserter(E),
- [&](const Use &U) -> Value * {
- auto *BB = PN->getIncomingBlock(U);
+ [&](const Use *U) -> Value * {
+ auto *BB = PN->getIncomingBlock(*U);
auto *DTN = DT->getNode(BB);
if (RPOOrdering.lookup(DTN) >= PHIRPO)
HasBackedge = true;
- AllConstant &= isa<UndefValue>(U) || isa<Constant>(U);
+ AllConstant &= isa<UndefValue>(*U) || isa<Constant>(*U);
// Don't try to transform self-defined phis.
- if (U == PN)
+ if (*U == PN)
return PN;
- return lookupOperandLeader(U);
+ return lookupOperandLeader(*U);
});
return E;
}
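
The predecessor-order sort above, reduced to its essentials in plain C++ (not part of the patch; an int id stands in for the BasicBlock pointer being compared): two PHIs with the same incoming (block, value) pairs end up with identical operand order, so they produce the same expression.

#include <algorithm>
#include <cassert>
#include <cstring>
#include <vector>

int main() {
  struct Incoming { int Block; const char *Value; };
  std::vector<Incoming> PhiA = {{2, "%y"}, {1, "%x"}};
  std::vector<Incoming> PhiB = {{1, "%x"}, {2, "%y"}}; // same pairs, swapped
  auto ByBlock = [](const Incoming &L, const Incoming &R) {
    return L.Block < R.Block;
  };
  std::sort(PhiA.begin(), PhiA.end(), ByBlock);
  std::sort(PhiB.begin(), PhiB.end(), ByBlock);
  for (size_t I = 0; I != PhiA.size(); ++I) {
    assert(PhiA[I].Block == PhiB[I].Block);
    assert(std::strcmp(PhiA[I].Value, PhiB[I].Value) == 0);
  }
}
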
// Set basic expression info (Arguments, type, opcode) for Expression
// E from Instruction I in block B.
-bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E) {
+bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E) const {
bool AllConstant = true;
if (auto *GEP = dyn_cast<GetElementPtrInst>(I))
E->setType(GEP->getSourceElementType());
@@ -766,7 +792,8 @@ bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E) {
}
const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T,
- Value *Arg1, Value *Arg2) {
+ Value *Arg1,
+ Value *Arg2) const {
auto *E = new (ExpressionAllocator) BasicExpression(2);
E->setType(T);
@@ -795,7 +822,8 @@ const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T,
// TODO: Once finished, this should not take an Instruction, we only
// use it for printing.
const Expression *NewGVN::checkSimplificationResults(Expression *E,
- Instruction *I, Value *V) {
+ Instruction *I,
+ Value *V) const {
if (!V)
return nullptr;
if (auto *C = dyn_cast<Constant>(V)) {
@@ -827,7 +855,7 @@ const Expression *NewGVN::checkSimplificationResults(Expression *E,
return nullptr;
}
-const Expression *NewGVN::createExpression(Instruction *I) {
+const Expression *NewGVN::createExpression(Instruction *I) const {
auto *E = new (ExpressionAllocator) BasicExpression(I->getNumOperands());
bool AllConstant = setBasicExpressionInfo(I, E);
@@ -913,7 +941,7 @@ const Expression *NewGVN::createExpression(Instruction *I) {
}
const AggregateValueExpression *
-NewGVN::createAggregateValueExpression(Instruction *I) {
+NewGVN::createAggregateValueExpression(Instruction *I) const {
if (auto *II = dyn_cast<InsertValueInst>(I)) {
auto *E = new (ExpressionAllocator)
AggregateValueExpression(I->getNumOperands(), II->getNumIndices());
@@ -932,32 +960,32 @@ NewGVN::createAggregateValueExpression(Instruction *I) {
llvm_unreachable("Unhandled type of aggregate value operation");
}
-const VariableExpression *NewGVN::createVariableExpression(Value *V) {
+const VariableExpression *NewGVN::createVariableExpression(Value *V) const {
auto *E = new (ExpressionAllocator) VariableExpression(V);
E->setOpcode(V->getValueID());
return E;
}
-const Expression *NewGVN::createVariableOrConstant(Value *V) {
+const Expression *NewGVN::createVariableOrConstant(Value *V) const {
if (auto *C = dyn_cast<Constant>(V))
return createConstantExpression(C);
return createVariableExpression(V);
}
-const ConstantExpression *NewGVN::createConstantExpression(Constant *C) {
+const ConstantExpression *NewGVN::createConstantExpression(Constant *C) const {
auto *E = new (ExpressionAllocator) ConstantExpression(C);
E->setOpcode(C->getValueID());
return E;
}
-const UnknownExpression *NewGVN::createUnknownExpression(Instruction *I) {
+const UnknownExpression *NewGVN::createUnknownExpression(Instruction *I) const {
auto *E = new (ExpressionAllocator) UnknownExpression(I);
E->setOpcode(I->getOpcode());
return E;
}
-const CallExpression *NewGVN::createCallExpression(CallInst *CI,
- const MemoryAccess *MA) {
+const CallExpression *
+NewGVN::createCallExpression(CallInst *CI, const MemoryAccess *MA) const {
// FIXME: Add operand bundles for calls.
auto *E =
new (ExpressionAllocator) CallExpression(CI->getNumOperands(), CI, MA);
@@ -1017,9 +1045,8 @@ Value *NewGVN::lookupOperandLeader(Value *V) const {
const MemoryAccess *NewGVN::lookupMemoryLeader(const MemoryAccess *MA) const {
auto *CC = getMemoryClass(MA);
assert(CC->getMemoryLeader() &&
- "Every MemoryAccess should be mapped to a "
- "congruence class with a represenative memory "
- "access");
+ "Every MemoryAccess should be mapped to a congruence class with a "
+ "representative memory access");
return CC->getMemoryLeader();
}
@@ -1032,7 +1059,7 @@ bool NewGVN::isMemoryAccessTop(const MemoryAccess *MA) const {
LoadExpression *NewGVN::createLoadExpression(Type *LoadType, Value *PointerOp,
LoadInst *LI,
- const MemoryAccess *MA) {
+ const MemoryAccess *MA) const {
auto *E =
new (ExpressionAllocator) LoadExpression(1, LI, lookupMemoryLeader(MA));
E->allocateOperands(ArgRecycler, ExpressionAllocator);
@@ -1050,8 +1077,8 @@ LoadExpression *NewGVN::createLoadExpression(Type *LoadType, Value *PointerOp,
return E;
}
-const StoreExpression *NewGVN::createStoreExpression(StoreInst *SI,
- const MemoryAccess *MA) {
+const StoreExpression *
+NewGVN::createStoreExpression(StoreInst *SI, const MemoryAccess *MA) const {
auto *StoredValueLeader = lookupOperandLeader(SI->getValueOperand());
auto *E = new (ExpressionAllocator)
StoreExpression(SI->getNumOperands(), SI, StoredValueLeader, MA);
@@ -1068,7 +1095,7 @@ const StoreExpression *NewGVN::createStoreExpression(StoreInst *SI,
return E;
}
-const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) {
+const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) const {
// Unlike loads, we never try to eliminate stores, so we do not check if they
// are simple and avoid value numbering them.
auto *SI = cast<StoreInst>(I);
@@ -1126,7 +1153,7 @@ const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) {
const Expression *
NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,
LoadInst *LI, Instruction *DepInst,
- MemoryAccess *DefiningAccess) {
+ MemoryAccess *DefiningAccess) const {
assert((!LI || LI->isSimple()) && "Not a simple load");
if (auto *DepSI = dyn_cast<StoreInst>(DepInst)) {
// Can't forward from non-atomic to atomic without violating memory model.
@@ -1201,7 +1228,7 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,
return nullptr;
}
-const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) {
+const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) const {
auto *LI = cast<LoadInst>(I);
// We can eliminate in favor of non-simple loads, but we won't be able to
@@ -1239,7 +1266,7 @@ const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) {
}
const Expression *
-NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) {
+NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const {
auto *PI = PredInfo->getPredicateInfoFor(I);
if (!PI)
return nullptr;
@@ -1284,7 +1311,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) {
return nullptr;
if (CopyOf != Cmp->getOperand(0) && CopyOf != Cmp->getOperand(1)) {
- DEBUG(dbgs() << "Copy is not of any condition operands!");
+ DEBUG(dbgs() << "Copy is not of any condition operands!\n");
return nullptr;
}
Value *FirstOp = lookupOperandLeader(Cmp->getOperand(0));
@@ -1329,7 +1356,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) {
}
// Evaluate read only and pure calls, and create an expression result.
-const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I) {
+const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I) const {
auto *CI = cast<CallInst>(I);
if (auto *II = dyn_cast<IntrinsicInst>(I)) {
// Instrinsics with the returned attribute are copies of arguments.
@@ -1366,8 +1393,7 @@ bool NewGVN::setMemoryClass(const MemoryAccess *From,
DEBUG(dbgs() << "Setting " << *From);
DEBUG(dbgs() << " equivalent to congruence class ");
DEBUG(dbgs() << NewClass->getID() << " with current MemoryAccess leader ");
- DEBUG(dbgs() << *NewClass->getMemoryLeader());
- DEBUG(dbgs() << "\n");
+ DEBUG(dbgs() << *NewClass->getMemoryLeader() << "\n");
auto LookupResult = MemoryAccessToClass.find(From);
bool Changed = false;
@@ -1381,7 +1407,7 @@ bool NewGVN::setMemoryClass(const MemoryAccess *From,
NewClass->memory_insert(MP);
// This may have killed the class if it had no non-memory members
if (OldClass->getMemoryLeader() == From) {
- if (OldClass->memory_empty()) {
+ if (OldClass->definesNoMemory()) {
OldClass->setMemoryLeader(nullptr);
} else {
OldClass->setMemoryLeader(getNextMemoryLeader(OldClass));
@@ -1406,7 +1432,7 @@ bool NewGVN::setMemoryClass(const MemoryAccess *From,
// Determine if a phi is cycle-free. That means the values in the phi don't
// depend on any expressions that can change value as a result of the phi.
// For example, a non-cycle free phi would be v = phi(0, v+1).
-bool NewGVN::isCycleFree(const PHINode *PN) {
+bool NewGVN::isCycleFree(const PHINode *PN) const {
// In order to compute cycle-freeness, we do SCC finding on the phi, and see
// what kind of SCC it ends up in. If it is a singleton, it is cycle-free.
// If it is not in a singleton, it is only cycle free if the other members are
@@ -1436,7 +1462,7 @@ bool NewGVN::isCycleFree(const PHINode *PN) {
}
// Evaluate PHI nodes symbolically, and create an expression result.
-const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) {
+const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) const {
// True if one of the incoming phi edges is a backedge.
bool HasBackedge = false;
// All constant tracks the state of whether all the *original* phi operands
@@ -1510,7 +1536,8 @@ const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) {
return E;
}
-const Expression *NewGVN::performSymbolicAggrValueEvaluation(Instruction *I) {
+const Expression *
+NewGVN::performSymbolicAggrValueEvaluation(Instruction *I) const {
if (auto *EI = dyn_cast<ExtractValueInst>(I)) {
auto *II = dyn_cast<IntrinsicInst>(EI->getAggregateOperand());
if (II && EI->getNumIndices() == 1 && *EI->idx_begin() == 0) {
@@ -1548,7 +1575,7 @@ const Expression *NewGVN::performSymbolicAggrValueEvaluation(Instruction *I) {
return createAggregateValueExpression(I);
}
-const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) {
+const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) const {
auto *CI = dyn_cast<CmpInst>(I);
// See if our operands are equal to those of a previous predicate, and if so,
// if it implies true or false.
@@ -1663,7 +1690,7 @@ const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) {
}
// Substitute and symbolize the value before value numbering.
-const Expression *NewGVN::performSymbolicEvaluation(Value *V) {
+const Expression *NewGVN::performSymbolicEvaluation(Value *V) const {
const Expression *E = nullptr;
if (auto *C = dyn_cast<Constant>(V))
E = createConstantExpression(C);
@@ -1749,7 +1776,7 @@ void NewGVN::markUsersTouched(Value *V) {
}
}
-void NewGVN::addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) {
+void NewGVN::addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) const {
DEBUG(dbgs() << "Adding memory user " << *U << " to " << *To << "\n");
MemoryToUsers[To].insert(U);
}
@@ -1772,7 +1799,7 @@ void NewGVN::markMemoryUsersTouched(const MemoryAccess *MA) {
}
// Add I to the set of users of a given predicate.
-void NewGVN::addPredicateUsers(const PredicateBase *PB, Instruction *I) {
+void NewGVN::addPredicateUsers(const PredicateBase *PB, Instruction *I) const {
if (auto *PBranch = dyn_cast<PredicateBranch>(PB))
PredicateToUsers[PBranch->Condition].insert(I);
else if (auto *PAssume = dyn_cast<PredicateBranch>(PB))
@@ -1825,8 +1852,7 @@ const MemoryAccess *NewGVN::getNextMemoryLeader(CongruenceClass *CC) const {
// TODO: If this ends up too slow, we can maintain a next memory leader like we
// do for regular leaders.
// Make sure there will be a leader to find
- assert((CC->getStoreCount() > 0 || !CC->memory_empty()) &&
- "Can't get next leader if there is none");
+ assert(!CC->definesNoMemory() && "Can't get next leader if there is none");
if (CC->getStoreCount() > 0) {
if (auto *NL = dyn_cast_or_null<StoreInst>(CC->getNextLeader().first))
return MSSA->getMemoryAccess(NL);
@@ -1898,7 +1924,7 @@ void NewGVN::moveMemoryToNewCongruenceClass(Instruction *I,
setMemoryClass(InstMA, NewClass);
// Now, fixup the old class if necessary
if (OldClass->getMemoryLeader() == InstMA) {
- if (OldClass->getStoreCount() != 0 || !OldClass->memory_empty()) {
+ if (!OldClass->definesNoMemory()) {
OldClass->setMemoryLeader(getNextMemoryLeader(OldClass));
DEBUG(dbgs() << "Memory class leader change for class "
<< OldClass->getID() << " to "
@@ -1956,10 +1982,9 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
if (NewClass->getStoreCount() == 0 && !NewClass->getStoredValue()) {
// If it's a store expression we are using, it means we are not equivalent
// to something earlier.
- if (isa<StoreExpression>(E)) {
- assert(lookupOperandLeader(SI->getValueOperand()) !=
- NewClass->getLeader());
- NewClass->setStoredValue(lookupOperandLeader(SI->getValueOperand()));
+ if (auto *SE = dyn_cast<StoreExpression>(E)) {
+ assert(SE->getStoredValue() != NewClass->getLeader());
+ NewClass->setStoredValue(SE->getStoredValue());
markValueLeaderChangeTouched(NewClass);
// Shift the new class leader to be the store
DEBUG(dbgs() << "Changing leader of congruence class "
@@ -1985,7 +2010,7 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
// See if we destroyed the class or need to swap leaders.
if (OldClass->empty() && OldClass != TOPClass) {
if (OldClass->getDefiningExpr()) {
- DEBUG(dbgs() << "Erasing expression " << OldClass->getDefiningExpr()
+ DEBUG(dbgs() << "Erasing expression " << *OldClass->getDefiningExpr()
<< " from table\n");
ExpressionToClass.erase(OldClass->getDefiningExpr());
}
@@ -2064,7 +2089,7 @@ void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) {
} else if (const auto *SE = dyn_cast<StoreExpression>(E)) {
StoreInst *SI = SE->getStoreInst();
NewClass->setLeader(SI);
- NewClass->setStoredValue(lookupOperandLeader(SI->getValueOperand()));
+ NewClass->setStoredValue(SE->getStoredValue());
// The RepMemoryAccess field will be filled in properly by the
// moveValueToNewCongruenceClass call.
} else {
@@ -2523,6 +2548,19 @@ void NewGVN::verifyMemoryCongruency() const {
return false;
if (auto *MemDef = dyn_cast<MemoryDef>(Pair.first))
return !isInstructionTriviallyDead(MemDef->getMemoryInst());
+
+ // We could have phi nodes whose operands are all trivially dead,
+ // so we don't process them.
+ if (auto *MemPHI = dyn_cast<MemoryPhi>(Pair.first)) {
+ for (auto &U : MemPHI->incoming_values()) {
+ if (Instruction *I = dyn_cast<Instruction>(U.get())) {
+ if (!isInstructionTriviallyDead(I))
+ return true;
+ }
+ }
+ return false;
+ }
+
return true;
};
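
The hunks above fold the repeated compound check on store count and memory
members into a single CongruenceClass::definesNoMemory() query. The helper's
definition falls outside the context shown in this diff; inferred from its
call sites, it presumably reduces to the following sketch:

    // Hypothetical reconstruction of the helper used above; the real
    // definition lives with CongruenceClass, outside the hunks shown here.
    bool definesNoMemory() const {
      return getStoreCount() == 0 && memory_empty();
    }
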
diff --git a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index fb1b47c48276..4f608c97147d 100644
--- a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -55,7 +55,7 @@ static void replaceLoopUsesWithConstant(Loop &L, Value &LIC,
/// Update the dominator tree after removing one exiting predecessor of a loop
/// exit block.
static void updateLoopExitIDom(BasicBlock *LoopExitBB, Loop &L,
- DominatorTree &DT) {
+ DominatorTree &DT) {
assert(pred_begin(LoopExitBB) != pred_end(LoopExitBB) &&
"Cannot have empty predecessors of the loop exit block if we split "
"off a block to unswitch!");
@@ -137,6 +137,98 @@ static void updateDTAfterUnswitch(BasicBlock *UnswitchedBB, BasicBlock *OldPH,
}
}
+/// Check that all the LCSSA PHI nodes in the loop exit block have
+/// loop-invariant incoming values along this edge.
+static bool areLoopExitPHIsLoopInvariant(Loop &L, BasicBlock &ExitingBB,
+ BasicBlock &ExitBB) {
+ for (Instruction &I : ExitBB) {
+ auto *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
+ // No more PHIs to check.
+ return true;
+
+ // If the incoming value for this edge isn't loop invariant the unswitch
+ // won't be trivial.
+ if (!L.isLoopInvariant(PN->getIncomingValueForBlock(&ExitingBB)))
+ return false;
+ }
+ llvm_unreachable("Basic blocks should never be empty!");
+}
+
+/// Rewrite the PHI nodes in an unswitched loop exit basic block.
+///
+/// Requires that the loop exit and unswitched basic block are the same, and
+/// that the exiting block was a unique predecessor of that block. Rewrites the
+/// PHI nodes in that block such that what were LCSSA PHI nodes become trivial
+/// PHI nodes from the old preheader that now contains the unswitched
+/// terminator.
+static void rewritePHINodesForUnswitchedExitBlock(BasicBlock &UnswitchedBB,
+ BasicBlock &OldExitingBB,
+ BasicBlock &OldPH) {
+ for (Instruction &I : UnswitchedBB) {
+ auto *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
+ // No more PHIs to check.
+ break;
+
+ // When the loop exit is directly unswitched we just need to update the
+ // incoming basic block. We loop to handle weird cases with repeated
+ // incoming blocks, but typically expect only one operand here.
+ for (auto i : llvm::seq<int>(0, PN->getNumOperands())) {
+ assert(PN->getIncomingBlock(i) == &OldExitingBB &&
+ "Found incoming block different from unique predecessor!");
+ PN->setIncomingBlock(i, &OldPH);
+ }
+ }
+}
+
+/// Rewrite the PHI nodes in the loop exit basic block and the split off
+/// unswitched block.
+///
+/// Because the exit block remains an exit from the loop, this rewrites the
+/// LCSSA PHI nodes in it to remove the unswitched edge and introduces PHI
+/// nodes into the unswitched basic block to select between the value in the
+/// old preheader and the loop exit.
+static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB,
+ BasicBlock &UnswitchedBB,
+ BasicBlock &OldExitingBB,
+ BasicBlock &OldPH) {
+ assert(&ExitBB != &UnswitchedBB &&
+ "Must have different loop exit and unswitched blocks!");
+ Instruction *InsertPt = &*UnswitchedBB.begin();
+ for (Instruction &I : ExitBB) {
+ auto *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
+ // No more PHIs to check.
+ break;
+
+ auto *NewPN = PHINode::Create(PN->getType(), /*NumReservedValues*/ 2,
+ PN->getName() + ".split", InsertPt);
+
+ // Walk backwards over the old PHI node's inputs to minimize the cost of
+ // removing each one. We have to do this loop manually so that the new PHI
+ // gets one incoming entry per old edge: in some cases several switch cases
+ // reach the same unswitched block, and each such edge needs its own entry
+ // in the PHI.
+ // FIXME: This is really, really gross. It would be much cleaner if LLVM
+ // allowed us to create a single entry for a predecessor block without
+ // having separate entries for each "edge" even though these edges are
+ // required to produce identical results.
+ for (int i = PN->getNumIncomingValues() - 1; i >= 0; --i) {
+ if (PN->getIncomingBlock(i) != &OldExitingBB)
+ continue;
+
+ Value *Incoming = PN->removeIncomingValue(i);
+ NewPN->addIncoming(Incoming, &OldPH);
+ }
+
+ // Now replace the old PHI with the new one and wire the old one in as an
+ // input to the new one.
+ PN->replaceAllUsesWith(NewPN);
+ NewPN->addIncoming(PN, &ExitBB);
+ }
+}
+
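
The backwards walk in rewritePHINodesForExitAndUnswitchedBlocks is
load-bearing: PHINode::removeIncomingValue compacts the operand list, so
iterating from the end keeps the remaining indices valid. A standalone model
of that indexing discipline (containers and types are stand-ins, not LLVM's):

    #include <utility>
    #include <vector>

    // Pairs model a PHI's (incoming-block, incoming-value) operands. Erasing
    // by index is only stable when walking from the back, because erase
    // shifts every later entry down by one, which is the same reason the
    // loop above visits the PHI operands in reverse.
    std::vector<int> extractIncomingFrom(std::vector<std::pair<int, int>> &Ops,
                                         int ExitingBB) {
      std::vector<int> Moved;
      for (int i = static_cast<int>(Ops.size()) - 1; i >= 0; --i) {
        if (Ops[i].first != ExitingBB)
          continue;
        Moved.push_back(Ops[i].second); // mirrors removeIncomingValue(i)
        Ops.erase(Ops.begin() + i);
      }
      return Moved;
    }
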
/// Unswitch a trivial branch if the condition is loop invariant.
///
/// This routine should only be called when loop code leading to the branch has
@@ -187,10 +279,8 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
assert(L.contains(ContinueBB) &&
"Cannot have both successors exit and still be in the loop!");
- // If the loop exit block contains phi nodes, this isn't trivial.
- // FIXME: We should examine the PHI to determine whether or not we can handle
- // it trivially.
- if (isa<PHINode>(LoopExitBB->begin()))
+ auto *ParentBB = BI.getParent();
+ if (!areLoopExitPHIsLoopInvariant(L, *ParentBB, *LoopExitBB))
return false;
DEBUG(dbgs() << " unswitching trivial branch when: " << CondVal
@@ -209,14 +299,13 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
BasicBlock *UnswitchedBB;
if (BasicBlock *PredBB = LoopExitBB->getUniquePredecessor()) {
(void)PredBB;
- assert(PredBB == BI.getParent() && "A branch's parent is't a predecessor!");
+ assert(PredBB == BI.getParent() &&
+ "A branch's parent isn't a predecessor!");
UnswitchedBB = LoopExitBB;
} else {
UnswitchedBB = SplitBlock(LoopExitBB, &LoopExitBB->front(), &DT, &LI);
}
- BasicBlock *ParentBB = BI.getParent();
-
// Now splice the branch to gate reaching the new preheader and re-point its
// successors.
OldPH->getInstList().splice(std::prev(OldPH->end()),
@@ -229,6 +318,13 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
// terminator.
BranchInst::Create(ContinueBB, ParentBB);
+ // Rewrite the relevant PHI nodes.
+ if (UnswitchedBB == LoopExitBB)
+ rewritePHINodesForUnswitchedExitBlock(*UnswitchedBB, *ParentBB, *OldPH);
+ else
+ rewritePHINodesForExitAndUnswitchedBlocks(*LoopExitBB, *UnswitchedBB,
+ *ParentBB, *OldPH);
+
// Now we need to update the dominator tree.
updateDTAfterUnswitch(UnswitchedBB, OldPH, DT);
// But if we split something off of the loop exit block then we also removed
@@ -278,6 +374,8 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
if (!L.isLoopInvariant(LoopCond))
return false;
+ auto *ParentBB = SI.getParent();
+
// FIXME: We should compute this once at the start and update it!
SmallVector<BasicBlock *, 16> ExitBlocks;
L.getExitBlocks(ExitBlocks);
@@ -287,12 +385,13 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
SmallVector<int, 4> ExitCaseIndices;
for (auto Case : SI.cases()) {
auto *SuccBB = Case.getCaseSuccessor();
- if (ExitBlockSet.count(SuccBB) && !isa<PHINode>(SuccBB->begin()))
+ if (ExitBlockSet.count(SuccBB) &&
+ areLoopExitPHIsLoopInvariant(L, *ParentBB, *SuccBB))
ExitCaseIndices.push_back(Case.getCaseIndex());
}
BasicBlock *DefaultExitBB = nullptr;
if (ExitBlockSet.count(SI.getDefaultDest()) &&
- !isa<PHINode>(SI.getDefaultDest()->begin()) &&
+ areLoopExitPHIsLoopInvariant(L, *ParentBB, *SI.getDefaultDest()) &&
!isa<UnreachableInst>(SI.getDefaultDest()->getTerminator()))
DefaultExitBB = SI.getDefaultDest();
else if (ExitCaseIndices.empty())
@@ -330,7 +429,6 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
if (CommonSuccBB) {
SI.setDefaultDest(CommonSuccBB);
} else {
- BasicBlock *ParentBB = SI.getParent();
BasicBlock *UnreachableBB = BasicBlock::Create(
ParentBB->getContext(),
Twine(ParentBB->getName()) + ".unreachable_default",
@@ -358,30 +456,44 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
// Now add the unswitched switch.
auto *NewSI = SwitchInst::Create(LoopCond, NewPH, ExitCases.size(), OldPH);
- // Split any exit blocks with remaining in-loop predecessors. We walk in
- // reverse so that we split in the same order as the cases appeared. This is
- // purely for convenience of reading the resulting IR, but it doesn't cost
- // anything really.
+ // Rewrite the IR for the unswitched basic blocks. This requires two steps.
+ // First, we split any exit blocks with remaining in-loop predecessors. Then
+ // we update the PHIs in one of two ways depending on whether there was a split.
+ // We walk in reverse so that we split in the same order as the cases
+ // appeared. This is purely for convenience of reading the resulting IR, but
+ // it doesn't cost anything really.
+ SmallPtrSet<BasicBlock *, 2> UnswitchedExitBBs;
SmallDenseMap<BasicBlock *, BasicBlock *, 2> SplitExitBBMap;
// Handle the default exit if necessary.
// FIXME: It'd be great if we could merge this with the loop below but LLVM's
// ranges aren't quite powerful enough yet.
- if (DefaultExitBB && !pred_empty(DefaultExitBB)) {
- auto *SplitBB =
- SplitBlock(DefaultExitBB, &DefaultExitBB->front(), &DT, &LI);
- updateLoopExitIDom(DefaultExitBB, L, DT);
- DefaultExitBB = SplitExitBBMap[DefaultExitBB] = SplitBB;
+ if (DefaultExitBB) {
+ if (pred_empty(DefaultExitBB)) {
+ UnswitchedExitBBs.insert(DefaultExitBB);
+ rewritePHINodesForUnswitchedExitBlock(*DefaultExitBB, *ParentBB, *OldPH);
+ } else {
+ auto *SplitBB =
+ SplitBlock(DefaultExitBB, &DefaultExitBB->front(), &DT, &LI);
+ rewritePHINodesForExitAndUnswitchedBlocks(*DefaultExitBB, *SplitBB,
+ *ParentBB, *OldPH);
+ updateLoopExitIDom(DefaultExitBB, L, DT);
+ DefaultExitBB = SplitExitBBMap[DefaultExitBB] = SplitBB;
+ }
}
// Note that we must use a reference in the for loop so that we update the
// container.
for (auto &CasePair : reverse(ExitCases)) {
// Grab a reference to the exit block in the pair so that we can update it.
- BasicBlock *&ExitBB = CasePair.second;
+ BasicBlock *ExitBB = CasePair.second;
// If this case is the last edge into the exit block, we can simply reuse it
// as it will no longer be a loop exit. No mapping necessary.
- if (pred_empty(ExitBB))
+ if (pred_empty(ExitBB)) {
+ // Only rewrite once.
+ if (UnswitchedExitBBs.insert(ExitBB).second)
+ rewritePHINodesForUnswitchedExitBlock(*ExitBB, *ParentBB, *OldPH);
continue;
+ }
// Otherwise we need to split the exit block so that we retain an exit
// block from the loop and a target for the unswitched condition.
@@ -389,9 +501,12 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
if (!SplitExitBB) {
// If this is the first time we see this, do the split and remember it.
SplitExitBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI);
+ rewritePHINodesForExitAndUnswitchedBlocks(*ExitBB, *SplitExitBB,
+ *ParentBB, *OldPH);
updateLoopExitIDom(ExitBB, L, DT);
}
- ExitBB = SplitExitBB;
+ // Update the case pair to point to the split block.
+ CasePair.second = SplitExitBB;
}
// Now add the unswitched cases. We do this in reverse order as we built them
diff --git a/lib/Transforms/Scalar/SpeculativeExecution.cpp b/lib/Transforms/Scalar/SpeculativeExecution.cpp
index a0fc966cee2c..a7c308b59877 100644
--- a/lib/Transforms/Scalar/SpeculativeExecution.cpp
+++ b/lib/Transforms/Scalar/SpeculativeExecution.cpp
@@ -208,6 +208,47 @@ bool SpeculativeExecutionPass::runOnBasicBlock(BasicBlock &B) {
return false;
}
+static unsigned ComputeSpeculationCost(const Instruction *I,
+ const TargetTransformInfo &TTI) {
+ switch (Operator::getOpcode(I)) {
+ case Instruction::GetElementPtr:
+ case Instruction::Add:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Select:
+ case Instruction::Shl:
+ case Instruction::Sub:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::Xor:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::Call:
+ case Instruction::BitCast:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::AddrSpaceCast:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPExt:
+ case Instruction::FPTrunc:
+ case Instruction::FAdd:
+ case Instruction::FSub:
+ case Instruction::FMul:
+ case Instruction::FDiv:
+ case Instruction::FRem:
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ return TTI.getUserCost(I);
+
+ default:
+ return UINT_MAX; // Disallow anything not whitelisted.
+ }
+}
+
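
UINT_MAX doubles as a "never hoist" sentinel here: considerHoistingFromTo
below only accumulates the cost of instructions that pass the check and
leaves the rest in NotHoisted. A toy model of that contract (names are
hypothetical, not the pass's own):

    #include <climits>
    #include <utility>
    #include <vector>

    // Each pair is (whitelisted?, TTI cost). Non-whitelisted instructions
    // are priced at UINT_MAX and skipped by the accumulator, just as the
    // pass leaves them behind rather than hoisting them.
    unsigned speculationCost(bool Whitelisted, unsigned TTICost) {
      return Whitelisted ? TTICost : UINT_MAX;
    }

    unsigned totalSpeculationCost(
        const std::vector<std::pair<bool, unsigned>> &Insts) {
      unsigned Total = 0;
      for (const auto &I : Insts) {
        unsigned Cost = speculationCost(I.first, I.second);
        if (Cost != UINT_MAX)
          Total += Cost;
      }
      return Total;
    }
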
bool SpeculativeExecutionPass::considerHoistingFromTo(
BasicBlock &FromBlock, BasicBlock &ToBlock) {
SmallSet<const Instruction *, 8> NotHoisted;
@@ -223,7 +264,7 @@ bool SpeculativeExecutionPass::considerHoistingFromTo(
unsigned TotalSpeculationCost = 0;
for (auto& I : FromBlock) {
- const unsigned Cost = TTI->getUserCost(&I);
+ const unsigned Cost = ComputeSpeculationCost(&I, *TTI);
if (Cost != UINT_MAX && isSafeToSpeculativelyExecute(&I) &&
AllPrecedingUsesFromBlockHoisted(&I)) {
TotalSpeculationCost += Cost;
diff --git a/lib/Transforms/Utils/BypassSlowDivision.cpp b/lib/Transforms/Utils/BypassSlowDivision.cpp
index 7ffdad597a9b..83ec7f55d1af 100644
--- a/lib/Transforms/Utils/BypassSlowDivision.cpp
+++ b/lib/Transforms/Utils/BypassSlowDivision.cpp
@@ -261,10 +261,10 @@ ValueRange FastDivInsertionTask::getValueRange(Value *V,
computeKnownBits(V, Known, DL);
- if (Known.Zero.countLeadingOnes() >= HiBits)
+ if (Known.countMinLeadingZeros() >= HiBits)
return VALRNG_KNOWN_SHORT;
- if (Known.One.countLeadingZeros() < HiBits)
+ if (Known.countMaxLeadingZeros() < HiBits)
return VALRNG_LIKELY_LONG;
// Long integer divisions are often used in hashtable implementations. It's
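
This hunk, together with the Local.cpp and LoadStoreVectorizer.cpp hunks
further down, migrates from poking at the raw Zero/One APInts of KnownBits to
named accessors. The correspondence below is reconstructed purely from the
before/after pairs in this patch; it is a sketch, not the real header:

    #include "llvm/ADT/APInt.h"

    // A set bit in Zero/One means that bit is known to be 0/1 respectively.
    struct KnownBitsSketch {
      llvm::APInt Zero, One;
      // Bits guaranteed zero at the top: the leading run of set bits in Zero.
      unsigned countMinLeadingZeros() const { return Zero.countLeadingOnes(); }
      // Bits that could be zero at the top: leading bits not known one.
      unsigned countMaxLeadingZeros() const { return One.countLeadingZeros(); }
      // Bits guaranteed zero at the bottom: trailing run of set bits in Zero.
      unsigned countMinTrailingZeros() const { return Zero.countTrailingOnes(); }
      // Bits that could be one at the bottom: trailing bits not known zero.
      unsigned countMaxTrailingOnes() const { return Zero.countTrailingZeros(); }
    };
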
diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp
index d5124ac89016..4aa26fd14fee 100644
--- a/lib/Transforms/Utils/CloneFunction.cpp
+++ b/lib/Transforms/Utils/CloneFunction.cpp
@@ -41,6 +41,7 @@ BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB,
ValueToValueMapTy &VMap,
const Twine &NameSuffix, Function *F,
ClonedCodeInfo *CodeInfo) {
+ DenseMap<const MDNode *, MDNode *> Cache;
BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "", F);
if (BB->hasName()) NewBB->setName(BB->getName()+NameSuffix);
@@ -50,6 +51,9 @@ BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB,
for (BasicBlock::const_iterator II = BB->begin(), IE = BB->end();
II != IE; ++II) {
Instruction *NewInst = II->clone();
+ if (F && F->getSubprogram())
+ DebugLoc::reparentDebugInfo(*NewInst, BB->getParent()->getSubprogram(),
+ F->getSubprogram(), Cache);
if (II->hasName())
NewInst->setName(II->getName()+NameSuffix);
NewBB->getInstList().push_back(NewInst);
@@ -120,12 +124,28 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
OldFunc->getAllMetadata(MDs);
- for (auto MD : MDs)
- NewFunc->addMetadata(
- MD.first,
- *MapMetadata(MD.second, VMap,
- ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
- TypeMapper, Materializer));
+ for (auto MD : MDs) {
+ MDNode *NewMD;
+ bool MustCloneSP =
+ (MD.first == LLVMContext::MD_dbg && OldFunc->getParent() &&
+ OldFunc->getParent() == NewFunc->getParent());
+ if (MustCloneSP) {
+ auto *SP = cast<DISubprogram>(MD.second);
+ NewMD = DISubprogram::getDistinct(
+ NewFunc->getContext(), SP->getScope(), SP->getName(),
+ NewFunc->getName(), SP->getFile(), SP->getLine(), SP->getType(),
+ SP->isLocalToUnit(), SP->isDefinition(), SP->getScopeLine(),
+ SP->getContainingType(), SP->getVirtuality(), SP->getVirtualIndex(),
+ SP->getThisAdjustment(), SP->getFlags(), SP->isOptimized(),
+ SP->getUnit(), SP->getTemplateParams(), SP->getDeclaration(),
+ SP->getVariables(), SP->getThrownTypes());
+ } else
+ NewMD =
+ MapMetadata(MD.second, VMap,
+ ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
+ TypeMapper, Materializer);
+ NewFunc->addMetadata(MD.first, *NewMD);
+ }
// Loop over all of the basic blocks in the function, cloning them as
// appropriate. Note that we save BE this way in order to handle cloning of
diff --git a/lib/Transforms/Utils/CloneModule.cpp b/lib/Transforms/Utils/CloneModule.cpp
index 4e9d67252d6c..5444b752de82 100644
--- a/lib/Transforms/Utils/CloneModule.cpp
+++ b/lib/Transforms/Utils/CloneModule.cpp
@@ -96,7 +96,7 @@ std::unique_ptr<Module> llvm::CloneModule(
else
GV = new GlobalVariable(
*New, I->getValueType(), false, GlobalValue::ExternalLinkage,
- (Constant *)nullptr, I->getName(), (GlobalVariable *)nullptr,
+ nullptr, I->getName(), nullptr,
I->getThreadLocalMode(), I->getType()->getAddressSpace());
VMap[&*I] = GV;
// We do not copy attributes (mainly because copying between different
diff --git a/lib/Transforms/Utils/EscapeEnumerator.cpp b/lib/Transforms/Utils/EscapeEnumerator.cpp
index 8c2386554da5..78d7474e5b95 100644
--- a/lib/Transforms/Utils/EscapeEnumerator.cpp
+++ b/lib/Transforms/Utils/EscapeEnumerator.cpp
@@ -67,8 +67,7 @@ IRBuilder<> *EscapeEnumerator::Next() {
// Create a cleanup block.
LLVMContext &C = F.getContext();
BasicBlock *CleanupBB = BasicBlock::Create(C, CleanupBBName, &F);
- Type *ExnTy =
- StructType::get(Type::getInt8PtrTy(C), Type::getInt32Ty(C), nullptr);
+ Type *ExnTy = StructType::get(Type::getInt8PtrTy(C), Type::getInt32Ty(C));
if (!F.hasPersonalityFn()) {
Constant *PersFn = getDefaultPersonalityFn(F.getParent());
F.setPersonalityFn(PersFn);
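
This is one of several hunks in the patch (see also ModuleUtils.cpp and
SimplifyLibCalls.cpp below) dropping the trailing nullptr sentinel from
StructType::get in favor of the purely variadic element-type form. Side by
side:

    #include "llvm/IR/DerivedTypes.h"
    using namespace llvm;

    Type *makeExnTy(LLVMContext &C) {
      // Old, removed form: a nullptr sentinel terminated the element list.
      //   StructType::get(Type::getInt8PtrTy(C), Type::getInt32Ty(C), nullptr);
      // New form used throughout this patch: just the element types.
      return StructType::get(Type::getInt8PtrTy(C), Type::getInt32Ty(C));
    }
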
diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp
index 6d56e08af99f..9cb4762b683c 100644
--- a/lib/Transforms/Utils/InlineFunction.cpp
+++ b/lib/Transforms/Utils/InlineFunction.cpp
@@ -1302,41 +1302,6 @@ static bool hasLifetimeMarkers(AllocaInst *AI) {
return false;
}
-/// Rebuild the entire inlined-at chain for this instruction so that the top of
-/// the chain now is inlined-at the new call site.
-static DebugLoc
-updateInlinedAtInfo(const DebugLoc &DL, DILocation *InlinedAtNode,
- LLVMContext &Ctx,
- DenseMap<const DILocation *, DILocation *> &IANodes) {
- SmallVector<DILocation *, 3> InlinedAtLocations;
- DILocation *Last = InlinedAtNode;
- DILocation *CurInlinedAt = DL;
-
- // Gather all the inlined-at nodes
- while (DILocation *IA = CurInlinedAt->getInlinedAt()) {
- // Skip any we've already built nodes for
- if (DILocation *Found = IANodes[IA]) {
- Last = Found;
- break;
- }
-
- InlinedAtLocations.push_back(IA);
- CurInlinedAt = IA;
- }
-
- // Starting from the top, rebuild the nodes to point to the new inlined-at
- // location (then rebuilding the rest of the chain behind it) and update the
- // map of already-constructed inlined-at nodes.
- for (const DILocation *MD : reverse(InlinedAtLocations)) {
- Last = IANodes[MD] = DILocation::getDistinct(
- Ctx, MD->getLine(), MD->getColumn(), MD->getScope(), Last);
- }
-
- // And finally create the normal location for this instruction, referring to
- // the new inlined-at chain.
- return DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(), Last);
-}
-
/// Return the result of AI->isStaticAlloca() if AI were moved to the entry
/// block. Allocas used in inalloca calls and allocas of dynamic array size
/// cannot be static.
@@ -1364,14 +1329,16 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI,
// Cache the inlined-at nodes as they're built so they are reused, without
// this every instruction's inlined-at chain would become distinct from each
// other.
- DenseMap<const DILocation *, DILocation *> IANodes;
+ DenseMap<const MDNode *, MDNode *> IANodes;
for (; FI != Fn->end(); ++FI) {
for (BasicBlock::iterator BI = FI->begin(), BE = FI->end();
BI != BE; ++BI) {
if (DebugLoc DL = BI->getDebugLoc()) {
- BI->setDebugLoc(
- updateInlinedAtInfo(DL, InlinedAtNode, BI->getContext(), IANodes));
+ auto IA = DebugLoc::appendInlinedAt(DL, InlinedAtNode, BI->getContext(),
+ IANodes);
+ auto IDL = DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(), IA);
+ BI->setDebugLoc(IDL);
continue;
}
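
Both the removed updateInlinedAtInfo and its replacement,
DebugLoc::appendInlinedAt, rebuild an instruction's inlined-at chain so the
new call site becomes the outermost link: a chain DL -> ia1 -> ia2 inlined at
call site C becomes DL' -> ia1' -> ia2' -> C. A toy model of that bottom-up
rebuild (simplified types; the real code additionally caches rebuilt nodes in
IANodes so chains are shared between instructions):

    #include <deque>

    // Minimal stand-in for DILocation: a line plus an optional inlined-at
    // link; nullptr marks the outermost frame.
    struct Loc {
      unsigned Line;
      const Loc *InlinedAt;
    };

    // Rebuild Chain so NewSite becomes its outermost inlined-at link. The
    // deque keeps node addresses stable while rebuilt links are appended.
    const Loc *appendInlinedAt(std::deque<Loc> &Storage, const Loc *Chain,
                               const Loc *NewSite) {
      if (!Chain)
        return NewSite;
      const Loc *Tail = appendInlinedAt(Storage, Chain->InlinedAt, NewSite);
      Storage.push_back({Chain->Line, Tail});
      return &Storage.back();
    }
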
@@ -1429,11 +1396,12 @@ static void updateCallerBFI(BasicBlock *CallSiteBlock,
/// Update the branch metadata for cloned call instructions.
static void updateCallProfile(Function *Callee, const ValueToValueMapTy &VMap,
const Optional<uint64_t> &CalleeEntryCount,
- const Instruction *TheCall) {
+ const Instruction *TheCall,
+ ProfileSummaryInfo *PSI) {
if (!CalleeEntryCount.hasValue() || CalleeEntryCount.getValue() < 1)
return;
Optional<uint64_t> CallSiteCount =
- ProfileSummaryInfo::getProfileCount(TheCall, nullptr);
+ PSI ? PSI->getProfileCount(TheCall, nullptr) : None;
uint64_t CallCount =
std::min(CallSiteCount.hasValue() ? CallSiteCount.getValue() : 0,
CalleeEntryCount.getValue());
@@ -1456,16 +1424,16 @@ static void updateCallProfile(Function *Callee, const ValueToValueMapTy &VMap,
/// The callsite's block count is subtracted from the callee's function entry
/// count.
static void updateCalleeCount(BlockFrequencyInfo *CallerBFI, BasicBlock *CallBB,
- Instruction *CallInst, Function *Callee) {
+ Instruction *CallInst, Function *Callee,
+ ProfileSummaryInfo *PSI) {
// If the callee has an original count of N, and the estimated count of
// callsite is M, the new callee count is set to N - M. M is estimated from
// the caller's entry count, its entry block frequency and the block frequency
// of the callsite.
Optional<uint64_t> CalleeCount = Callee->getEntryCount();
- if (!CalleeCount.hasValue())
+ if (!CalleeCount.hasValue() || !PSI)
return;
- Optional<uint64_t> CallCount =
- ProfileSummaryInfo::getProfileCount(CallInst, CallerBFI);
+ Optional<uint64_t> CallCount = PSI->getProfileCount(CallInst, CallerBFI);
if (!CallCount.hasValue())
return;
// Since CallSiteCount is an estimate, it could exceed the original callee
@@ -1668,9 +1636,10 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
updateCallerBFI(OrigBB, VMap, IFI.CallerBFI, IFI.CalleeBFI,
CalledFunc->front());
- updateCallProfile(CalledFunc, VMap, CalledFunc->getEntryCount(), TheCall);
+ updateCallProfile(CalledFunc, VMap, CalledFunc->getEntryCount(), TheCall,
+ IFI.PSI);
// Update the profile count of callee.
- updateCalleeCount(IFI.CallerBFI, OrigBB, TheCall, CalledFunc);
+ updateCalleeCount(IFI.CallerBFI, OrigBB, TheCall, CalledFunc, IFI.PSI);
// Inject byval arguments initialization.
for (std::pair<Value*, Value*> &Init : ByValInit)
diff --git a/lib/Transforms/Utils/InstructionNamer.cpp b/lib/Transforms/Utils/InstructionNamer.cpp
index 8a1973d1db05..53b432fcafd4 100644
--- a/lib/Transforms/Utils/InstructionNamer.cpp
+++ b/lib/Transforms/Utils/InstructionNamer.cpp
@@ -26,16 +26,15 @@ namespace {
InstNamer() : FunctionPass(ID) {
initializeInstNamerPass(*PassRegistry::getPassRegistry());
}
-
+
void getAnalysisUsage(AnalysisUsage &Info) const override {
Info.setPreservesAll();
}
bool runOnFunction(Function &F) override {
- for (Function::arg_iterator AI = F.arg_begin(), AE = F.arg_end();
- AI != AE; ++AI)
- if (!AI->hasName() && !AI->getType()->isVoidTy())
- AI->setName("arg");
+ for (auto &Arg : F.args())
+ if (!Arg.hasName())
+ Arg.setName("arg");
for (BasicBlock &BB : F) {
if (!BB.hasName())
@@ -48,11 +47,11 @@ namespace {
return true;
}
};
-
+
char InstNamer::ID = 0;
}
-INITIALIZE_PASS(InstNamer, "instnamer",
+INITIALIZE_PASS(InstNamer, "instnamer",
"Assign names to anonymous instructions", false, false)
char &llvm::InstructionNamerID = InstNamer::ID;
//===----------------------------------------------------------------------===//
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index ce6b703f3528..1ca509472b5f 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -1041,7 +1041,7 @@ unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign,
KnownBits Known(BitWidth);
computeKnownBits(V, Known, DL, 0, AC, CxtI, DT);
- unsigned TrailZ = Known.Zero.countTrailingOnes();
+ unsigned TrailZ = Known.countMinTrailingZeros();
// Avoid trouble with ridiculously large TrailZ values, such as
// those computed from a null pointer.
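
The trailing-zero count drives alignment because a value whose low k bits are
known zero is a multiple of 1 << k. A toy version of that derivation,
including the clamp the comment above asks for (the cap constant is
illustrative, not LLVM's):

    // k known trailing zero bits imply the value is a multiple of 1 << k,
    // so that is a safe lower bound on its alignment. A null pointer makes
    // every bit known zero, hence the clamp.
    unsigned alignmentFromTrailingZeros(unsigned TrailZ) {
      const unsigned MaxAlignLog2 = 29; // illustrative cap
      if (TrailZ > MaxAlignLog2)
        TrailZ = MaxAlignLog2;
      return 1u << TrailZ;
    }
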
@@ -1105,8 +1105,9 @@ static bool PhiHasDebugValue(DILocalVariable *DIVar,
void llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
StoreInst *SI, DIBuilder &Builder) {
auto *DIVar = DDI->getVariable();
- auto *DIExpr = DDI->getExpression();
assert(DIVar && "Missing variable");
+ auto *DIExpr = DDI->getExpression();
+ Value *DV = SI->getOperand(0);
// If an argument is zero extended then use argument directly. The ZExt
// may be zapped by an optimization pass in future.
@@ -1116,34 +1117,28 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0)))
ExtendedArg = dyn_cast<Argument>(SExt->getOperand(0));
if (ExtendedArg) {
- // We're now only describing a subset of the variable. The fragment we're
- // describing will always be smaller than the variable size, because
- // VariableSize == Size of Alloca described by DDI. Since SI stores
- // to the alloca described by DDI, if it's first operand is an extend,
- // we're guaranteed that before extension, the value was narrower than
- // the size of the alloca, hence the size of the described variable.
- SmallVector<uint64_t, 3> Ops;
- unsigned FragmentOffset = 0;
- // If this already is a bit fragment, we drop the bit fragment from the
- // expression and record the offset.
- auto Fragment = DIExpr->getFragmentInfo();
- if (Fragment) {
- Ops.append(DIExpr->elements_begin(), DIExpr->elements_end()-3);
- FragmentOffset = Fragment->OffsetInBits;
- } else {
- Ops.append(DIExpr->elements_begin(), DIExpr->elements_end());
+ // If this DDI was already describing only a fragment of a variable, ensure
+ // that fragment is appropriately narrowed here.
+ // But if a fragment wasn't used, describe the value as the original
+ // argument (rather than the zext or sext) so that it remains described even
+ // if the sext/zext is optimized away. This widens the variable description,
+ // leaving it up to the consumer to know how the smaller value may be
+ // represented in a larger register.
+ if (auto Fragment = DIExpr->getFragmentInfo()) {
+ unsigned FragmentOffset = Fragment->OffsetInBits;
+ SmallVector<uint64_t, 3> Ops(DIExpr->elements_begin(),
+ DIExpr->elements_end() - 3);
+ Ops.push_back(dwarf::DW_OP_LLVM_fragment);
+ Ops.push_back(FragmentOffset);
+ const DataLayout &DL = DDI->getModule()->getDataLayout();
+ Ops.push_back(DL.getTypeSizeInBits(ExtendedArg->getType()));
+ DIExpr = Builder.createExpression(Ops);
}
- Ops.push_back(dwarf::DW_OP_LLVM_fragment);
- Ops.push_back(FragmentOffset);
- const DataLayout &DL = DDI->getModule()->getDataLayout();
- Ops.push_back(DL.getTypeSizeInBits(ExtendedArg->getType()));
- auto NewDIExpr = Builder.createExpression(Ops);
- if (!LdStHasDebugValue(DIVar, NewDIExpr, SI))
- Builder.insertDbgValueIntrinsic(ExtendedArg, 0, DIVar, NewDIExpr,
- DDI->getDebugLoc(), SI);
- } else if (!LdStHasDebugValue(DIVar, DIExpr, SI))
- Builder.insertDbgValueIntrinsic(SI->getOperand(0), 0, DIVar, DIExpr,
- DDI->getDebugLoc(), SI);
+ DV = ExtendedArg;
+ }
+ if (!LdStHasDebugValue(DIVar, DIExpr, SI))
+ Builder.insertDbgValueIntrinsic(DV, 0, DIVar, DIExpr, DDI->getDebugLoc(),
+ SI);
}
/// Inserts a llvm.dbg.value intrinsic before a load of an alloca'd value
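
The elements_end() - 3 slicing above works because a fragment expression
always ends in the triple (DW_OP_LLVM_fragment, offset-in-bits,
size-in-bits); the rewrite drops the old triple and appends one sized to the
pre-extension argument type. Just that list surgery, sketched with a
placeholder opcode constant:

    #include <cstdint>
    #include <vector>

    // Assumes OldOps ends in a (fragment-opcode, offset, size) triple, as in
    // the branch above. The opcode value here is a stand-in, not the real
    // DWARF constant.
    std::vector<uint64_t> resizeFragment(const std::vector<uint64_t> &OldOps,
                                         uint64_t OffsetInBits,
                                         uint64_t NewSizeInBits) {
      const uint64_t FragmentOpStub = 0x1000;
      std::vector<uint64_t> Ops(OldOps.begin(), OldOps.end() - 3);
      Ops.push_back(FragmentOpStub);
      Ops.push_back(OffsetInBits);
      Ops.push_back(NewSizeInBits);
      return Ops;
    }
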
@@ -1781,44 +1776,43 @@ void llvm::combineMetadataForCSE(Instruction *K, const Instruction *J) {
combineMetadata(K, J, KnownIDs);
}
-unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To,
- DominatorTree &DT,
- const BasicBlockEdge &Root) {
+template <typename RootType, typename DominatesFn>
+static unsigned replaceDominatedUsesWith(Value *From, Value *To,
+ const RootType &Root,
+ const DominatesFn &Dominates) {
assert(From->getType() == To->getType());
-
+
unsigned Count = 0;
for (Value::use_iterator UI = From->use_begin(), UE = From->use_end();
- UI != UE; ) {
+ UI != UE;) {
Use &U = *UI++;
- if (DT.dominates(Root, U)) {
- U.set(To);
- DEBUG(dbgs() << "Replace dominated use of '"
- << From->getName() << "' as "
- << *To << " in " << *U << "\n");
- ++Count;
- }
+ if (!Dominates(Root, U))
+ continue;
+ U.set(To);
+ DEBUG(dbgs() << "Replace dominated use of '" << From->getName() << "' as "
+ << *To << " in " << *U << "\n");
+ ++Count;
}
return Count;
}
unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To,
DominatorTree &DT,
- const BasicBlock *BB) {
- assert(From->getType() == To->getType());
+ const BasicBlockEdge &Root) {
+ auto Dominates = [&DT](const BasicBlockEdge &Root, const Use &U) {
+ return DT.dominates(Root, U);
+ };
+ return ::replaceDominatedUsesWith(From, To, Root, Dominates);
+}
- unsigned Count = 0;
- for (Value::use_iterator UI = From->use_begin(), UE = From->use_end();
- UI != UE;) {
- Use &U = *UI++;
- auto *I = cast<Instruction>(U.getUser());
- if (DT.properlyDominates(BB, I->getParent())) {
- U.set(To);
- DEBUG(dbgs() << "Replace dominated use of '" << From->getName() << "' as "
- << *To << " in " << *U << "\n");
- ++Count;
- }
- }
- return Count;
+unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To,
+ DominatorTree &DT,
+ const BasicBlock *BB) {
+ auto ProperlyDominates = [&DT](const BasicBlock *BB, const Use &U) {
+ auto *I = cast<Instruction>(U.getUser())->getParent();
+ return DT.properlyDominates(BB, I);
+ };
+ return ::replaceDominatedUsesWith(From, To, BB, ProperlyDominates);
}
bool llvm::callsGCLeafFunction(ImmutableCallSite CS) {
diff --git a/lib/Transforms/Utils/LoopUtils.cpp b/lib/Transforms/Utils/LoopUtils.cpp
index 175d013a011d..81f033e7d51a 100644
--- a/lib/Transforms/Utils/LoopUtils.cpp
+++ b/lib/Transforms/Utils/LoopUtils.cpp
@@ -18,6 +18,7 @@
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
@@ -1112,3 +1113,203 @@ Optional<unsigned> llvm::getLoopEstimatedTripCount(Loop *L) {
else
return (FalseVal + (TrueVal / 2)) / TrueVal;
}
+
+/// \brief Adds a 'fast' flag to floating point operations.
+static Value *addFastMathFlag(Value *V) {
+ if (isa<FPMathOperator>(V)) {
+ FastMathFlags Flags;
+ Flags.setUnsafeAlgebra();
+ cast<Instruction>(V)->setFastMathFlags(Flags);
+ }
+ return V;
+}
+
+// Helper to generate a log2 shuffle reduction.
+Value *
+llvm::getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op,
+ RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind,
+ ArrayRef<Value *> RedOps) {
+ unsigned VF = Src->getType()->getVectorNumElements();
+ // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
+ // and vector ops, reducing the set of values being computed by half each
+ // round.
+ assert(isPowerOf2_32(VF) &&
+ "Reduction emission only supported for pow2 vectors!");
+ Value *TmpVec = Src;
+ SmallVector<Constant *, 32> ShuffleMask(VF, nullptr);
+ for (unsigned i = VF; i != 1; i >>= 1) {
+ // Move the upper half of the vector to the lower half.
+ for (unsigned j = 0; j != i / 2; ++j)
+ ShuffleMask[j] = Builder.getInt32(i / 2 + j);
+
+ // Fill the rest of the mask with undef.
+ std::fill(&ShuffleMask[i / 2], ShuffleMask.end(),
+ UndefValue::get(Builder.getInt32Ty()));
+
+ Value *Shuf = Builder.CreateShuffleVector(
+ TmpVec, UndefValue::get(TmpVec->getType()),
+ ConstantVector::get(ShuffleMask), "rdx.shuf");
+
+ if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
+ // Floating point operations had to be 'fast' to enable the reduction.
+ TmpVec = addFastMathFlag(Builder.CreateBinOp((Instruction::BinaryOps)Op,
+ TmpVec, Shuf, "bin.rdx"));
+ } else {
+ assert(MinMaxKind != RecurrenceDescriptor::MRK_Invalid &&
+ "Invalid min/max");
+ TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, TmpVec,
+ Shuf);
+ }
+ if (!RedOps.empty())
+ propagateIRFlags(TmpVec, RedOps);
+ }
+ // The result is in the first element of the vector.
+ return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
+}
+
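
Concretely, for VF = 4 and an integer add, getShuffleReduction runs two
rounds: round one shuffles with mask [2, 3, undef, undef] and adds, halving
the live lanes; round two uses mask [1, undef, undef, undef] and adds,
leaving the full sum in lane 0 for the final extractelement. A scalar trace
of those rounds:

    // Scalar trace of getShuffleReduction for VF = 4, Op = Add.
    int reduceAdd4(const int (&V)[4]) {
      int R0 = V[0] + V[2]; // round 1: lane 0 += lane 2
      int R1 = V[1] + V[3]; // round 1: lane 1 += lane 3
      return R0 + R1;       // round 2: lane 0 += lane 1, then extract lane 0
    }
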
+/// Create a simple vector reduction specified by an opcode and some
+/// flags (if generating min/max reductions).
+Value *llvm::createSimpleTargetReduction(
+ IRBuilder<> &Builder, const TargetTransformInfo *TTI, unsigned Opcode,
+ Value *Src, TargetTransformInfo::ReductionFlags Flags,
+ ArrayRef<Value *> RedOps) {
+ assert(isa<VectorType>(Src->getType()) && "Type must be a vector");
+
+ Value *ScalarUdf = UndefValue::get(Src->getType()->getVectorElementType());
+ std::function<Value*()> BuildFunc;
+ using RD = RecurrenceDescriptor;
+ RD::MinMaxRecurrenceKind MinMaxKind = RD::MRK_Invalid;
+ // TODO: Support creating ordered reductions.
+ FastMathFlags FMFUnsafe;
+ FMFUnsafe.setUnsafeAlgebra();
+
+ switch (Opcode) {
+ case Instruction::Add:
+ BuildFunc = [&]() { return Builder.CreateAddReduce(Src); };
+ break;
+ case Instruction::Mul:
+ BuildFunc = [&]() { return Builder.CreateMulReduce(Src); };
+ break;
+ case Instruction::And:
+ BuildFunc = [&]() { return Builder.CreateAndReduce(Src); };
+ break;
+ case Instruction::Or:
+ BuildFunc = [&]() { return Builder.CreateOrReduce(Src); };
+ break;
+ case Instruction::Xor:
+ BuildFunc = [&]() { return Builder.CreateXorReduce(Src); };
+ break;
+ case Instruction::FAdd:
+ BuildFunc = [&]() {
+ auto Rdx = Builder.CreateFAddReduce(ScalarUdf, Src);
+ cast<CallInst>(Rdx)->setFastMathFlags(FMFUnsafe);
+ return Rdx;
+ };
+ break;
+ case Instruction::FMul:
+ BuildFunc = [&]() {
+ auto Rdx = Builder.CreateFMulReduce(ScalarUdf, Src);
+ cast<CallInst>(Rdx)->setFastMathFlags(FMFUnsafe);
+ return Rdx;
+ };
+ break;
+ case Instruction::ICmp:
+ if (Flags.IsMaxOp) {
+ MinMaxKind = Flags.IsSigned ? RD::MRK_SIntMax : RD::MRK_UIntMax;
+ BuildFunc = [&]() {
+ return Builder.CreateIntMaxReduce(Src, Flags.IsSigned);
+ };
+ } else {
+ MinMaxKind = Flags.IsSigned ? RD::MRK_SIntMin : RD::MRK_UIntMin;
+ BuildFunc = [&]() {
+ return Builder.CreateIntMinReduce(Src, Flags.IsSigned);
+ };
+ }
+ break;
+ case Instruction::FCmp:
+ if (Flags.IsMaxOp) {
+ MinMaxKind = RD::MRK_FloatMax;
+ BuildFunc = [&]() { return Builder.CreateFPMaxReduce(Src, Flags.NoNaN); };
+ } else {
+ MinMaxKind = RD::MRK_FloatMin;
+ BuildFunc = [&]() { return Builder.CreateFPMinReduce(Src, Flags.NoNaN); };
+ }
+ break;
+ default:
+ llvm_unreachable("Unhandled opcode");
+ break;
+ }
+ if (TTI->useReductionIntrinsic(Opcode, Src->getType(), Flags))
+ return BuildFunc();
+ return getShuffleReduction(Builder, Src, Opcode, MinMaxKind, RedOps);
+}
+
+/// Create a vector reduction using a given recurrence descriptor.
+Value *llvm::createTargetReduction(IRBuilder<> &Builder,
+ const TargetTransformInfo *TTI,
+ RecurrenceDescriptor &Desc, Value *Src,
+ bool NoNaN) {
+ // TODO: Support in-order reductions based on the recurrence descriptor.
+ RecurrenceDescriptor::RecurrenceKind RecKind = Desc.getRecurrenceKind();
+ TargetTransformInfo::ReductionFlags Flags;
+ Flags.NoNaN = NoNaN;
+ auto getSimpleRdx = [&](unsigned Opc) {
+ return createSimpleTargetReduction(Builder, TTI, Opc, Src, Flags);
+ };
+ switch (RecKind) {
+ case RecurrenceDescriptor::RK_FloatAdd:
+ return getSimpleRdx(Instruction::FAdd);
+ case RecurrenceDescriptor::RK_FloatMult:
+ return getSimpleRdx(Instruction::FMul);
+ case RecurrenceDescriptor::RK_IntegerAdd:
+ return getSimpleRdx(Instruction::Add);
+ case RecurrenceDescriptor::RK_IntegerMult:
+ return getSimpleRdx(Instruction::Mul);
+ case RecurrenceDescriptor::RK_IntegerAnd:
+ return getSimpleRdx(Instruction::And);
+ case RecurrenceDescriptor::RK_IntegerOr:
+ return getSimpleRdx(Instruction::Or);
+ case RecurrenceDescriptor::RK_IntegerXor:
+ return getSimpleRdx(Instruction::Xor);
+ case RecurrenceDescriptor::RK_IntegerMinMax: {
+ switch (Desc.getMinMaxRecurrenceKind()) {
+ case RecurrenceDescriptor::MRK_SIntMax:
+ Flags.IsSigned = true;
+ Flags.IsMaxOp = true;
+ break;
+ case RecurrenceDescriptor::MRK_UIntMax:
+ Flags.IsMaxOp = true;
+ break;
+ case RecurrenceDescriptor::MRK_SIntMin:
+ Flags.IsSigned = true;
+ break;
+ case RecurrenceDescriptor::MRK_UIntMin:
+ break;
+ default:
+ llvm_unreachable("Unhandled MRK");
+ }
+ return getSimpleRdx(Instruction::ICmp);
+ }
+ case RecurrenceDescriptor::RK_FloatMinMax: {
+ Flags.IsMaxOp =
+ Desc.getMinMaxRecurrenceKind() == RecurrenceDescriptor::MRK_FloatMax;
+ return getSimpleRdx(Instruction::FCmp);
+ }
+ default:
+ llvm_unreachable("Unhandled RecKind");
+ }
+}
+
+void llvm::propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
+ if (auto *VecOp = dyn_cast<Instruction>(I)) {
+ if (auto *I0 = dyn_cast<Instruction>(VL[0])) {
+ // VecOp is initialized to the 0th scalar, so start counting from index
+ // '1'.
+ VecOp->copyIRFlags(I0);
+ for (int i = 1, e = VL.size(); i < e; ++i) {
+ if (auto *Scalar = dyn_cast<Instruction>(VL[i]))
+ VecOp->andIRFlags(Scalar);
+ }
+ }
+ }
+}
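
propagateIRFlags seeds the vector op's flags from the first scalar
(copyIRFlags) and then intersects with each remaining one (andIRFlags), so a
flag such as nsw survives only if every scalar carried it. The same
intersection as a toy over plain flag bitmasks:

    #include <cstdint>
    #include <vector>

    // Assumes at least one scalar, as the pass guarantees via VL[0]. Start
    // from the first mask, then AND in the rest, mirroring
    // copyIRFlags + andIRFlags above.
    uint32_t intersectIRFlags(const std::vector<uint32_t> &ScalarFlags) {
      uint32_t Flags = ScalarFlags.front();
      for (std::size_t i = 1; i < ScalarFlags.size(); ++i)
        Flags &= ScalarFlags[i];
      return Flags;
    }
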
diff --git a/lib/Transforms/Utils/ModuleUtils.cpp b/lib/Transforms/Utils/ModuleUtils.cpp
index 29d334f2968f..2ef3d6336ae2 100644
--- a/lib/Transforms/Utils/ModuleUtils.cpp
+++ b/lib/Transforms/Utils/ModuleUtils.cpp
@@ -35,7 +35,7 @@ static void appendToGlobalArray(const char *Array, Module &M, Function *F,
// Upgrade a 2-field global array type to the new 3-field format if needed.
if (Data && OldEltTy->getNumElements() < 3)
EltTy = StructType::get(IRB.getInt32Ty(), PointerType::getUnqual(FnTy),
- IRB.getInt8PtrTy(), nullptr);
+ IRB.getInt8PtrTy());
else
EltTy = OldEltTy;
if (Constant *Init = GVCtor->getInitializer()) {
@@ -44,10 +44,10 @@ static void appendToGlobalArray(const char *Array, Module &M, Function *F,
for (unsigned i = 0; i != n; ++i) {
auto Ctor = cast<Constant>(Init->getOperand(i));
if (EltTy != OldEltTy)
- Ctor = ConstantStruct::get(
- EltTy, Ctor->getAggregateElement((unsigned)0),
- Ctor->getAggregateElement(1),
- Constant::getNullValue(IRB.getInt8PtrTy()), nullptr);
+ Ctor =
+ ConstantStruct::get(EltTy, Ctor->getAggregateElement((unsigned)0),
+ Ctor->getAggregateElement(1),
+ Constant::getNullValue(IRB.getInt8PtrTy()));
CurrentCtors.push_back(Ctor);
}
}
@@ -55,7 +55,7 @@ static void appendToGlobalArray(const char *Array, Module &M, Function *F,
} else {
// Use the new three-field struct if there isn't one already.
EltTy = StructType::get(IRB.getInt32Ty(), PointerType::getUnqual(FnTy),
- IRB.getInt8PtrTy(), nullptr);
+ IRB.getInt8PtrTy());
}
// Build a 2 or 3 field global_ctor entry. We don't take a comdat key.
diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 9e71d746de34..1de579ed41b0 100644
--- a/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -1450,11 +1450,11 @@ static void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg,
// x86_64 can't use {float, float} since that would be returned in both
// xmm0 and xmm1, which isn't what a real struct would do.
ResTy = T.getArch() == Triple::x86_64
- ? static_cast<Type *>(VectorType::get(ArgTy, 2))
- : static_cast<Type *>(StructType::get(ArgTy, ArgTy, nullptr));
+ ? static_cast<Type *>(VectorType::get(ArgTy, 2))
+ : static_cast<Type *>(StructType::get(ArgTy, ArgTy));
} else {
Name = "__sincospi_stret";
- ResTy = StructType::get(ArgTy, ArgTy, nullptr);
+ ResTy = StructType::get(ArgTy, ArgTy);
}
Module *M = OrigCallee->getParent();
diff --git a/lib/Transforms/Utils/VNCoercion.cpp b/lib/Transforms/Utils/VNCoercion.cpp
index 83bd29dbca65..60d9ede2c487 100644
--- a/lib/Transforms/Utils/VNCoercion.cpp
+++ b/lib/Transforms/Utils/VNCoercion.cpp
@@ -303,6 +303,15 @@ static T *getStoreValueForLoadHelper(T *SrcVal, unsigned Offset, Type *LoadTy,
const DataLayout &DL) {
LLVMContext &Ctx = SrcVal->getType()->getContext();
+ // If two pointers are in the same address space, they have the same size,
+ // so we don't need to do any truncation, etc. This avoids introducing
+ // ptrtoint instructions for pointers that may be non-integral.
+ if (SrcVal->getType()->isPointerTy() && LoadTy->isPointerTy() &&
+ cast<PointerType>(SrcVal->getType())->getAddressSpace() ==
+ cast<PointerType>(LoadTy)->getAddressSpace()) {
+ return SrcVal;
+ }
+
uint64_t StoreSize = (DL.getTypeSizeInBits(SrcVal->getType()) + 7) / 8;
uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy) + 7) / 8;
// Compute which bits of the stored value are being used by the load. Convert
diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp
index 84d89f103a2f..930972924c3c 100644
--- a/lib/Transforms/Utils/ValueMapper.cpp
+++ b/lib/Transforms/Utils/ValueMapper.cpp
@@ -949,11 +949,10 @@ void Mapper::mapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix,
Constant *NewV;
if (IsOldCtorDtor) {
auto *S = cast<ConstantStruct>(V);
- auto *E1 = mapValue(S->getOperand(0));
- auto *E2 = mapValue(S->getOperand(1));
- Value *Null = Constant::getNullValue(VoidPtrTy);
- NewV =
- ConstantStruct::get(cast<StructType>(EltTy), E1, E2, Null, nullptr);
+ auto *E1 = cast<Constant>(mapValue(S->getOperand(0)));
+ auto *E2 = cast<Constant>(mapValue(S->getOperand(1)));
+ Constant *Null = Constant::getNullValue(VoidPtrTy);
+ NewV = ConstantStruct::get(cast<StructType>(EltTy), E1, E2, Null);
} else {
NewV = cast_or_null<Constant>(mapValue(V));
}
diff --git a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 97dcb40a1d72..9cf66382b581 100644
--- a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -346,7 +346,7 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
if (!Safe) {
KnownBits Known(BitWidth);
computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT);
- if (Known.Zero.countTrailingZeros() < (BitWidth - 1))
+ if (Known.countMaxTrailingOnes() < (BitWidth - 1))
Safe = true;
}
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3fde0a453962..516ab7d03a88 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -391,13 +391,14 @@ public:
TripCount(nullptr), VectorTripCount(nullptr), Legal(LVL), Cost(CM),
AddedSafetyChecks(false) {}
- // Perform the actual loop widening (vectorization).
- void vectorize() {
- // Create a new empty loop. Unlink the old loop and connect the new one.
- createEmptyLoop();
- // Widen each instruction in the old loop to a new one in the new loop.
- vectorizeLoop();
- }
+ /// Create a new empty loop. Unlink the old loop and connect the new one.
+ void createVectorizedLoopSkeleton();
+
+ /// Vectorize a single instruction within the innermost loop.
+ void vectorizeInstruction(Instruction &I);
+
+ /// Fix the vectorized code, taking care of header PHIs, live-outs, and more.
+ void fixVectorizedLoop();
// Return true if any runtime check is added.
bool areSafetyChecksAdded() { return AddedSafetyChecks; }
@@ -425,9 +426,6 @@ protected:
EdgeMaskCacheTy;
typedef DenseMap<BasicBlock *, VectorParts> BlockMaskCacheTy;
- /// Create an empty loop, based on the loop ranges of the old loop.
- void createEmptyLoop();
-
/// Set up the values of the IVs correctly when exiting the vector loop.
void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
Value *CountRoundDown, Value *EndValue,
@@ -436,8 +434,6 @@ protected:
/// Create a new induction variable inside L.
PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
Value *Step, Instruction *DL);
- /// Copy and widen the instructions from the old loop.
- virtual void vectorizeLoop();
/// Handle all cross-iteration phis in the header.
void fixCrossIterationPHIs();
@@ -450,10 +446,10 @@ protected:
/// vectorizing this phi node.
void fixReduction(PHINode *Phi);
- /// \brief The Loop exit block may have single value PHI nodes where the
- /// incoming value is 'Undef'. While vectorizing we only handled real values
- /// that were defined inside the loop. Here we fix the 'undef case'.
- /// See PR14725.
+ /// \brief The loop exit block may have single-value PHI nodes with some
+ /// incoming value. While vectorizing we only handled real values
+ /// that were defined inside the loop; each such PHI must still end up with
+ /// one incoming value for each predecessor of its parent block. See PR14725.
void fixLCSSAPHIs();
/// Iteratively sink the scalarized operands of a predicated instruction into
@@ -464,11 +460,6 @@ protected:
/// respective conditions.
void predicateInstructions();
- /// Collect the instructions from the original loop that would be trivially
- /// dead in the vectorized loop if generated.
- void collectTriviallyDeadInstructions(
- SmallPtrSetImpl<Instruction *> &DeadInstructions);
-
/// Shrinks vector element sizes to the smallest bitwidth they can be legally
/// represented as.
void truncateToMinimalBitwidths();
@@ -481,10 +472,6 @@ protected:
/// and DST.
VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
- /// A helper function to vectorize a single instruction within the innermost
- /// loop.
- void vectorizeInstruction(Instruction &I);
-
/// Vectorize a single PHINode in a block. This method handles the induction
/// variable canonicalization. It supports both VF = 1 for unrolled loops and
/// arbitrary length vectors.
@@ -1700,6 +1687,9 @@ public:
/// access that can be widened.
bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
+ // Returns true if the NoNaN attribute is set on the function.
+ bool hasFunNoNaNAttr() const { return HasFunNoNaNAttr; }
+
private:
/// Check if a single basic block loop is vectorizable.
/// At this point we know that this is a loop with a constant trip count
@@ -2185,7 +2175,10 @@ public:
/// passed Legality checks.
class LoopVectorizationPlanner {
public:
- LoopVectorizationPlanner(LoopVectorizationCostModel &CM) : CM(CM) {}
+ LoopVectorizationPlanner(Loop *OrigLoop, LoopInfo *LI,
+ LoopVectorizationLegality *Legal,
+ LoopVectorizationCostModel &CM)
+ : OrigLoop(OrigLoop), LI(LI), Legal(Legal), CM(CM) {}
~LoopVectorizationPlanner() {}
@@ -2193,7 +2186,25 @@ public:
LoopVectorizationCostModel::VectorizationFactor plan(bool OptForSize,
unsigned UserVF);
+ /// Generate the IR code for the vectorized loop.
+ void executePlan(InnerLoopVectorizer &ILV);
+
+protected:
+ /// Collect the instructions from the original loop that would be trivially
+ /// dead in the vectorized loop if generated.
+ void collectTriviallyDeadInstructions(
+ SmallPtrSetImpl<Instruction *> &DeadInstructions);
+
private:
+ /// The loop that we evaluate.
+ Loop *OrigLoop;
+
+ /// Loop Info analysis.
+ LoopInfo *LI;
+
+ /// The legality analysis.
+ LoopVectorizationLegality *Legal;
+
/// The profitablity analysis.
LoopVectorizationCostModel &CM;
};
@@ -3361,7 +3372,7 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
LVer->prepareNoAliasMetadata();
}
-void InnerLoopVectorizer::createEmptyLoop() {
+void InnerLoopVectorizer::createVectorizedLoopSkeleton() {
/*
In this function we generate a new loop. The new loop will contain
the vectorized instructions while the old loop will continue to run the
@@ -3883,36 +3894,7 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
}
}
-void InnerLoopVectorizer::vectorizeLoop() {
- //===------------------------------------------------===//
- //
- // Notice: any optimization or new instruction that go
- // into the code below should be also be implemented in
- // the cost-model.
- //
- //===------------------------------------------------===//
-
- // Collect instructions from the original loop that will become trivially dead
- // in the vectorized loop. We don't need to vectorize these instructions. For
- // example, original induction update instructions can become dead because we
- // separately emit induction "steps" when generating code for the new loop.
- // Similarly, we create a new latch condition when setting up the structure
- // of the new loop, so the old one can become dead.
- SmallPtrSet<Instruction *, 4> DeadInstructions;
- collectTriviallyDeadInstructions(DeadInstructions);
-
- // Scan the loop in a topological order to ensure that defs are vectorized
- // before users.
- LoopBlocksDFS DFS(OrigLoop);
- DFS.perform(LI);
-
- // Vectorize all instructions in the original loop that will not become
- // trivially dead when vectorized.
- for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
- for (Instruction &I : *BB)
- if (!DeadInstructions.count(&I))
- vectorizeInstruction(I);
-
+void InnerLoopVectorizer::fixVectorizedLoop() {
// Insert truncates and extends for any truncated instructions as hints to
// InstCombine.
if (VF > 1)
@@ -4049,8 +4031,11 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
// Set the insertion point after the previous value if it is an instruction.
// Note that the previous value may have been constant-folded so it is not
- // guaranteed to be an instruction in the vector loop.
- if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousParts[UF - 1]))
+ // guaranteed to be an instruction in the vector loop. Also, if the previous
+ // value is a phi node, we should insert after all the phi nodes to avoid
+ // breaking basic block verification.
+ if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousParts[UF - 1]) ||
+ isa<PHINode>(PreviousParts[UF - 1]))
Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
else
Builder.SetInsertPoint(
@@ -4258,39 +4243,9 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
}
if (VF > 1) {
- // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
- // and vector ops, reducing the set of values being computed by half each
- // round.
- assert(isPowerOf2_32(VF) &&
- "Reduction emission only supported for pow2 vectors!");
- Value *TmpVec = ReducedPartRdx;
- SmallVector<Constant *, 32> ShuffleMask(VF, nullptr);
- for (unsigned i = VF; i != 1; i >>= 1) {
- // Move the upper half of the vector to the lower half.
- for (unsigned j = 0; j != i / 2; ++j)
- ShuffleMask[j] = Builder.getInt32(i / 2 + j);
-
- // Fill the rest of the mask with undef.
- std::fill(&ShuffleMask[i / 2], ShuffleMask.end(),
- UndefValue::get(Builder.getInt32Ty()));
-
- Value *Shuf = Builder.CreateShuffleVector(
- TmpVec, UndefValue::get(TmpVec->getType()),
- ConstantVector::get(ShuffleMask), "rdx.shuf");
-
- if (Op != Instruction::ICmp && Op != Instruction::FCmp)
- // Floating point operations had to be 'fast' to enable the reduction.
- TmpVec = addFastMathFlag(Builder.CreateBinOp(
- (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx"));
- else
- TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind,
- TmpVec, Shuf);
- }
-
- // The result is in the first element of the vector.
+ bool NoNaN = Legal->hasFunNoNaNAttr();
ReducedPartRdx =
- Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
-
+ createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
// If the reduction can be performed in a smaller type, we need to extend
// the reduction to the wider type before we branch to the original loop.
if (Phi->getType() != RdxDesc.getRecurrenceType())
@@ -4345,33 +4300,11 @@ void InnerLoopVectorizer::fixLCSSAPHIs() {
auto *LCSSAPhi = dyn_cast<PHINode>(&LEI);
if (!LCSSAPhi)
break;
- if (LCSSAPhi->getNumIncomingValues() == 1)
- LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()),
- LoopMiddleBlock);
- }
-}
-
-void InnerLoopVectorizer::collectTriviallyDeadInstructions(
- SmallPtrSetImpl<Instruction *> &DeadInstructions) {
- BasicBlock *Latch = OrigLoop->getLoopLatch();
-
- // We create new control-flow for the vectorized loop, so the original
- // condition will be dead after vectorization if it's only used by the
- // branch.
- auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
- if (Cmp && Cmp->hasOneUse())
- DeadInstructions.insert(Cmp);
-
- // We create new "steps" for induction variable updates to which the original
- // induction variables map. An original update instruction will be dead if
- // all its users except the induction variable are dead.
- for (auto &Induction : *Legal->getInductionVars()) {
- PHINode *Ind = Induction.first;
- auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
- if (all_of(IndUpdate->users(), [&](User *U) -> bool {
- return U == Ind || DeadInstructions.count(cast<Instruction>(U));
- }))
- DeadInstructions.insert(IndUpdate);
+ if (LCSSAPhi->getNumIncomingValues() == 1) {
+ assert(OrigLoop->isLoopInvariant(LCSSAPhi->getIncomingValue(0)) &&
+ "Incoming value isn't loop invariant");
+ LCSSAPhi->addIncoming(LCSSAPhi->getIncomingValue(0), LoopMiddleBlock);
+ }
}
}
@@ -7577,6 +7510,72 @@ LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
return CM.selectVectorizationFactor(MaxVF);
}
+void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV) {
+ // Perform the actual loop transformation.
+
+ // 1. Create a new empty loop. Unlink the old loop and connect the new one.
+ ILV.createVectorizedLoopSkeleton();
+
+ //===------------------------------------------------===//
+ //
+ // Notice: any optimization or new instruction that goes
+ // into the code below should also be implemented in
+ // the cost-model.
+ //
+ //===------------------------------------------------===//
+
+ // 2. Copy and widen instructions from the old loop into the new loop.
+
+ // Collect instructions from the original loop that will become trivially dead
+ // in the vectorized loop. We don't need to vectorize these instructions. For
+ // example, original induction update instructions can become dead because we
+ // separately emit induction "steps" when generating code for the new loop.
+ // Similarly, we create a new latch condition when setting up the structure
+ // of the new loop, so the old one can become dead.
+ SmallPtrSet<Instruction *, 4> DeadInstructions;
+ collectTriviallyDeadInstructions(DeadInstructions);
+
+ // Scan the loop in a topological order to ensure that defs are vectorized
+ // before users.
+ LoopBlocksDFS DFS(OrigLoop);
+ DFS.perform(LI);
+
+ // Vectorize all instructions in the original loop that will not become
+ // trivially dead when vectorized.
+ for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
+ for (Instruction &I : *BB)
+ if (!DeadInstructions.count(&I))
+ ILV.vectorizeInstruction(I);
+
+ // 3. Fix the vectorized code: take care of header phi's, live-outs,
+ // predication, updating analyses.
+ ILV.fixVectorizedLoop();
+}
+
+void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
+ SmallPtrSetImpl<Instruction *> &DeadInstructions) {
+ BasicBlock *Latch = OrigLoop->getLoopLatch();
+
+ // We create new control-flow for the vectorized loop, so the original
+ // condition will be dead after vectorization if it's only used by the
+ // branch.
+ auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
+ if (Cmp && Cmp->hasOneUse())
+ DeadInstructions.insert(Cmp);
+
+ // We create new "steps" for induction variable updates to which the original
+ // induction variables map. An original update instruction will be dead if
+ // all its users except the induction variable are dead.
+ for (auto &Induction : *Legal->getInductionVars()) {
+ PHINode *Ind = Induction.first;
+ auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
+ if (all_of(IndUpdate->users(), [&](User *U) -> bool {
+ return U == Ind || DeadInstructions.count(cast<Instruction>(U));
+ }))
+ DeadInstructions.insert(IndUpdate);
+ }
+}
+
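To make the induction-update case concrete, consider a hypothetical scalar loop (not taken from the patch) in which both candidates collected above become dead:

// Here i is the induction phi, i + 1 its update, and i < n the latch compare.
// The vectorized skeleton emits fresh step and latch computations, so the
// original update's only surviving user is the phi itself and the original
// compare's only user is the branch; both land in DeadInstructions.
void copyLoop(int *Dst, const int *Src, int n) {
  for (int i = 0; i < n; ++i)
    Dst[i] = Src[i];
}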
void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) {
auto *SI = dyn_cast<StoreInst>(Instr);
bool IfPredicateInstr = (SI && Legal->blockNeedsPredication(SI->getParent()));
@@ -7759,7 +7758,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
CM.collectValuesToIgnore();
// Use the planner for vectorization.
- LoopVectorizationPlanner LVP(CM);
+ LoopVectorizationPlanner LVP(L, LI, &LVL, CM);
// Get user vectorization factor.
unsigned UserVF = Hints.getWidth();
@@ -7853,7 +7852,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// interleave it.
InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
&CM);
- Unroller.vectorize();
+ LVP.executePlan(Unroller);
ORE->emit(OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
L->getHeader())
@@ -7863,7 +7862,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// If we decided that it is *legal* to vectorize the loop, then do it.
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
&LVL, &CM);
- LB.vectorize();
+ LVP.executePlan(LB);
++LoopsVectorized;
// Add metadata to disable runtime unrolling a scalar loop when there are
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index f112c555205c..64013d6d687d 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -40,7 +40,9 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <memory>
@@ -212,23 +214,6 @@ static unsigned getSameOpcode(ArrayRef<Value *> VL) {
return Opcode;
}
-/// Get the intersection (logical and) of all of the potential IR flags
-/// of each scalar operation (VL) that will be converted into a vector (I).
-/// Flag set: NSW, NUW, exact, and all of fast-math.
-static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
- if (auto *VecOp = dyn_cast<Instruction>(I)) {
- if (auto *I0 = dyn_cast<Instruction>(VL[0])) {
- // VecOVp is initialized to the 0th scalar, so start counting from index
- // '1'.
- VecOp->copyIRFlags(I0);
- for (int i = 1, e = VL.size(); i < e; ++i) {
- if (auto *Scalar = dyn_cast<Instruction>(VL[i]))
- VecOp->andIRFlags(Scalar);
- }
- }
- }
-}
-
/// \returns true if all of the values in \p VL have the same type or false
/// otherwise.
static bool allSameType(ArrayRef<Value *> VL) {
@@ -315,10 +300,10 @@ public:
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
- const DataLayout *DL)
+ const DataLayout *DL, OptimizationRemarkEmitter *ORE)
: NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB),
- DL(DL), Builder(Se->getContext()) {
+ DL(DL), ORE(ORE), Builder(Se->getContext()) {
CodeMetrics::collectEphemeralValues(F, AC, EphValues);
// Use the vector register size specified by the target unless overridden
// by a command-line option.
@@ -331,7 +316,10 @@ public:
else
MaxVecRegSize = TTI->getRegisterBitWidth(true);
- MinVecRegSize = MinVectorRegSizeOption;
+ if (MinVectorRegSizeOption.getNumOccurrences())
+ MinVecRegSize = MinVectorRegSizeOption;
+ else
+ MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
}
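The getNumOccurrences() test above is the usual cl::opt idiom for honoring a target default unless the user explicitly passed the flag: an explicit request still wins even when it happens to equal the init() value, which a plain value comparison could not distinguish. A minimal sketch of the pattern with a hypothetical flag:

#include "llvm/Support/CommandLine.h"

static llvm::cl::opt<unsigned> WidthOption(
    "example-width", llvm::cl::init(128),
    llvm::cl::desc("Hypothetical width override"));

// getNumOccurrences() reports how often the flag appeared on the command
// line, separating "user passed -example-width=128" from "flag still holds
// its default of 128".
static unsigned pickWidth(unsigned TargetDefault) {
  return WidthOption.getNumOccurrences() ? WidthOption : TargetDefault;
}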
/// \brief Vectorize the tree that starts with the elements in \p VL.
@@ -377,6 +365,8 @@ public:
MinBWs.clear();
}
+ unsigned getTreeSize() const { return VectorizableTree.size(); }
+
/// \brief Perform LICM and CSE on the newly generated gather sequences.
void optimizeGatherSequence();
@@ -415,6 +405,8 @@ public:
/// vectorizable. We do not vectorize such trees.
bool isTreeTinyAndNotFullyVectorizable();
+ OptimizationRemarkEmitter *getORE() { return ORE; }
+
private:
struct TreeEntry;
@@ -944,6 +936,8 @@ private:
AssumptionCache *AC;
DemandedBits *DB;
const DataLayout *DL;
+ OptimizationRemarkEmitter *ORE;
+
unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
unsigned MinVecRegSize; // Set by cl::opt (default: 128).
/// Instruction builder to construct the vectorized tree.
@@ -1835,11 +1829,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
CInt->getValue().isPowerOf2())
Op2VP = TargetTransformInfo::OP_PowerOf2;
- int ScalarCost = VecTy->getNumElements() *
- TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK,
- Op2VK, Op1VP, Op2VP);
+ SmallVector<const Value *, 4> Operands(VL0->operand_values());
+ int ScalarCost =
+ VecTy->getNumElements() *
+ TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK, Op1VP,
+ Op2VP, Operands);
int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
- Op1VP, Op2VP);
+ Op1VP, Op2VP, Operands);
return VecCost - ScalarCost;
}
case Instruction::GetElementPtr: {
@@ -3703,10 +3699,8 @@ void BoUpSLP::computeMinimumValueSizes() {
// Determine if the sign bit of all the roots is known to be zero. If not,
// IsKnownPositive is set to False.
IsKnownPositive = all_of(TreeRoot, [&](Value *R) {
- bool KnownZero = false;
- bool KnownOne = false;
- ComputeSignBit(R, KnownZero, KnownOne, *DL);
- return KnownZero;
+ KnownBits Known = computeKnownBits(R, *DL);
+ return Known.isNonNegative();
});
// Determine the maximum number of bits required to store the scalar
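This hunk is part of the KnownBits migration: the old ComputeSignBit out-parameter pair (KnownZero, KnownOne) becomes a single struct query. A minimal sketch of the replacement idiom, assuming only the ValueTracking entry point used above:

#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Support/KnownBits.h"

// A value is known non-negative exactly when its sign bit is known zero;
// KnownBits::isNonNegative() expresses that without juggling two APInts.
static bool signBitKnownZero(const llvm::Value *V, const llvm::DataLayout &DL) {
  llvm::KnownBits Known = llvm::computeKnownBits(V, DL);
  return Known.isNonNegative();
}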
@@ -3786,8 +3780,9 @@ struct SLPVectorizer : public FunctionPass {
auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
+ auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
- return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB);
+ return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -3799,6 +3794,7 @@ struct SLPVectorizer : public FunctionPass {
AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<DemandedBitsWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
AU.addPreserved<LoopInfoWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<AAResultsWrapperPass>();
@@ -3817,8 +3813,9 @@ PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &A
auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
auto *AC = &AM.getResult<AssumptionAnalysis>(F);
auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
+ auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB);
+ bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
if (!Changed)
return PreservedAnalyses::all();
@@ -3833,7 +3830,8 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
TargetTransformInfo *TTI_,
TargetLibraryInfo *TLI_, AliasAnalysis *AA_,
LoopInfo *LI_, DominatorTree *DT_,
- AssumptionCache *AC_, DemandedBits *DB_) {
+ AssumptionCache *AC_, DemandedBits *DB_,
+ OptimizationRemarkEmitter *ORE_) {
SE = SE_;
TTI = TTI_;
TLI = TLI_;
@@ -3861,7 +3859,7 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
// Use the bottom up slp vectorizer to construct chains that start with
// store instructions.
- BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL);
+ BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
// A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
// delete instructions.
@@ -3950,6 +3948,13 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
if (Cost < -SLPCostThreshold) {
DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
+ using namespace ore;
+ R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
+ cast<StoreInst>(Chain[i]))
+ << "Stores SLP vectorized with cost " << NV("Cost", Cost)
+ << " and with tree size "
+ << NV("TreeSize", R.getTreeSize()));
+
R.vectorizeTree();
// Move to the next bundle.
@@ -4163,6 +4168,12 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
if (Cost < -SLPCostThreshold) {
DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
+ R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
+ cast<Instruction>(Ops[0]))
+ << "SLP vectorized with cost " << ore::NV("Cost", Cost)
+ << " and with tree size "
+ << ore::NV("TreeSize", R.getTreeSize()));
+
Value *VectorizedRoot = R.vectorizeTree();
// Reconstruct the build vector by extracting the vectorized root. This
@@ -4506,6 +4517,12 @@ public:
DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
<< ". (HorRdx)\n");
+ auto *I0 = cast<Instruction>(VL[0]);
+ V.getORE()->emit(
+ OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction", I0)
+ << "Vectorized horizontal reduction with cost "
+ << ore::NV("Cost", Cost) << " and with tree size "
+ << ore::NV("TreeSize", V.getTreeSize()));
// Vectorize a tree.
DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
@@ -4513,7 +4530,7 @@ public:
// Emit a reduction.
Value *ReducedSubTree =
- emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps);
+ emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps, TTI);
if (VectorizedTree) {
Builder.SetCurrentDebugLocation(Loc);
VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree,
@@ -4583,33 +4600,31 @@ private:
/// \brief Emit a horizontal reduction of the vectorized value.
Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,
- unsigned ReduxWidth, ArrayRef<Value *> RedOps) {
+ unsigned ReduxWidth, ArrayRef<Value *> RedOps,
+ const TargetTransformInfo *TTI) {
assert(VectorizedValue && "Need to have a vectorized tree node");
assert(isPowerOf2_32(ReduxWidth) &&
"We only handle power-of-two reductions for now");
+ if (!IsPairwiseReduction)
+ return createSimpleTargetReduction(
+ Builder, TTI, ReductionOpcode, VectorizedValue,
+ TargetTransformInfo::ReductionFlags(), RedOps);
+
Value *TmpVec = VectorizedValue;
for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
- if (IsPairwiseReduction) {
- Value *LeftMask =
+ Value *LeftMask =
createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
- Value *RightMask =
+ Value *RightMask =
createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
- Value *LeftShuf = Builder.CreateShuffleVector(
+ Value *LeftShuf = Builder.CreateShuffleVector(
TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
- Value *RightShuf = Builder.CreateShuffleVector(
+ Value *RightShuf = Builder.CreateShuffleVector(
TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
"rdx.shuf.r");
- TmpVec = Builder.CreateBinOp(ReductionOpcode, LeftShuf, RightShuf,
- "bin.rdx");
- } else {
- Value *UpperHalf =
- createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
- Value *Shuf = Builder.CreateShuffleVector(
- TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
- TmpVec = Builder.CreateBinOp(ReductionOpcode, TmpVec, Shuf, "bin.rdx");
- }
+ TmpVec =
+ Builder.CreateBinOp(ReductionOpcode, LeftShuf, RightShuf, "bin.rdx");
propagateIRFlags(TmpVec, RedOps);
}
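For the pairwise path that stays open-coded, the two masks built each round select the even and the odd lanes respectively, whereas the removed else-branch (now delegated to createSimpleTargetReduction) shuffled the upper half down. A standalone sketch of the two mask shapes, using a hypothetical helper that mirrors what createRdxShuffleMask appears to build:

#include <vector>

// For Width = 8, Stride = 4: pairwise-left = <0,2,4,6,u,u,u,u>, pairwise-right
// = <1,3,5,7,u,u,u,u>, and the old splitting mask = <4,5,6,7,u,u,u,u>, with -1
// standing in for an undef lane.
static std::vector<int> rdxMask(unsigned Width, unsigned Stride, bool Pairwise,
                                bool Left) {
  std::vector<int> Mask(Width, -1);
  for (unsigned j = 0; j != Stride; ++j)
    Mask[j] = Pairwise ? int(2 * j + (Left ? 0 : 1)) : int(Stride + j);
  return Mask;
}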
@@ -5162,6 +5177,7 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
namespace llvm {
diff --git a/lib/XRay/Trace.cpp b/lib/XRay/Trace.cpp
index d2984697c8a9..6677063f944f 100644
--- a/lib/XRay/Trace.cpp
+++ b/lib/XRay/Trace.cpp
@@ -115,6 +115,7 @@ struct FDRState {
uint16_t CPUId;
uint16_t ThreadId;
uint64_t BaseTSC;
+
/// Encode some of the state transitions for the FDR log reader as explicit
/// checks. These are expectations for the next Record in the stream.
enum class Token {
@@ -123,8 +124,10 @@ struct FDRState {
NEW_CPU_ID_RECORD,
FUNCTION_SEQUENCE,
SCAN_TO_END_OF_THREAD_BUF,
+ CUSTOM_EVENT_DATA,
};
Token Expects;
+
// Each thread's buffer may have trailing garbage to scan over, so we track our
// progress.
uint64_t CurrentBufferSize;
@@ -143,6 +146,8 @@ Twine fdrStateToTwine(const FDRState::Token &state) {
return "FUNCTION_SEQUENCE";
case FDRState::Token::SCAN_TO_END_OF_THREAD_BUF:
return "SCAN_TO_END_OF_THREAD_BUF";
+ case FDRState::Token::CUSTOM_EVENT_DATA:
+ return "CUSTOM_EVENT_DATA";
}
return "UNKNOWN";
}
@@ -212,13 +217,32 @@ Error processFDRWallTimeRecord(FDRState &State, uint8_t RecordFirstByte,
return Error::success();
}
+/// State transition when a CustomEventMarker is encountered.
+Error processCustomEventMarker(FDRState &State, uint8_t RecordFirstByte,
+ DataExtractor &RecordExtractor,
+ size_t &RecordSize) {
+ // We can encounter a CustomEventMarker anywhere in the log, so we can handle
+ // it regardless of the expectation. However, we do set the expectation to
+ // read a fixed number of bytes, as described in the metadata.
+ uint32_t OffsetPtr = 1; // Read after the first byte.
+ uint32_t DataSize = RecordExtractor.getU32(&OffsetPtr);
+ uint64_t TSC = RecordExtractor.getU64(&OffsetPtr);
+
+ // FIXME: Actually represent the record through the API. For now we only skip
+ // over the data.
+ (void)TSC;
+ RecordSize = 16 + DataSize;
+ return Error::success();
+}
+
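A worked example of the size computation above: a marker with DataSize = 24 makes the loader advance RecordSize = 16 + 24 = 40 bytes. The sketch below lays out the fields under the assumptions visible in the reader (one kind byte, a u32 size at offset 1, a u64 TSC at offset 5, and an inferred three bytes of padding completing the fixed 16-byte metadata record); it is illustrative, not a wire-format declaration:

#include <cstdint>

// CustomEventMarker as consumed above: the kind byte was already read by the
// caller, DataSize and TSC follow, and DataSize payload bytes trail the
// 16-byte record (skipped for now, per the FIXME).
struct CustomEventLayout {
  uint8_t Kind;      // low bit = metadata record; remaining bits = kind 5
  uint32_t DataSize; // number of custom-event payload bytes after the record
  uint64_t TSC;      // timestamp counter captured at emission
};                   // + 3 assumed padding bytes + DataSize payload bytes

static uint64_t bytesToAdvance(uint32_t DataSize) { return 16u + DataSize; }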
/// Advances the state machine for reading the FDR record type by reading one
/// Metadata Record and updating the State appropriately based on the kind of
/// record encountered. The RecordKind is encoded in the first byte of the
/// Record, which the caller should pass in because they have already read it
/// to determine that this is a metadata record as opposed to a function record.
Error processFDRMetadataRecord(FDRState &State, uint8_t RecordFirstByte,
- DataExtractor &RecordExtractor) {
+ DataExtractor &RecordExtractor,
+ size_t &RecordSize) {
// The remaining 7 bits are the RecordKind enum.
uint8_t RecordKind = RecordFirstByte >> 1;
switch (RecordKind) {
@@ -247,6 +271,11 @@ Error processFDRMetadataRecord(FDRState &State, uint8_t RecordFirstByte,
processFDRWallTimeRecord(State, RecordFirstByte, RecordExtractor))
return E;
break;
+ case 5: // CustomEventMarker
+ if (auto E = processCustomEventMarker(State, RecordFirstByte,
+ RecordExtractor, RecordSize))
+ return E;
+ break;
default:
// Widen the record type to uint16_t to prevent conversion to char.
return make_error<StringError>(
@@ -400,7 +429,8 @@ Error loadFDRLog(StringRef Data, XRayFileHeader &FileHeader,
bool isMetadataRecord = BitField & 0x01uL;
if (isMetadataRecord) {
RecordSize = 16;
- if (auto E = processFDRMetadataRecord(State, BitField, RecordExtractor))
+ if (auto E = processFDRMetadataRecord(State, BitField, RecordExtractor,
+ RecordSize))
return E;
State.CurrentBufferConsumed += RecordSize;
} else { // Process Function Record