Diffstat (limited to 'lib')
336 files changed, 8803 insertions, 4902 deletions
diff --git a/lib/Analysis/AliasSetTracker.cpp b/lib/Analysis/AliasSetTracker.cpp index ee17ad3ba5863..4dfa25490d00d 100644 --- a/lib/Analysis/AliasSetTracker.cpp +++ b/lib/Analysis/AliasSetTracker.cpp @@ -218,8 +218,8 @@ bool AliasSet::aliasesUnknownInst(const Instruction *Inst, return false; for (unsigned i = 0, e = UnknownInsts.size(); i != e; ++i) { - if (auto *Inst = getUnknownInst(i)) { - ImmutableCallSite C1(Inst), C2(Inst); + if (auto *UnknownInst = getUnknownInst(i)) { + ImmutableCallSite C1(UnknownInst), C2(Inst); if (!C1 || !C2 || AA.getModRefInfo(C1, C2) != MRI_NoModRef || AA.getModRefInfo(C2, C1) != MRI_NoModRef) return true; diff --git a/lib/Analysis/AssumptionCache.cpp b/lib/Analysis/AssumptionCache.cpp index 0468c794e81dd..3ff27890dc385 100644 --- a/lib/Analysis/AssumptionCache.cpp +++ b/lib/Analysis/AssumptionCache.cpp @@ -84,18 +84,11 @@ void AssumptionCache::updateAffectedValues(CallInst *CI) { Value *B; ConstantInt *C; // (A & B) or (A | B) or (A ^ B). - if (match(V, - m_CombineOr(m_And(m_Value(A), m_Value(B)), - m_CombineOr(m_Or(m_Value(A), m_Value(B)), - m_Xor(m_Value(A), m_Value(B)))))) { + if (match(V, m_BitwiseLogic(m_Value(A), m_Value(B)))) { AddAffected(A); AddAffected(B); // (A << C) or (A >>_s C) or (A >>_u C) where C is some constant. - } else if (match(V, - m_CombineOr(m_Shl(m_Value(A), m_ConstantInt(C)), - m_CombineOr(m_LShr(m_Value(A), m_ConstantInt(C)), - m_AShr(m_Value(A), - m_ConstantInt(C)))))) { + } else if (match(V, m_Shift(m_Value(A), m_ConstantInt(C)))) { AddAffected(A); } }; diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp index dbb1b01b94ac2..b52a1d7b24d62 100644 --- a/lib/Analysis/BasicAliasAnalysis.cpp +++ b/lib/Analysis/BasicAliasAnalysis.cpp @@ -1021,11 +1021,14 @@ static AliasResult aliasSameBasePointerGEPs(const GEPOperator *GEP1, // asking about values from different loop iterations. See PR32314. // TODO: We may be able to change the check so we only do this when // we definitely looked through a PHINode. - KnownBits Known1 = computeKnownBits(GEP1LastIdx, DL); - KnownBits Known2 = computeKnownBits(GEP2LastIdx, DL); - if (Known1.Zero.intersects(Known2.One) || - Known1.One.intersects(Known2.Zero)) - return NoAlias; + if (GEP1LastIdx != GEP2LastIdx && + GEP1LastIdx->getType() == GEP2LastIdx->getType()) { + KnownBits Known1 = computeKnownBits(GEP1LastIdx, DL); + KnownBits Known2 = computeKnownBits(GEP2LastIdx, DL); + if (Known1.Zero.intersects(Known2.One) || + Known1.One.intersects(Known2.Zero)) + return NoAlias; + } } else if (isKnownNonEqual(GEP1LastIdx, GEP2LastIdx, DL)) return NoAlias; } @@ -1345,11 +1348,7 @@ AliasResult BasicAAResult::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size, // Statically, we can see that the base objects are the same, but the // pointers have dynamic offsets which we can't resolve. And none of our // little tricks above worked. - // - // TODO: Returning PartialAlias instead of MayAlias is a mild hack; the - // practical effect of this is protecting TBAA in the case of dynamic - // indices into arrays of unions or malloc'd memory. 
- return PartialAlias; + return MayAlias; } static AliasResult MergeAliasResults(AliasResult A, AliasResult B) { diff --git a/lib/Analysis/CFLSteensAliasAnalysis.cpp b/lib/Analysis/CFLSteensAliasAnalysis.cpp index dde24ef5fdd57..6e4263920e586 100644 --- a/lib/Analysis/CFLSteensAliasAnalysis.cpp +++ b/lib/Analysis/CFLSteensAliasAnalysis.cpp @@ -80,9 +80,6 @@ public: const AliasSummary &getAliasSummary() const { return Summary; } }; -/// Try to go from a Value* to a Function*. Never returns nullptr. -static Optional<Function *> parentFunctionOfValue(Value *); - const StratifiedIndex StratifiedLink::SetSentinel = std::numeric_limits<StratifiedIndex>::max(); diff --git a/lib/Analysis/DemandedBits.cpp b/lib/Analysis/DemandedBits.cpp index 8f808f3e78719..926b28d6094a5 100644 --- a/lib/Analysis/DemandedBits.cpp +++ b/lib/Analysis/DemandedBits.cpp @@ -107,6 +107,8 @@ void DemandedBits::determineLiveOperandBits( AB = AOut.byteSwap(); break; case Intrinsic::bitreverse: + // The alive bits of the input are the reversed alive bits of + // the output. AB = AOut.reverseBits(); break; case Intrinsic::ctlz: diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp index 6ff5938a3175a..77ad6f1e166fd 100644 --- a/lib/Analysis/InlineCost.cpp +++ b/lib/Analysis/InlineCost.cpp @@ -1022,12 +1022,15 @@ bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) { // inlining those. It will prevent inlining in cases where the optimization // does not (yet) fire. + // Maximum valid cost increased in this function. + int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1; + // Exit early for a large switch, assuming one case needs at least one // instruction. // FIXME: This is not true for a bit test, but ignore such case for now to // save compile-time. int64_t CostLowerBound = - std::min((int64_t)INT_MAX, + std::min((int64_t)CostUpperBound, (int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost); if (CostLowerBound > Threshold) { @@ -1044,7 +1047,8 @@ bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) { if (JumpTableSize) { int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost + 4 * InlineConstants::InstrCost; - Cost = std::min((int64_t)INT_MAX, JTCost + Cost); + + Cost = std::min((int64_t)CostUpperBound, JTCost + Cost); return false; } @@ -1068,10 +1072,12 @@ bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) { Cost += NumCaseCluster * 2 * InlineConstants::InstrCost; return false; } - int64_t ExpectedNumberOfCompare = 3 * (uint64_t)NumCaseCluster / 2 - 1; - uint64_t SwitchCost = + + int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1; + int64_t SwitchCost = ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost; - Cost = std::min((uint64_t)INT_MAX, SwitchCost + Cost); + + Cost = std::min((int64_t)CostUpperBound, SwitchCost + Cost); return false; } diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp index a975be79619b7..d9e32a3c417e0 100644 --- a/lib/Analysis/InstructionSimplify.cpp +++ b/lib/Analysis/InstructionSimplify.cpp @@ -2688,16 +2688,14 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS, } // icmp pred (and X, Y), X - if (LBO && match(LBO, m_CombineOr(m_And(m_Value(), m_Specific(RHS)), - m_And(m_Specific(RHS), m_Value())))) { + if (LBO && match(LBO, m_c_And(m_Value(), m_Specific(RHS)))) { if (Pred == ICmpInst::ICMP_UGT) return getFalse(ITy); if (Pred == ICmpInst::ICMP_ULE) return getTrue(ITy); } // icmp pred X, (and X, Y) - if (RBO && match(RBO, m_CombineOr(m_And(m_Value(), m_Specific(LHS)), 
- m_And(m_Specific(LHS), m_Value())))) { + if (RBO && match(RBO, m_c_And(m_Value(), m_Specific(LHS)))) { if (Pred == ICmpInst::ICMP_UGE) return getTrue(ITy); if (Pred == ICmpInst::ICMP_ULT) diff --git a/lib/Analysis/LazyValueInfo.cpp b/lib/Analysis/LazyValueInfo.cpp index 3ed61a79478ad..102081e721ac6 100644 --- a/lib/Analysis/LazyValueInfo.cpp +++ b/lib/Analysis/LazyValueInfo.cpp @@ -1324,12 +1324,12 @@ getValueFromConditionImpl(Value *Val, Value *Cond, bool isTrueDest, return getValueFromICmpCondition(Val, ICI, isTrueDest); // Handle conditions in the form of (cond1 && cond2), we know that on the - // true dest path both of the conditions hold. - if (!isTrueDest) - return LVILatticeVal::getOverdefined(); - + // true dest path both of the conditions hold. Similarly for conditions of + // the form (cond1 || cond2), we know that on the false dest path neither + // condition holds. BinaryOperator *BO = dyn_cast<BinaryOperator>(Cond); - if (!BO || BO->getOpcode() != BinaryOperator::And) + if (!BO || (isTrueDest && BO->getOpcode() != BinaryOperator::And) || + (!isTrueDest && BO->getOpcode() != BinaryOperator::Or)) return LVILatticeVal::getOverdefined(); auto RHS = getValueFromCondition(Val, BO->getOperand(0), isTrueDest, Visited); @@ -1660,6 +1660,26 @@ Constant *LazyValueInfo::getConstantOnEdge(Value *V, BasicBlock *FromBB, return nullptr; } +ConstantRange LazyValueInfo::getConstantRangeOnEdge(Value *V, + BasicBlock *FromBB, + BasicBlock *ToBB, + Instruction *CxtI) { + unsigned Width = V->getType()->getIntegerBitWidth(); + const DataLayout &DL = FromBB->getModule()->getDataLayout(); + LVILatticeVal Result = + getImpl(PImpl, AC, &DL, DT).getValueOnEdge(V, FromBB, ToBB, CxtI); + + if (Result.isUndefined()) + return ConstantRange(Width, /*isFullSet=*/false); + if (Result.isConstantRange()) + return Result.getConstantRange(); + // We represent ConstantInt constants as constant ranges but other kinds + // of integer constants, i.e. 
ConstantExpr will be tagged as constants + assert(!(Result.isConstant() && isa<ConstantInt>(Result.getConstant())) && + "ConstantInt value must be represented as constantrange"); + return ConstantRange(Width, /*isFullSet=*/true); +} + static LazyValueInfo::Tristate getPredicateResult(unsigned Pred, Constant *C, const LVILatticeVal &Val, const DataLayout &DL, diff --git a/lib/Analysis/Loads.cpp b/lib/Analysis/Loads.cpp index 96799a459bfc4..591b0fc481d24 100644 --- a/lib/Analysis/Loads.cpp +++ b/lib/Analysis/Loads.cpp @@ -117,6 +117,16 @@ static bool isDereferenceableAndAlignedPointer( } bool llvm::isDereferenceableAndAlignedPointer(const Value *V, unsigned Align, + const APInt &Size, + const DataLayout &DL, + const Instruction *CtxI, + const DominatorTree *DT) { + SmallPtrSet<const Value *, 32> Visited; + return ::isDereferenceableAndAlignedPointer(V, Align, Size, DL, CtxI, DT, + Visited); +} + +bool llvm::isDereferenceableAndAlignedPointer(const Value *V, unsigned Align, const DataLayout &DL, const Instruction *CtxI, const DominatorTree *DT) { diff --git a/lib/Analysis/MemoryDependenceAnalysis.cpp b/lib/Analysis/MemoryDependenceAnalysis.cpp index 3fdedbb0ab3c2..263cf42ebe271 100644 --- a/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -310,11 +310,11 @@ unsigned MemoryDependenceResults::getLoadLoadClobberFullWidthSize( } static bool isVolatile(Instruction *Inst) { - if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) + if (auto *LI = dyn_cast<LoadInst>(Inst)) return LI->isVolatile(); - else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) + if (auto *SI = dyn_cast<StoreInst>(Inst)) return SI->isVolatile(); - else if (AtomicCmpXchgInst *AI = dyn_cast<AtomicCmpXchgInst>(Inst)) + if (auto *AI = dyn_cast<AtomicCmpXchgInst>(Inst)) return AI->isVolatile(); return false; } diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index aebc80a0a8851..73a95ec405c7b 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -126,11 +126,11 @@ static cl::opt<bool> static cl::opt<unsigned> MulOpsInlineThreshold( "scev-mulops-inline-threshold", cl::Hidden, cl::desc("Threshold for inlining multiplication operands into a SCEV"), - cl::init(1000)); + cl::init(32)); static cl::opt<unsigned> AddOpsInlineThreshold( "scev-addops-inline-threshold", cl::Hidden, - cl::desc("Threshold for inlining multiplication operands into a SCEV"), + cl::desc("Threshold for inlining addition operands into a SCEV"), cl::init(500)); static cl::opt<unsigned> MaxSCEVCompareDepth( @@ -1259,12 +1259,12 @@ static const SCEV *getSignedOverflowLimitForStep(const SCEV *Step, if (SE->isKnownPositive(Step)) { *Pred = ICmpInst::ICMP_SLT; return SE->getConstant(APInt::getSignedMinValue(BitWidth) - - SE->getSignedRange(Step).getSignedMax()); + SE->getSignedRangeMax(Step)); } if (SE->isKnownNegative(Step)) { *Pred = ICmpInst::ICMP_SGT; return SE->getConstant(APInt::getSignedMaxValue(BitWidth) - - SE->getSignedRange(Step).getSignedMin()); + SE->getSignedRangeMin(Step)); } return nullptr; } @@ -1279,7 +1279,7 @@ static const SCEV *getUnsignedOverflowLimitForStep(const SCEV *Step, *Pred = ICmpInst::ICMP_ULT; return SE->getConstant(APInt::getMinValue(BitWidth) - - SE->getUnsignedRange(Step).getUnsignedMax()); + SE->getUnsignedRangeMax(Step)); } namespace { @@ -1670,7 +1670,7 @@ const SCEV *ScalarEvolution::getZeroExtendExprImpl(const SCEV *Op, Type *Ty, // is safe. 
if (isKnownPositive(Step)) { const SCEV *N = getConstant(APInt::getMinValue(BitWidth) - - getUnsignedRange(Step).getUnsignedMax()); + getUnsignedRangeMax(Step)); if (isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_ULT, AR, N) || (isLoopEntryGuardedByCond(L, ICmpInst::ICMP_ULT, Start, N) && isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_ULT, @@ -1686,7 +1686,7 @@ const SCEV *ScalarEvolution::getZeroExtendExprImpl(const SCEV *Op, Type *Ty, } } else if (isKnownNegative(Step)) { const SCEV *N = getConstant(APInt::getMaxValue(BitWidth) - - getSignedRange(Step).getSignedMin()); + getSignedRangeMin(Step)); if (isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_UGT, AR, N) || (isLoopEntryGuardedByCond(L, ICmpInst::ICMP_UGT, Start, N) && isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_UGT, @@ -3745,7 +3745,7 @@ const SCEV *ScalarEvolution::getMinusSCEV(const SCEV *LHS, const SCEV *RHS, // makes it so that we cannot make much use of NUW. auto AddFlags = SCEV::FlagAnyWrap; const bool RHSIsNotMinSigned = - !getSignedRange(RHS).getSignedMin().isMinSignedValue(); + !getSignedRangeMin(RHS).isMinSignedValue(); if (maskFlags(Flags, SCEV::FlagNSW) == SCEV::FlagNSW) { // Let M be the minimum representable signed value. Then (-1)*RHS // signed-wraps if and only if RHS is M. That can happen even for @@ -4758,9 +4758,9 @@ static Optional<ConstantRange> GetRangeFromMetadata(Value *V) { /// Determine the range for a particular SCEV. If SignHint is /// HINT_RANGE_UNSIGNED (resp. HINT_RANGE_SIGNED) then getRange prefers ranges /// with a "cleaner" unsigned (resp. signed) representation. -ConstantRange -ScalarEvolution::getRange(const SCEV *S, - ScalarEvolution::RangeSignHint SignHint) { +const ConstantRange & +ScalarEvolution::getRangeRef(const SCEV *S, + ScalarEvolution::RangeSignHint SignHint) { DenseMap<const SCEV *, ConstantRange> &Cache = SignHint == ScalarEvolution::HINT_RANGE_UNSIGNED ? 
UnsignedRanges : SignedRanges; @@ -4791,54 +4791,54 @@ ScalarEvolution::getRange(const SCEV *S, } if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { - ConstantRange X = getRange(Add->getOperand(0), SignHint); + ConstantRange X = getRangeRef(Add->getOperand(0), SignHint); for (unsigned i = 1, e = Add->getNumOperands(); i != e; ++i) - X = X.add(getRange(Add->getOperand(i), SignHint)); + X = X.add(getRangeRef(Add->getOperand(i), SignHint)); return setRange(Add, SignHint, ConservativeResult.intersectWith(X)); } if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) { - ConstantRange X = getRange(Mul->getOperand(0), SignHint); + ConstantRange X = getRangeRef(Mul->getOperand(0), SignHint); for (unsigned i = 1, e = Mul->getNumOperands(); i != e; ++i) - X = X.multiply(getRange(Mul->getOperand(i), SignHint)); + X = X.multiply(getRangeRef(Mul->getOperand(i), SignHint)); return setRange(Mul, SignHint, ConservativeResult.intersectWith(X)); } if (const SCEVSMaxExpr *SMax = dyn_cast<SCEVSMaxExpr>(S)) { - ConstantRange X = getRange(SMax->getOperand(0), SignHint); + ConstantRange X = getRangeRef(SMax->getOperand(0), SignHint); for (unsigned i = 1, e = SMax->getNumOperands(); i != e; ++i) - X = X.smax(getRange(SMax->getOperand(i), SignHint)); + X = X.smax(getRangeRef(SMax->getOperand(i), SignHint)); return setRange(SMax, SignHint, ConservativeResult.intersectWith(X)); } if (const SCEVUMaxExpr *UMax = dyn_cast<SCEVUMaxExpr>(S)) { - ConstantRange X = getRange(UMax->getOperand(0), SignHint); + ConstantRange X = getRangeRef(UMax->getOperand(0), SignHint); for (unsigned i = 1, e = UMax->getNumOperands(); i != e; ++i) - X = X.umax(getRange(UMax->getOperand(i), SignHint)); + X = X.umax(getRangeRef(UMax->getOperand(i), SignHint)); return setRange(UMax, SignHint, ConservativeResult.intersectWith(X)); } if (const SCEVUDivExpr *UDiv = dyn_cast<SCEVUDivExpr>(S)) { - ConstantRange X = getRange(UDiv->getLHS(), SignHint); - ConstantRange Y = getRange(UDiv->getRHS(), SignHint); + ConstantRange X = getRangeRef(UDiv->getLHS(), SignHint); + ConstantRange Y = getRangeRef(UDiv->getRHS(), SignHint); return setRange(UDiv, SignHint, ConservativeResult.intersectWith(X.udiv(Y))); } if (const SCEVZeroExtendExpr *ZExt = dyn_cast<SCEVZeroExtendExpr>(S)) { - ConstantRange X = getRange(ZExt->getOperand(), SignHint); + ConstantRange X = getRangeRef(ZExt->getOperand(), SignHint); return setRange(ZExt, SignHint, ConservativeResult.intersectWith(X.zeroExtend(BitWidth))); } if (const SCEVSignExtendExpr *SExt = dyn_cast<SCEVSignExtendExpr>(S)) { - ConstantRange X = getRange(SExt->getOperand(), SignHint); + ConstantRange X = getRangeRef(SExt->getOperand(), SignHint); return setRange(SExt, SignHint, ConservativeResult.intersectWith(X.signExtend(BitWidth))); } if (const SCEVTruncateExpr *Trunc = dyn_cast<SCEVTruncateExpr>(S)) { - ConstantRange X = getRange(Trunc->getOperand(), SignHint); + ConstantRange X = getRangeRef(Trunc->getOperand(), SignHint); return setRange(Trunc, SignHint, ConservativeResult.intersectWith(X.truncate(BitWidth))); } @@ -5005,8 +5005,7 @@ ConstantRange ScalarEvolution::getRangeForAffineAR(const SCEV *Start, "Precondition!"); MaxBECount = getNoopOrZeroExtend(MaxBECount, Start->getType()); - ConstantRange MaxBECountRange = getUnsignedRange(MaxBECount); - APInt MaxBECountValue = MaxBECountRange.getUnsignedMax(); + APInt MaxBECountValue = getUnsignedRangeMax(MaxBECount); // First, consider step signed. 
ConstantRange StartSRange = getSignedRange(Start); @@ -5023,7 +5022,7 @@ ConstantRange ScalarEvolution::getRangeForAffineAR(const SCEV *Start, // Next, consider step unsigned. ConstantRange UR = getRangeForAffineARHelper( - getUnsignedRange(Step).getUnsignedMax(), getUnsignedRange(Start), + getUnsignedRangeMax(Step), getUnsignedRange(Start), MaxBECountValue, BitWidth, /* Signed = */ false); // Finally, intersect signed and unsigned ranges. @@ -6373,7 +6372,7 @@ ScalarEvolution::ExitLimit ScalarEvolution::computeExitLimitFromCondImpl( // to not. if (isa<SCEVCouldNotCompute>(MaxBECount) && !isa<SCEVCouldNotCompute>(BECount)) - MaxBECount = getConstant(getUnsignedRange(BECount).getUnsignedMax()); + MaxBECount = getConstant(getUnsignedRangeMax(BECount)); return ExitLimit(BECount, MaxBECount, false, {&EL0.Predicates, &EL1.Predicates}); @@ -7647,7 +7646,7 @@ ScalarEvolution::howFarToZero(const SCEV *V, const Loop *L, bool ControlsExit, // 1*N = -Start; -1*N = Start (mod 2^BW), so: // N = Distance (as unsigned) if (StepC->getValue()->equalsInt(1) || StepC->getValue()->isAllOnesValue()) { - APInt MaxBECount = getUnsignedRange(Distance).getUnsignedMax(); + APInt MaxBECount = getUnsignedRangeMax(Distance); // When a loop like "for (int i = 0; i != n; ++i) { /* body */ }" is rotated, // we end up with a loop whose backedge-taken count is n - 1. Detect this @@ -7680,7 +7679,7 @@ ScalarEvolution::howFarToZero(const SCEV *V, const Loop *L, bool ControlsExit, const SCEV *Max = Exact == getCouldNotCompute() ? Exact - : getConstant(getUnsignedRange(Exact).getUnsignedMax()); + : getConstant(getUnsignedRangeMax(Exact)); return ExitLimit(Exact, Max, false, Predicates); } @@ -7689,7 +7688,7 @@ ScalarEvolution::howFarToZero(const SCEV *V, const Loop *L, bool ControlsExit, getNegativeSCEV(Start), *this); const SCEV *M = E == getCouldNotCompute() ? E - : getConstant(getUnsignedRange(E).getUnsignedMax()); + : getConstant(getUnsignedRangeMax(E)); return ExitLimit(E, M, false, Predicates); } @@ -7886,12 +7885,12 @@ bool ScalarEvolution::SimplifyICmpOperands(ICmpInst::Predicate &Pred, // adding or subtracting 1 from one of the operands. 
switch (Pred) { case ICmpInst::ICMP_SLE: - if (!getSignedRange(RHS).getSignedMax().isMaxSignedValue()) { + if (!getSignedRangeMax(RHS).isMaxSignedValue()) { RHS = getAddExpr(getConstant(RHS->getType(), 1, true), RHS, SCEV::FlagNSW); Pred = ICmpInst::ICMP_SLT; Changed = true; - } else if (!getSignedRange(LHS).getSignedMin().isMinSignedValue()) { + } else if (!getSignedRangeMin(LHS).isMinSignedValue()) { LHS = getAddExpr(getConstant(RHS->getType(), (uint64_t)-1, true), LHS, SCEV::FlagNSW); Pred = ICmpInst::ICMP_SLT; @@ -7899,12 +7898,12 @@ bool ScalarEvolution::SimplifyICmpOperands(ICmpInst::Predicate &Pred, } break; case ICmpInst::ICMP_SGE: - if (!getSignedRange(RHS).getSignedMin().isMinSignedValue()) { + if (!getSignedRangeMin(RHS).isMinSignedValue()) { RHS = getAddExpr(getConstant(RHS->getType(), (uint64_t)-1, true), RHS, SCEV::FlagNSW); Pred = ICmpInst::ICMP_SGT; Changed = true; - } else if (!getSignedRange(LHS).getSignedMax().isMaxSignedValue()) { + } else if (!getSignedRangeMax(LHS).isMaxSignedValue()) { LHS = getAddExpr(getConstant(RHS->getType(), 1, true), LHS, SCEV::FlagNSW); Pred = ICmpInst::ICMP_SGT; @@ -7912,23 +7911,23 @@ bool ScalarEvolution::SimplifyICmpOperands(ICmpInst::Predicate &Pred, } break; case ICmpInst::ICMP_ULE: - if (!getUnsignedRange(RHS).getUnsignedMax().isMaxValue()) { + if (!getUnsignedRangeMax(RHS).isMaxValue()) { RHS = getAddExpr(getConstant(RHS->getType(), 1, true), RHS, SCEV::FlagNUW); Pred = ICmpInst::ICMP_ULT; Changed = true; - } else if (!getUnsignedRange(LHS).getUnsignedMin().isMinValue()) { + } else if (!getUnsignedRangeMin(LHS).isMinValue()) { LHS = getAddExpr(getConstant(RHS->getType(), (uint64_t)-1, true), LHS); Pred = ICmpInst::ICMP_ULT; Changed = true; } break; case ICmpInst::ICMP_UGE: - if (!getUnsignedRange(RHS).getUnsignedMin().isMinValue()) { + if (!getUnsignedRangeMin(RHS).isMinValue()) { RHS = getAddExpr(getConstant(RHS->getType(), (uint64_t)-1, true), RHS); Pred = ICmpInst::ICMP_UGT; Changed = true; - } else if (!getUnsignedRange(LHS).getUnsignedMax().isMaxValue()) { + } else if (!getUnsignedRangeMax(LHS).isMaxValue()) { LHS = getAddExpr(getConstant(RHS->getType(), 1, true), LHS, SCEV::FlagNUW); Pred = ICmpInst::ICMP_UGT; @@ -7962,19 +7961,19 @@ trivially_false: } bool ScalarEvolution::isKnownNegative(const SCEV *S) { - return getSignedRange(S).getSignedMax().isNegative(); + return getSignedRangeMax(S).isNegative(); } bool ScalarEvolution::isKnownPositive(const SCEV *S) { - return getSignedRange(S).getSignedMin().isStrictlyPositive(); + return getSignedRangeMin(S).isStrictlyPositive(); } bool ScalarEvolution::isKnownNonNegative(const SCEV *S) { - return !getSignedRange(S).getSignedMin().isNegative(); + return !getSignedRangeMin(S).isNegative(); } bool ScalarEvolution::isKnownNonPositive(const SCEV *S) { - return !getSignedRange(S).getSignedMax().isStrictlyPositive(); + return !getSignedRangeMax(S).isStrictlyPositive(); } bool ScalarEvolution::isKnownNonZero(const SCEV *S) { @@ -8560,7 +8559,7 @@ bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS, // predicate we're interested in folding. APInt Min = ICmpInst::isSigned(Pred) ? - getSignedRange(V).getSignedMin() : getUnsignedRange(V).getUnsignedMin(); + getSignedRangeMin(V) : getUnsignedRangeMin(V); if (Min == C->getAPInt()) { // Given (V >= Min && V != Min) we conclude V >= (Min + 1). 
@@ -9115,19 +9114,17 @@ bool ScalarEvolution::doesIVOverflowOnLT(const SCEV *RHS, const SCEV *Stride, const SCEV *One = getOne(Stride->getType()); if (IsSigned) { - APInt MaxRHS = getSignedRange(RHS).getSignedMax(); + APInt MaxRHS = getSignedRangeMax(RHS); APInt MaxValue = APInt::getSignedMaxValue(BitWidth); - APInt MaxStrideMinusOne = getSignedRange(getMinusSCEV(Stride, One)) - .getSignedMax(); + APInt MaxStrideMinusOne = getSignedRangeMax(getMinusSCEV(Stride, One)); // SMaxRHS + SMaxStrideMinusOne > SMaxValue => overflow! return (std::move(MaxValue) - MaxStrideMinusOne).slt(MaxRHS); } - APInt MaxRHS = getUnsignedRange(RHS).getUnsignedMax(); + APInt MaxRHS = getUnsignedRangeMax(RHS); APInt MaxValue = APInt::getMaxValue(BitWidth); - APInt MaxStrideMinusOne = getUnsignedRange(getMinusSCEV(Stride, One)) - .getUnsignedMax(); + APInt MaxStrideMinusOne = getUnsignedRangeMax(getMinusSCEV(Stride, One)); // UMaxRHS + UMaxStrideMinusOne > UMaxValue => overflow! return (std::move(MaxValue) - MaxStrideMinusOne).ult(MaxRHS); @@ -9141,19 +9138,17 @@ bool ScalarEvolution::doesIVOverflowOnGT(const SCEV *RHS, const SCEV *Stride, const SCEV *One = getOne(Stride->getType()); if (IsSigned) { - APInt MinRHS = getSignedRange(RHS).getSignedMin(); + APInt MinRHS = getSignedRangeMin(RHS); APInt MinValue = APInt::getSignedMinValue(BitWidth); - APInt MaxStrideMinusOne = getSignedRange(getMinusSCEV(Stride, One)) - .getSignedMax(); + APInt MaxStrideMinusOne = getSignedRangeMax(getMinusSCEV(Stride, One)); // SMinRHS - SMaxStrideMinusOne < SMinValue => overflow! return (std::move(MinValue) + MaxStrideMinusOne).sgt(MinRHS); } - APInt MinRHS = getUnsignedRange(RHS).getUnsignedMin(); + APInt MinRHS = getUnsignedRangeMin(RHS); APInt MinValue = APInt::getMinValue(BitWidth); - APInt MaxStrideMinusOne = getUnsignedRange(getMinusSCEV(Stride, One)) - .getUnsignedMax(); + APInt MaxStrideMinusOne = getUnsignedRangeMax(getMinusSCEV(Stride, One)); // UMinRHS - UMaxStrideMinusOne < UMinValue => overflow! return (std::move(MinValue) + MaxStrideMinusOne).ugt(MinRHS); @@ -9292,8 +9287,8 @@ ScalarEvolution::howManyLessThans(const SCEV *LHS, const SCEV *RHS, } else { // Calculate the maximum backedge count based on the range of values // permitted by Start, End, and Stride. - APInt MinStart = IsSigned ? getSignedRange(Start).getSignedMin() - : getUnsignedRange(Start).getUnsignedMin(); + APInt MinStart = IsSigned ? getSignedRangeMin(Start) + : getUnsignedRangeMin(Start); unsigned BitWidth = getTypeSizeInBits(LHS->getType()); @@ -9301,8 +9296,8 @@ ScalarEvolution::howManyLessThans(const SCEV *LHS, const SCEV *RHS, if (PositiveStride) StrideForMaxBECount = - IsSigned ? getSignedRange(Stride).getSignedMin() - : getUnsignedRange(Stride).getUnsignedMin(); + IsSigned ? getSignedRangeMin(Stride) + : getUnsignedRangeMin(Stride); else // Using a stride of 1 is safe when computing max backedge taken count for // a loop with unknown stride. @@ -9316,8 +9311,8 @@ ScalarEvolution::howManyLessThans(const SCEV *LHS, const SCEV *RHS, // the case End = RHS. This is safe because in the other case (End - Start) // is zero, leading to a zero maximum backedge taken count. APInt MaxEnd = - IsSigned ? APIntOps::smin(getSignedRange(RHS).getSignedMax(), Limit) - : APIntOps::umin(getUnsignedRange(RHS).getUnsignedMax(), Limit); + IsSigned ? 
APIntOps::smin(getSignedRangeMax(RHS), Limit) + : APIntOps::umin(getUnsignedRangeMax(RHS), Limit); MaxBECount = computeBECount(getConstant(MaxEnd - MinStart), getConstant(StrideForMaxBECount), false); @@ -9325,7 +9320,7 @@ ScalarEvolution::howManyLessThans(const SCEV *LHS, const SCEV *RHS, if (isa<SCEVCouldNotCompute>(MaxBECount) && !isa<SCEVCouldNotCompute>(BECount)) - MaxBECount = getConstant(getUnsignedRange(BECount).getUnsignedMax()); + MaxBECount = getConstant(getUnsignedRangeMax(BECount)); return ExitLimit(BECount, MaxBECount, MaxOrZero, Predicates); } @@ -9376,11 +9371,11 @@ ScalarEvolution::howManyGreaterThans(const SCEV *LHS, const SCEV *RHS, const SCEV *BECount = computeBECount(getMinusSCEV(Start, End), Stride, false); - APInt MaxStart = IsSigned ? getSignedRange(Start).getSignedMax() - : getUnsignedRange(Start).getUnsignedMax(); + APInt MaxStart = IsSigned ? getSignedRangeMax(Start) + : getUnsignedRangeMax(Start); - APInt MinStride = IsSigned ? getSignedRange(Stride).getSignedMin() - : getUnsignedRange(Stride).getUnsignedMin(); + APInt MinStride = IsSigned ? getSignedRangeMin(Stride) + : getUnsignedRangeMin(Stride); unsigned BitWidth = getTypeSizeInBits(LHS->getType()); APInt Limit = IsSigned ? APInt::getSignedMinValue(BitWidth) + (MinStride - 1) @@ -9390,8 +9385,8 @@ ScalarEvolution::howManyGreaterThans(const SCEV *LHS, const SCEV *RHS, // the case End = RHS. This is safe because in the other case (Start - End) // is zero, leading to a zero maximum backedge taken count. APInt MinEnd = - IsSigned ? APIntOps::smax(getSignedRange(RHS).getSignedMin(), Limit) - : APIntOps::umax(getUnsignedRange(RHS).getUnsignedMin(), Limit); + IsSigned ? APIntOps::smax(getSignedRangeMin(RHS), Limit) + : APIntOps::umax(getUnsignedRangeMin(RHS), Limit); const SCEV *MaxBECount = getCouldNotCompute(); diff --git a/lib/Analysis/ScalarEvolutionExpander.cpp b/lib/Analysis/ScalarEvolutionExpander.cpp index f9b9df2bc707d..47bdac00ae1f3 100644 --- a/lib/Analysis/ScalarEvolutionExpander.cpp +++ b/lib/Analysis/ScalarEvolutionExpander.cpp @@ -748,18 +748,56 @@ Value *SCEVExpander::visitMulExpr(const SCEVMulExpr *S) { // Emit instructions to mul all the operands. Hoist as much as possible // out of loops. Value *Prod = nullptr; - for (const auto &I : OpsAndLoops) { - const SCEV *Op = I.second; + auto I = OpsAndLoops.begin(); + + // Expand the calculation of X pow N in the following manner: + // Let N = P1 + P2 + ... + PK, where all P are powers of 2. Then: + // X pow N = (X pow P1) * (X pow P2) * ... * (X pow PK). + const auto ExpandOpBinPowN = [this, &I, &OpsAndLoops, &Ty]() { + auto E = I; + // Calculate how many times the same operand from the same loop is included + // into this power. + uint64_t Exponent = 0; + const uint64_t MaxExponent = UINT64_MAX >> 1; + // No one sane will ever try to calculate such huge exponents, but if we + // need this, we stop on UINT64_MAX / 2 because we need to exit the loop + // below when the power of 2 exceeds our Exponent, and we want it to be + // 1u << 31 at most to not deal with unsigned overflow. + while (E != OpsAndLoops.end() && *I == *E && Exponent != MaxExponent) { + ++Exponent; + ++E; + } + assert(Exponent > 0 && "Trying to calculate a zeroth exponent of operand?"); + + // Calculate powers with exponents 1, 2, 4, 8 etc. and include those of them + // that are needed into the result. 
+ Value *P = expandCodeFor(I->second, Ty); + Value *Result = nullptr; + if (Exponent & 1) + Result = P; + for (uint64_t BinExp = 2; BinExp <= Exponent; BinExp <<= 1) { + P = InsertBinop(Instruction::Mul, P, P); + if (Exponent & BinExp) + Result = Result ? InsertBinop(Instruction::Mul, Result, P) : P; + } + + I = E; + assert(Result && "Nothing was expanded?"); + return Result; + }; + + while (I != OpsAndLoops.end()) { if (!Prod) { // This is the first operand. Just expand it. - Prod = expand(Op); - } else if (Op->isAllOnesValue()) { + Prod = ExpandOpBinPowN(); + } else if (I->second->isAllOnesValue()) { // Instead of doing a multiply by negative one, just do a negate. Prod = InsertNoopCastOfTo(Prod, Ty); Prod = InsertBinop(Instruction::Sub, Constant::getNullValue(Ty), Prod); + ++I; } else { // A simple mul. - Value *W = expandCodeFor(Op, Ty); + Value *W = ExpandOpBinPowN(); Prod = InsertNoopCastOfTo(Prod, Ty); // Canonicalize a constant to the RHS. if (isa<Constant>(Prod)) std::swap(Prod, W); diff --git a/lib/Analysis/TypeBasedAliasAnalysis.cpp b/lib/Analysis/TypeBasedAliasAnalysis.cpp index e920c4c4e6b2b..cd9972ab56a68 100644 --- a/lib/Analysis/TypeBasedAliasAnalysis.cpp +++ b/lib/Analysis/TypeBasedAliasAnalysis.cpp @@ -58,7 +58,7 @@ // // The struct type node has a name and a list of pairs, one pair for each member // of the struct. The first element of each pair is a type node (a struct type -// node or a sclar type node), specifying the type of the member, the second +// node or a scalar type node), specifying the type of the member, the second // element of each pair is the offset of the member. // // Given an example diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp index b065f427b06cb..fd6e3a643bf03 100644 --- a/lib/Analysis/ValueTracking.cpp +++ b/lib/Analysis/ValueTracking.cpp @@ -686,8 +686,7 @@ static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known, Known.One |= RHSKnown.Zero; // assume(v >> c = a) } else if (match(Arg, - m_c_ICmp(Pred, m_CombineOr(m_LShr(m_V, m_ConstantInt(C)), - m_AShr(m_V, m_ConstantInt(C))), + m_c_ICmp(Pred, m_Shr(m_V, m_ConstantInt(C)), m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { @@ -698,9 +697,7 @@ static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known, Known.Zero |= RHSKnown.Zero << C->getZExtValue(); Known.One |= RHSKnown.One << C->getZExtValue(); // assume(~(v >> c) = a) - } else if (match(Arg, m_c_ICmp(Pred, m_Not(m_CombineOr( - m_LShr(m_V, m_ConstantInt(C)), - m_AShr(m_V, m_ConstantInt(C)))), + } else if (match(Arg, m_c_ICmp(Pred, m_Not(m_Shr(m_V, m_ConstantInt(C))), m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { diff --git a/lib/BinaryFormat/Magic.cpp b/lib/BinaryFormat/Magic.cpp index ca4d93f99d92d..f24f22c88a8aa 100644 --- a/lib/BinaryFormat/Magic.cpp +++ b/lib/BinaryFormat/Magic.cpp @@ -51,7 +51,8 @@ file_magic llvm::identify_magic(StringRef Magic) { return file_magic::coff_import_library; } // Windows resource file - if (startswith(Magic, "\0\0\0\0\x20\0\0\0\xFF")) + if (Magic.size() >= sizeof(COFF::WinResMagic) && + memcmp(Magic.data(), COFF::WinResMagic, sizeof(COFF::WinResMagic)) == 0) return file_magic::windows_resource; // 0x0000 = COFF unknown machine type if (Magic[1] == 0) diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index ad348d723bae0..c48fcaa7b0d1d 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ 
-2801,26 +2801,24 @@ void AsmPrinter::emitXRayTable() { } // Before we switch over, we force a reference to a label inside the - // xray_instr_map and xray_fn_idx sections. Since this function is always - // called just before the function's end, we assume that this is happening - // after the last return instruction. We also use the synthetic label in the - // xray_inster_map as a delimeter for the range of sleds for this function in - // the index. + // xray_fn_idx sections. This makes sure that the xray_fn_idx section is kept + // live by the linker if the function is not garbage-collected. Since this + // function is always called just before the function's end, we assume that + // this is happening after the last return instruction. auto WordSizeBytes = MAI->getCodePointerSize(); - MCSymbol *SledsStart = OutContext.createTempSymbol("xray_synthetic_", true); MCSymbol *IdxRef = OutContext.createTempSymbol("xray_fn_idx_synth_", true); OutStreamer->EmitCodeAlignment(16); - OutStreamer->EmitSymbolValue(SledsStart, WordSizeBytes, false); OutStreamer->EmitSymbolValue(IdxRef, WordSizeBytes, false); // Now we switch to the instrumentation map section. Because this is done // per-function, we are able to create an index entry that will represent the // range of sleds associated with a function. + MCSymbol *SledsStart = OutContext.createTempSymbol("xray_sleds_start", true); OutStreamer->SwitchSection(InstMap); OutStreamer->EmitLabel(SledsStart); for (const auto &Sled : Sleds) Sled.emit(WordSizeBytes, OutStreamer.get(), CurrentFnSym); - MCSymbol *SledsEnd = OutContext.createTempSymbol("xray_synthetic_end", true); + MCSymbol *SledsEnd = OutContext.createTempSymbol("xray_sleds_end", true); OutStreamer->EmitLabel(SledsEnd); // We then emit a single entry in the index per function. We use the symbols diff --git a/lib/CodeGen/AsmPrinter/DIE.cpp b/lib/CodeGen/AsmPrinter/DIE.cpp index 30bfd7c94e68b..886e6e264b3ec 100644 --- a/lib/CodeGen/AsmPrinter/DIE.cpp +++ b/lib/CodeGen/AsmPrinter/DIE.cpp @@ -105,7 +105,7 @@ void DIEAbbrev::Emit(const AsmPrinter *AP) const { } LLVM_DUMP_METHOD -void DIEAbbrev::print(raw_ostream &O) { +void DIEAbbrev::print(raw_ostream &O) const { O << "Abbreviation @" << format("0x%lx", (long)(intptr_t)this) << " " @@ -128,7 +128,7 @@ void DIEAbbrev::print(raw_ostream &O) { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -LLVM_DUMP_METHOD void DIEAbbrev::dump() { +LLVM_DUMP_METHOD void DIEAbbrev::dump() const { print(dbgs()); } #endif @@ -268,7 +268,7 @@ void DIE::print(raw_ostream &O, unsigned IndentCount) const { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -LLVM_DUMP_METHOD void DIE::dump() { +LLVM_DUMP_METHOD void DIE::dump() const { print(dbgs()); } #endif diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 75eb355bfb543..f1b4d9f20ca96 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -972,16 +972,62 @@ DbgVariable *DwarfDebug::createConcreteVariable(DwarfCompileUnit &TheCU, return ConcreteVariables.back().get(); } -// Determine whether this DBG_VALUE is valid at the beginning of the function. -static bool validAtEntry(const MachineInstr *MInsn) { - auto MBB = MInsn->getParent(); - // Is it in the entry basic block? - if (!MBB->pred_empty()) +/// Determine whether a *singular* DBG_VALUE is valid for the entirety of its +/// enclosing lexical scope. 
The check ensures there are no other instructions +/// in the same lexical scope preceding the DBG_VALUE and that its range is +/// either open or otherwise rolls off the end of the scope. +static bool validThroughout(LexicalScopes &LScopes, + const MachineInstr *DbgValue, + const MachineInstr *RangeEnd) { + assert(DbgValue->getDebugLoc() && "DBG_VALUE without a debug location"); + auto MBB = DbgValue->getParent(); + auto DL = DbgValue->getDebugLoc(); + auto *LScope = LScopes.findLexicalScope(DL); + // Scope doesn't exist; this is a dead DBG_VALUE. + if (!LScope) return false; - for (MachineBasicBlock::const_reverse_iterator I(MInsn); I != MBB->rend(); ++I) - if (!(I->isDebugValue() || I->getFlag(MachineInstr::FrameSetup))) + auto &LSRange = LScope->getRanges(); + if (LSRange.size() == 0) + return false; + + // Determine if the DBG_VALUE is valid at the beginning of its lexical block. + const MachineInstr *LScopeBegin = LSRange.front().first; + // Early exit if the lexical scope begins outside of the current block. + if (LScopeBegin->getParent() != MBB) + return false; + MachineBasicBlock::const_reverse_iterator Pred(DbgValue); + for (++Pred; Pred != MBB->rend(); ++Pred) { + if (Pred->getFlag(MachineInstr::FrameSetup)) + break; + auto PredDL = Pred->getDebugLoc(); + if (!PredDL || Pred->isMetaInstruction()) + continue; + // Check whether the instruction preceding the DBG_VALUE is in the same + // (sub)scope as the DBG_VALUE. + if (DL->getScope() == PredDL->getScope()) return false; - return true; + auto *PredScope = LScopes.findLexicalScope(PredDL); + if (!PredScope || LScope->dominates(PredScope)) + return false; + } + + // If the range of the DBG_VALUE is open-ended, report success. + if (!RangeEnd) + return true; + + // Fail if there are instructions belonging to our scope in another block. + const MachineInstr *LScopeEnd = LSRange.back().second; + if (LScopeEnd->getParent() != MBB) + return false; + + // Single, constant DBG_VALUEs in the prologue are promoted to be live + // throughout the function. This is a hack, presumably for DWARF v2 and not + // necessarily correct. It would be much better to use a dbg.declare instead + // if we know the constant is live throughout the scope. + if (DbgValue->getOperand(0).isImm() && MBB->pred_empty()) + return true; + + return false; } // Find variables for each lexical scope. @@ -1016,11 +1062,9 @@ void DwarfDebug::collectVariableInfo(DwarfCompileUnit &TheCU, const MachineInstr *MInsn = Ranges.front().first; assert(MInsn->isDebugValue() && "History must begin with debug value"); - // Check if there is a single DBG_VALUE, valid throughout the function. - // A single constant is also considered valid for the entire function. + // Check if there is a single DBG_VALUE, valid throughout the var's scope. 
if (Ranges.size() == 1 && - (MInsn->getOperand(0).isImm() || - (validAtEntry(MInsn) && Ranges.front().second == nullptr))) { + validThroughout(LScopes, MInsn, Ranges.front().second)) { RegVar->initializeDbgValue(MInsn); continue; } diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt index 55a27e2fb79e5..7f3c6da912687 100644 --- a/lib/CodeGen/CMakeLists.txt +++ b/lib/CodeGen/CMakeLists.txt @@ -92,6 +92,7 @@ add_llvm_library(LLVMCodeGen PatchableFunction.cpp MIRPrinter.cpp MIRPrintingPass.cpp + MacroFusion.cpp OptimizePHIs.cpp ParallelCG.cpp PeepholeOptimizer.cpp diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp index 37e176099ea7a..cb31c21293f44 100644 --- a/lib/CodeGen/CodeGenPrepare.cpp +++ b/lib/CodeGen/CodeGenPrepare.cpp @@ -1663,17 +1663,18 @@ class MemCmpExpansion { bool IsUsedForZeroCmp; const DataLayout &DL; - int calculateNumBlocks(unsigned Size); + unsigned calculateNumBlocks(unsigned Size); void createLoadCmpBlocks(); void createResultBlock(); void setupResultBlockPHINodes(); void setupEndBlockPHINodes(); - void emitLoadCompareBlock(unsigned Index, int LoadSize, int GEPIndex); + void emitLoadCompareBlock(unsigned Index, unsigned LoadSize, + unsigned GEPIndex); Value *getCompareLoadPairs(unsigned Index, unsigned Size, unsigned &NumBytesProcessed, IRBuilder<> &Builder); void emitLoadCompareBlockMultipleLoads(unsigned Index, unsigned Size, unsigned &NumBytesProcessed); - void emitLoadCompareByteBlock(unsigned Index, int GEPIndex); + void emitLoadCompareByteBlock(unsigned Index, unsigned GEPIndex); void emitMemCmpResultBlock(); Value *getMemCmpExpansionZeroCase(unsigned Size); Value *getMemCmpEqZeroOneBlock(unsigned Size); @@ -1751,7 +1752,8 @@ void MemCmpExpansion::createResultBlock() { // It loads 1 byte from each source of the memcmp parameters with the given // GEPIndex. It then subtracts the two loaded values and adds this result to the // final phi node for selecting the memcmp result. -void MemCmpExpansion::emitLoadCompareByteBlock(unsigned Index, int GEPIndex) { +void MemCmpExpansion::emitLoadCompareByteBlock(unsigned Index, + unsigned GEPIndex) { IRBuilder<> Builder(CI->getContext()); Value *Source1 = CI->getArgOperand(0); @@ -1833,6 +1835,7 @@ Value *MemCmpExpansion::getCompareLoadPairs(unsigned Index, unsigned Size, Type *LoadSizeType = IntegerType::get(CI->getContext(), LoadSize * 8); Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); + assert(LoadSize <= MaxLoadSize && "Unexpected load type"); Value *Source1 = CI->getArgOperand(0); Value *Source2 = CI->getArgOperand(1); @@ -1851,18 +1854,28 @@ Value *MemCmpExpansion::getCompareLoadPairs(unsigned Index, unsigned Size, ConstantInt::get(LoadSizeType, GEPIndex)); } - // Load LoadSizeType from the base address. - Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); - Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); + // Get a constant or load a value for each source address. 
+ Value *LoadSrc1 = nullptr; + if (auto *Source1C = dyn_cast<Constant>(Source1)) + LoadSrc1 = ConstantFoldLoadFromConstPtr(Source1C, LoadSizeType, DL); + if (!LoadSrc1) + LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); + + Value *LoadSrc2 = nullptr; + if (auto *Source2C = dyn_cast<Constant>(Source2)) + LoadSrc2 = ConstantFoldLoadFromConstPtr(Source2C, LoadSizeType, DL); + if (!LoadSrc2) + LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); + if (NumLoads != 1) { if (LoadSizeType != MaxLoadType) { - LoadSrc1 = Builder.CreateZExtOrTrunc(LoadSrc1, MaxLoadType); - LoadSrc2 = Builder.CreateZExtOrTrunc(LoadSrc2, MaxLoadType); + LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType); + LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType); } // If we have multiple loads per block, we need to generate a composite // comparison using xor+or. Diff = Builder.CreateXor(LoadSrc1, LoadSrc2); - Diff = Builder.CreateZExtOrTrunc(Diff, MaxLoadType); + Diff = Builder.CreateZExt(Diff, MaxLoadType); XorList.push_back(Diff); } else { // If there's only one load per block, we just compare the loaded values. @@ -1926,8 +1939,8 @@ void MemCmpExpansion::emitLoadCompareBlockMultipleLoads( // the EndBlock if this is the last LoadCmpBlock. Loading 1 byte is handled with // a special case through emitLoadCompareByteBlock. The special handling can // simply subtract the loaded values and add it to the result phi node. -void MemCmpExpansion::emitLoadCompareBlock(unsigned Index, int LoadSize, - int GEPIndex) { +void MemCmpExpansion::emitLoadCompareBlock(unsigned Index, unsigned LoadSize, + unsigned GEPIndex) { if (LoadSize == 1) { MemCmpExpansion::emitLoadCompareByteBlock(Index, GEPIndex); return; @@ -1937,6 +1950,7 @@ void MemCmpExpansion::emitLoadCompareBlock(unsigned Index, int LoadSize, Type *LoadSizeType = IntegerType::get(CI->getContext(), LoadSize * 8); Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); + assert(LoadSize <= MaxLoadSize && "Unexpected load type"); Value *Source1 = CI->getArgOperand(0); Value *Source2 = CI->getArgOperand(1); @@ -1970,8 +1984,8 @@ void MemCmpExpansion::emitLoadCompareBlock(unsigned Index, int LoadSize, } if (LoadSizeType != MaxLoadType) { - LoadSrc1 = Builder.CreateZExtOrTrunc(LoadSrc1, MaxLoadType); - LoadSrc2 = Builder.CreateZExtOrTrunc(LoadSrc2, MaxLoadType); + LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType); + LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType); } // Add the loaded values to the phi nodes for calculating memcmp result only @@ -2034,8 +2048,8 @@ void MemCmpExpansion::emitMemCmpResultBlock() { PhiRes->addIncoming(Res, ResBlock.BB); } -int MemCmpExpansion::calculateNumBlocks(unsigned Size) { - int NumBlocks = 0; +unsigned MemCmpExpansion::calculateNumBlocks(unsigned Size) { + unsigned NumBlocks = 0; bool HaveOneByteLoad = false; unsigned RemainingSize = Size; unsigned LoadSize = MaxLoadSize; @@ -2104,13 +2118,13 @@ Value *MemCmpExpansion::getMemCmpExpansion(uint64_t Size) { // memcmp sources. It starts with loading using the maximum load size set by // the target. It processes any remaining bytes using a load size which is the // next smallest power of 2. - int LoadSize = MaxLoadSize; - int NumBytesToBeProcessed = Size; + unsigned LoadSize = MaxLoadSize; + unsigned NumBytesToBeProcessed = Size; unsigned Index = 0; while (NumBytesToBeProcessed) { // Calculate how many blocks we can create with the current load size. 
- int NumBlocks = NumBytesToBeProcessed / LoadSize; - int GEPIndex = (Size - NumBytesToBeProcessed) / LoadSize; + unsigned NumBlocks = NumBytesToBeProcessed / LoadSize; + unsigned GEPIndex = (Size - NumBytesToBeProcessed) / LoadSize; NumBytesToBeProcessed = NumBytesToBeProcessed % LoadSize; // For each NumBlocks, populate the instruction sequence for loading and diff --git a/lib/CodeGen/GlobalISel/IRTranslator.cpp b/lib/CodeGen/GlobalISel/IRTranslator.cpp index dccd8e0706ca6..239bad2f53557 100644 --- a/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -582,7 +582,7 @@ bool IRTranslator::translateOverflowIntrinsic(const CallInst &CI, unsigned Op, MIB.addUse(Zero); } - MIRBuilder.buildSequence(getOrCreateVReg(CI), Res, 0, Overflow, Width); + MIRBuilder.buildSequence(getOrCreateVReg(CI), {Res, Overflow}, {0, Width}); return true; } @@ -686,6 +686,13 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, .addUse(getOrCreateVReg(*CI.getArgOperand(0))) .addUse(getOrCreateVReg(*CI.getArgOperand(1))); return true; + case Intrinsic::fma: + MIRBuilder.buildInstr(TargetOpcode::G_FMA) + .addDef(getOrCreateVReg(CI)) + .addUse(getOrCreateVReg(*CI.getArgOperand(0))) + .addUse(getOrCreateVReg(*CI.getArgOperand(1))) + .addUse(getOrCreateVReg(*CI.getArgOperand(2))); + return true; case Intrinsic::memcpy: case Intrinsic::memmove: case Intrinsic::memset: diff --git a/lib/CodeGen/GlobalISel/InstructionSelector.cpp b/lib/CodeGen/GlobalISel/InstructionSelector.cpp index 4c0b06dffd216..5466efd7e90f4 100644 --- a/lib/CodeGen/GlobalISel/InstructionSelector.cpp +++ b/lib/CodeGen/GlobalISel/InstructionSelector.cpp @@ -25,6 +25,18 @@ using namespace llvm; InstructionSelector::InstructionSelector() {} +bool InstructionSelector::constrainOperandRegToRegClass( + MachineInstr &I, unsigned OpIdx, const TargetRegisterClass &RC, + const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, + const RegisterBankInfo &RBI) const { + MachineBasicBlock &MBB = *I.getParent(); + MachineFunction &MF = *MBB.getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + return llvm::constrainRegToClass(MRI, TII, RBI, I, + I.getOperand(OpIdx).getReg(), RC); +} + bool InstructionSelector::constrainSelectedInstRegOperands( MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI) const { diff --git a/lib/CodeGen/GlobalISel/Legalizer.cpp b/lib/CodeGen/GlobalISel/Legalizer.cpp index 1b50489deeba9..b699156c568b4 100644 --- a/lib/CodeGen/GlobalISel/Legalizer.cpp +++ b/lib/CodeGen/GlobalISel/Legalizer.cpp @@ -50,72 +50,9 @@ void Legalizer::getAnalysisUsage(AnalysisUsage &AU) const { void Legalizer::init(MachineFunction &MF) { } -bool Legalizer::combineExtracts(MachineInstr &MI, MachineRegisterInfo &MRI, - const TargetInstrInfo &TII) { - bool Changed = false; - if (MI.getOpcode() != TargetOpcode::G_EXTRACT) - return Changed; - - unsigned NumDefs = (MI.getNumOperands() - 1) / 2; - unsigned SrcReg = MI.getOperand(NumDefs).getReg(); - MachineInstr &SeqI = *MRI.def_instr_begin(SrcReg); - if (SeqI.getOpcode() != TargetOpcode::G_SEQUENCE) - return Changed; - - unsigned NumSeqSrcs = (SeqI.getNumOperands() - 1) / 2; - bool AllDefsReplaced = true; - - // Try to match each register extracted with a corresponding insertion formed - // by the G_SEQUENCE. 
- for (unsigned Idx = 0, SeqIdx = 0; Idx < NumDefs; ++Idx) { - MachineOperand &ExtractMO = MI.getOperand(Idx); - assert(ExtractMO.isReg() && ExtractMO.isDef() && - "unexpected extract operand"); - - unsigned ExtractReg = ExtractMO.getReg(); - unsigned ExtractPos = MI.getOperand(NumDefs + Idx + 1).getImm(); - - while (SeqIdx < NumSeqSrcs && - SeqI.getOperand(2 * SeqIdx + 2).getImm() < ExtractPos) - ++SeqIdx; - - if (SeqIdx == NumSeqSrcs) { - AllDefsReplaced = false; - continue; - } - - unsigned OrigReg = SeqI.getOperand(2 * SeqIdx + 1).getReg(); - if (SeqI.getOperand(2 * SeqIdx + 2).getImm() != ExtractPos || - MRI.getType(OrigReg) != MRI.getType(ExtractReg)) { - AllDefsReplaced = false; - continue; - } - - assert(!TargetRegisterInfo::isPhysicalRegister(OrigReg) && - "unexpected physical register in G_SEQUENCE"); - - // Finally we can replace the uses. - MRI.replaceRegWith(ExtractReg, OrigReg); - } - - if (AllDefsReplaced) { - // If SeqI was the next instruction in the BB and we removed it, we'd break - // the outer iteration. - assert(std::next(MachineBasicBlock::iterator(MI)) != SeqI && - "G_SEQUENCE does not dominate G_EXTRACT"); - - MI.eraseFromParent(); - - if (MRI.use_empty(SrcReg)) - SeqI.eraseFromParent(); - Changed = true; - } - - return Changed; -} - bool Legalizer::combineMerges(MachineInstr &MI, MachineRegisterInfo &MRI, - const TargetInstrInfo &TII) { + const TargetInstrInfo &TII, + MachineIRBuilder &MIRBuilder) { if (MI.getOpcode() != TargetOpcode::G_UNMERGE_VALUES) return false; @@ -125,18 +62,62 @@ bool Legalizer::combineMerges(MachineInstr &MI, MachineRegisterInfo &MRI, if (MergeI.getOpcode() != TargetOpcode::G_MERGE_VALUES) return false; - if (MergeI.getNumOperands() - 1 != NumDefs) - return false; + const unsigned NumMergeRegs = MergeI.getNumOperands() - 1; - // FIXME: is a COPY appropriate if the types mismatch? We know both registers - // are allocatable by now. - if (MRI.getType(MI.getOperand(0).getReg()) != - MRI.getType(MergeI.getOperand(1).getReg())) - return false; + if (NumMergeRegs < NumDefs) { + if (NumDefs % NumMergeRegs != 0) + return false; + + MIRBuilder.setInstr(MI); + // Transform to UNMERGEs, for example + // %1 = G_MERGE_VALUES %4, %5 + // %9, %10, %11, %12 = G_UNMERGE_VALUES %1 + // to + // %9, %10 = G_UNMERGE_VALUES %4 + // %11, %12 = G_UNMERGE_VALUES %5 + + const unsigned NewNumDefs = NumDefs / NumMergeRegs; + for (unsigned Idx = 0; Idx < NumMergeRegs; ++Idx) { + SmallVector<unsigned, 2> DstRegs; + for (unsigned j = 0, DefIdx = Idx * NewNumDefs; j < NewNumDefs; + ++j, ++DefIdx) + DstRegs.push_back(MI.getOperand(DefIdx).getReg()); + + MIRBuilder.buildUnmerge(DstRegs, MergeI.getOperand(Idx + 1).getReg()); + } + + } else if (NumMergeRegs > NumDefs) { + if (NumMergeRegs % NumDefs != 0) + return false; + + MIRBuilder.setInstr(MI); + // Transform to MERGEs + // %6 = G_MERGE_VALUES %17, %18, %19, %20 + // %7, %8 = G_UNMERGE_VALUES %6 + // to + // %7 = G_MERGE_VALUES %17, %18 + // %8 = G_MERGE_VALUES %19, %20 + + const unsigned NumRegs = NumMergeRegs / NumDefs; + for (unsigned DefIdx = 0; DefIdx < NumDefs; ++DefIdx) { + SmallVector<unsigned, 2> Regs; + for (unsigned j = 0, Idx = NumRegs * DefIdx + 1; j < NumRegs; ++j, ++Idx) + Regs.push_back(MergeI.getOperand(Idx).getReg()); + + MIRBuilder.buildMerge(MI.getOperand(DefIdx).getReg(), Regs); + } - for (unsigned Idx = 0; Idx < NumDefs; ++Idx) - MRI.replaceRegWith(MI.getOperand(Idx).getReg(), - MergeI.getOperand(Idx + 1).getReg()); + } else { + // FIXME: is a COPY appropriate if the types mismatch? 
We know both + // registers are allocatable by now. + if (MRI.getType(MI.getOperand(0).getReg()) != + MRI.getType(MergeI.getOperand(1).getReg())) + return false; + + for (unsigned Idx = 0; Idx < NumDefs; ++Idx) + MRI.replaceRegWith(MI.getOperand(Idx).getReg(), + MergeI.getOperand(Idx + 1).getReg()); + } MI.eraseFromParent(); if (MRI.use_empty(MergeI.getOperand(0).getReg())) @@ -226,13 +207,7 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) { // Get the next Instruction before we try to legalize, because there's a // good chance MI will be deleted. NextMI = std::next(MI); - - // combineExtracts erases MI. - if (combineExtracts(*MI, MRI, TII)) { - Changed = true; - continue; - } - Changed |= combineMerges(*MI, MRI, TII); + Changed |= combineMerges(*MI, MRI, TII, Helper.MIRBuilder); } } diff --git a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp index 4d45910422967..595802f2228b9 100644 --- a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp +++ b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp @@ -75,8 +75,7 @@ LegalizerInfo::getAction(const InstrAspect &Aspect) const { // FIXME: the long-term plan calls for expansion in terms of load/store (if // they're not legal). - if (Aspect.Opcode == TargetOpcode::G_SEQUENCE || - Aspect.Opcode == TargetOpcode::G_EXTRACT || + if (Aspect.Opcode == TargetOpcode::G_EXTRACT || Aspect.Opcode == TargetOpcode::G_MERGE_VALUES || Aspect.Opcode == TargetOpcode::G_UNMERGE_VALUES) return std::make_pair(Legal, Aspect.Type); diff --git a/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp index 79d312fb52ca4..3c70013ea296b 100644 --- a/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp +++ b/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp @@ -425,10 +425,8 @@ MachineInstrBuilder MachineIRBuilder::buildExtract(unsigned Res, unsigned Src, .addImm(Index); } -MachineInstrBuilder -MachineIRBuilder::buildSequence(unsigned Res, - ArrayRef<unsigned> Ops, - ArrayRef<uint64_t> Indices) { +void MachineIRBuilder::buildSequence(unsigned Res, ArrayRef<unsigned> Ops, + ArrayRef<uint64_t> Indices) { #ifndef NDEBUG assert(Ops.size() == Indices.size() && "incompatible args"); assert(!Ops.empty() && "invalid trivial sequence"); @@ -440,13 +438,31 @@ MachineIRBuilder::buildSequence(unsigned Res, assert(MRI->getType(Op).isValid() && "invalid operand type"); #endif - MachineInstrBuilder MIB = buildInstr(TargetOpcode::G_SEQUENCE); - MIB.addDef(Res); + LLT ResTy = MRI->getType(Res); + LLT OpTy = MRI->getType(Ops[0]); + unsigned OpSize = OpTy.getSizeInBits(); + bool MaybeMerge = true; for (unsigned i = 0; i < Ops.size(); ++i) { - MIB.addUse(Ops[i]); - MIB.addImm(Indices[i]); + if (MRI->getType(Ops[i]) != OpTy || Indices[i] != i * OpSize) { + MaybeMerge = false; + break; + } + } + + if (MaybeMerge && Ops.size() * OpSize == ResTy.getSizeInBits()) { + buildMerge(Res, Ops); + return; + } + + unsigned ResIn = MRI->createGenericVirtualRegister(ResTy); + buildUndef(ResIn); + + for (unsigned i = 0; i < Ops.size(); ++i) { + unsigned ResOut = + i + 1 == Ops.size() ? 
Res : MRI->createGenericVirtualRegister(ResTy); + buildInsert(ResOut, ResIn, Ops[i], Indices[i]); + ResIn = ResOut; } - return MIB; } MachineInstrBuilder MachineIRBuilder::buildUndef(unsigned Res) { diff --git a/lib/CodeGen/GlobalISel/Utils.cpp b/lib/CodeGen/GlobalISel/Utils.cpp index 254bdf10d804f..5ecaf5c563f82 100644 --- a/lib/CodeGen/GlobalISel/Utils.cpp +++ b/lib/CodeGen/GlobalISel/Utils.cpp @@ -26,6 +26,23 @@ using namespace llvm; +unsigned llvm::constrainRegToClass(MachineRegisterInfo &MRI, + const TargetInstrInfo &TII, + const RegisterBankInfo &RBI, + MachineInstr &InsertPt, unsigned Reg, + const TargetRegisterClass &RegClass) { + if (!RBI.constrainGenericRegister(Reg, RegClass, MRI)) { + unsigned NewReg = MRI.createVirtualRegister(&RegClass); + BuildMI(*InsertPt.getParent(), InsertPt, InsertPt.getDebugLoc(), + TII.get(TargetOpcode::COPY), NewReg) + .addReg(Reg); + return NewReg; + } + + return Reg; +} + + unsigned llvm::constrainOperandRegClass( const MachineFunction &MF, const TargetRegisterInfo &TRI, MachineRegisterInfo &MRI, const TargetInstrInfo &TII, @@ -36,16 +53,7 @@ unsigned llvm::constrainOperandRegClass( "PhysReg not implemented"); const TargetRegisterClass *RegClass = TII.getRegClass(II, OpIdx, &TRI, MF); - - if (!RBI.constrainGenericRegister(Reg, *RegClass, MRI)) { - unsigned NewReg = MRI.createVirtualRegister(RegClass); - BuildMI(*InsertPt.getParent(), InsertPt, InsertPt.getDebugLoc(), - TII.get(TargetOpcode::COPY), NewReg) - .addReg(Reg); - return NewReg; - } - - return Reg; + return constrainRegToClass(MRI, TII, RBI, InsertPt, Reg, *RegClass); } bool llvm::isTriviallyDead(const MachineInstr &MI, diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp index c98c9b68ac0e4..ff8405366173e 100644 --- a/lib/CodeGen/IfConversion.cpp +++ b/lib/CodeGen/IfConversion.cpp @@ -1474,8 +1474,11 @@ bool IfConverter::IfConvertSimple(BBInfo &BBI, IfcvtKind Kind) { DontKill.addLiveIns(NextMBB); } + // Remove the branches from the entry so we can add the contents of the true + // block to it. + BBI.NonPredSize -= TII->removeBranch(*BBI.BB); + if (CvtMBB.pred_size() > 1) { - BBI.NonPredSize -= TII->removeBranch(*BBI.BB); // Copy instructions in the true block, predicate them, and add them to // the entry block. CopyAndPredicateBlock(BBI, *CvtBBI, Cond); @@ -1484,11 +1487,11 @@ bool IfConverter::IfConvertSimple(BBInfo &BBI, IfcvtKind Kind) { // explicitly remove CvtBBI as a successor. BBI.BB->removeSuccessor(&CvtMBB, true); } else { + // Predicate the instructions in the true block. RemoveKills(CvtMBB.begin(), CvtMBB.end(), DontKill, *TRI); PredicateBlock(*CvtBBI, CvtMBB.end(), Cond); // Merge converted block into entry block. - BBI.NonPredSize -= TII->removeBranch(*BBI.BB); MergeBlocks(BBI, *CvtBBI); } @@ -1588,8 +1591,11 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { BBCvt = MBPI->getEdgeProbability(BBI.BB, &CvtMBB); } + // Remove the branches from the entry so we can add the contents of the true + // block to it. + BBI.NonPredSize -= TII->removeBranch(*BBI.BB); + if (CvtMBB.pred_size() > 1) { - BBI.NonPredSize -= TII->removeBranch(*BBI.BB); // Copy instructions in the true block, predicate them, and add them to // the entry block. CopyAndPredicateBlock(BBI, *CvtBBI, Cond, true); @@ -1603,7 +1609,6 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { PredicateBlock(*CvtBBI, CvtMBB.end(), Cond); // Now merge the entry of the triangle with the true block. 
- BBI.NonPredSize -= TII->removeBranch(*BBI.BB);
MergeBlocks(BBI, *CvtBBI, false);
}
diff --git a/lib/CodeGen/ImplicitNullChecks.cpp b/lib/CodeGen/ImplicitNullChecks.cpp
index b831ddfa601a6..e308f49ec4e85 100644
--- a/lib/CodeGen/ImplicitNullChecks.cpp
+++ b/lib/CodeGen/ImplicitNullChecks.cpp
@@ -359,30 +359,15 @@ ImplicitNullChecks::isSuitableMemoryOp(MachineInstr &MI, unsigned PointerReg,
Offset < PageSize))
return SR_Unsuitable;
- // Finally, we need to make sure that the access instruction actually is
- // accessing from PointerReg, and there isn't some re-definition of PointerReg
- // between the compare and the memory access.
- // If PointerReg has been redefined before then there is no sense to continue
- // lookup due to this condition will fail for any further instruction.
- SuitabilityResult Suitable = SR_Suitable;
- for (auto *PrevMI : PrevInsts)
- for (auto &PrevMO : PrevMI->operands()) {
- if (PrevMO.isReg() && PrevMO.getReg() && PrevMO.isDef() &&
- TRI->regsOverlap(PrevMO.getReg(), PointerReg))
- return SR_Impossible;
-
- // Check whether the current memory access aliases with previous one.
- // If we already found that it aliases then no need to continue.
- // But we continue base pointer check as it can result in SR_Impossible.
- if (Suitable == SR_Suitable) {
- AliasResult AR = areMemoryOpsAliased(MI, PrevMI);
- if (AR == AR_WillAliasEverything)
- return SR_Impossible;
- if (AR == AR_MayAlias)
- Suitable = SR_Unsuitable;
- }
- }
- return Suitable;
+ // Finally, check whether the current memory access aliases with a previous one.
+ for (auto *PrevMI : PrevInsts) {
+ AliasResult AR = areMemoryOpsAliased(MI, PrevMI);
+ if (AR == AR_WillAliasEverything)
+ return SR_Impossible;
+ if (AR == AR_MayAlias)
+ return SR_Unsuitable;
+ }
+ return SR_Suitable;
}
bool ImplicitNullChecks::canHoistInst(MachineInstr *FaultingMI,
@@ -569,6 +554,12 @@ bool ImplicitNullChecks::analyzeBlockForNullChecks(
return true;
}
+ // If MI re-defines the PointerReg, we cannot move further.
+ if (any_of(MI.operands(), [&](MachineOperand &MO) {
+ return MO.isReg() && MO.getReg() && MO.isDef() &&
+ TRI->regsOverlap(MO.getReg(), PointerReg);
+ }))
+ return false;
InstsSeenSoFar.push_back(&MI);
}
diff --git a/lib/CodeGen/LiveDebugVariables.cpp b/lib/CodeGen/LiveDebugVariables.cpp
index bbd783367c9e8..0c76478af551f 100644
--- a/lib/CodeGen/LiveDebugVariables.cpp
+++ b/lib/CodeGen/LiveDebugVariables.cpp
@@ -1006,7 +1006,7 @@ bool LiveDebugVariables::doInitialization(Module &M) {
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-LLVM_DUMP_METHOD void LiveDebugVariables::dump() {
+LLVM_DUMP_METHOD void LiveDebugVariables::dump() const {
if (pImpl)
static_cast<LDVImpl*>(pImpl)->print(dbgs());
}
diff --git a/lib/CodeGen/LiveDebugVariables.h b/lib/CodeGen/LiveDebugVariables.h
index afe87a52544d8..1d7e3d4371a24 100644
--- a/lib/CodeGen/LiveDebugVariables.h
+++ b/lib/CodeGen/LiveDebugVariables.h
@@ -59,7 +59,7 @@ public:
void emitDebugValues(VirtRegMap *VRM);
/// dump - Print data structures to dbgs().
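The ImplicitNullChecks change above splits one responsibility in two: isSuitableMemoryOp now only answers the aliasing question, while the scan loop in analyzeBlockForNullChecks gives up outright once anything redefines PointerReg, since no instruction past a redefinition can ever be hoisted. A toy model of that cutoff (simplified stand-in types, not the LLVM API):

    #include <cstddef>
    #include <vector>

    struct Operand { bool IsDef; unsigned Reg; };
    struct Instr { std::vector<Operand> Operands; };

    // Returns how many leading instructions are still worth considering: the
    // scan stops at the first redefinition of the tracked pointer register.
    static std::size_t usableWindow(const std::vector<Instr> &Block,
                                    unsigned PointerReg) {
      for (std::size_t I = 0; I != Block.size(); ++I)
        for (const Operand &Op : Block[I].Operands)
          if (Op.IsDef && Op.Reg == PointerReg)
            return I; // nothing after this point can be hoisted
      return Block.size();
    }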
- void dump();
+ void dump() const;
private:
diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp
index 590acc01008a6..81597afe6b02b 100644
--- a/lib/CodeGen/MachineBasicBlock.cpp
+++ b/lib/CodeGen/MachineBasicBlock.cpp
@@ -228,6 +228,12 @@ LLVM_DUMP_METHOD void MachineBasicBlock::dump() const {
}
#endif
+bool MachineBasicBlock::isLegalToHoistInto() const {
+ if (isReturnBlock() || hasEHPadSuccessor())
+ return false;
+ return true;
+}
+
StringRef MachineBasicBlock::getName() const {
if (const BasicBlock *LBB = getBasicBlock())
return LBB->getName();
diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp
index 2a6cb07dbd2da..81c6dace92e04 100644
--- a/lib/CodeGen/MachineInstr.cpp
+++ b/lib/CodeGen/MachineInstr.cpp
@@ -21,6 +21,7 @@
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -558,6 +559,23 @@ unsigned MachinePointerInfo::getAddrSpace() const {
return cast<PointerType>(V.get<const Value*>()->getType())->getAddressSpace();
}
+/// isDereferenceable - Return true if V is always dereferenceable for
+/// Offset + Size bytes.
+bool MachinePointerInfo::isDereferenceable(unsigned Size, LLVMContext &C,
+ const DataLayout &DL) const {
+ if (!V.is<const Value*>())
+ return false;
+
+ const Value *BasePtr = V.get<const Value*>();
+ if (BasePtr == nullptr)
+ return false;
+
+ return isDereferenceableAndAlignedPointer(BasePtr, 1,
+ APInt(DL.getPointerSize(),
+ Offset + Size),
+ DL);
+}
+
/// getConstantPool - Return a MachinePointerInfo record that refers to the
/// constant pool.
MachinePointerInfo MachinePointerInfo::getConstantPool(MachineFunction &MF) {
diff --git a/lib/CodeGen/MachineModuleInfoImpls.cpp b/lib/CodeGen/MachineModuleInfoImpls.cpp
index 4c81fd91cb829..22d519e5d88fa 100644
--- a/lib/CodeGen/MachineModuleInfoImpls.cpp
+++ b/lib/CodeGen/MachineModuleInfoImpls.cpp
@@ -23,7 +23,6 @@ using namespace llvm;
// Out of line virtual method.
void MachineModuleInfoMachO::anchor() {}
void MachineModuleInfoELF::anchor() {}
-void MachineModuleInfoWasm::anchor() {}
static int SortSymbolPair(const void *LHS, const void *RHS) {
typedef std::pair<MCSymbol*, MachineModuleInfoImpl::StubValueTy> PairTy;
diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp
index 01a2286b8d66a..eaba9a58557c3 100644
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -542,10 +542,10 @@ void MachineSchedulerBase::print(raw_ostream &O, const Module* m) const {
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-LLVM_DUMP_METHOD void ReadyQueue::dump() {
+LLVM_DUMP_METHOD void ReadyQueue::dump() const {
dbgs() << "Queue " << Name << ": ";
- for (unsigned i = 0, e = Queue.size(); i < e; ++i)
- dbgs() << Queue[i]->NodeNum << " ";
+ for (const SUnit *SU : Queue)
+ dbgs() << SU->NodeNum << " ";
dbgs() << "\n";
}
#endif
@@ -609,10 +609,8 @@ void ScheduleDAGMI::releaseSucc(SUnit *SU, SDep *SuccEdge) {
/// releaseSuccessors - Call releaseSucc on each of SU's successors.
void ScheduleDAGMI::releaseSuccessors(SUnit *SU) {
- for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
- I != E; ++I) {
- releaseSucc(SU, &*I);
- }
+ for (SDep &Succ : SU->Succs)
+ releaseSucc(SU, &Succ);
}
/// ReleasePred - Decrement the NumSuccsLeft count of a predecessor.
When @@ -648,10 +646,8 @@ void ScheduleDAGMI::releasePred(SUnit *SU, SDep *PredEdge) { /// releasePredecessors - Call releasePred on each of SU's predecessors. void ScheduleDAGMI::releasePredecessors(SUnit *SU) { - for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); - I != E; ++I) { - releasePred(SU, &*I); - } + for (SDep &Pred : SU->Preds) + releasePred(SU, &Pred); } /// enterRegion - Called back from MachineScheduler::runOnMachineFunction after @@ -724,8 +720,8 @@ void ScheduleDAGMI::schedule() { DEBUG( if (EntrySU.getInstr() != nullptr) EntrySU.dumpAll(this); - for (unsigned su = 0, e = SUnits.size(); su != e; ++su) - SUnits[su].dumpAll(this); + for (const SUnit &SU : SUnits) + SU.dumpAll(this); if (ExitSU.getInstr() != nullptr) ExitSU.dumpAll(this); ); @@ -786,28 +782,25 @@ void ScheduleDAGMI::schedule() { /// Apply each ScheduleDAGMutation step in order. void ScheduleDAGMI::postprocessDAG() { - for (unsigned i = 0, e = Mutations.size(); i < e; ++i) { - Mutations[i]->apply(this); - } + for (auto &m : Mutations) + m->apply(this); } void ScheduleDAGMI:: findRootsAndBiasEdges(SmallVectorImpl<SUnit*> &TopRoots, SmallVectorImpl<SUnit*> &BotRoots) { - for (std::vector<SUnit>::iterator - I = SUnits.begin(), E = SUnits.end(); I != E; ++I) { - SUnit *SU = &(*I); - assert(!SU->isBoundaryNode() && "Boundary node should not be in SUnits"); + for (SUnit &SU : SUnits) { + assert(!SU.isBoundaryNode() && "Boundary node should not be in SUnits"); // Order predecessors so DFSResult follows the critical path. - SU->biasCriticalPath(); + SU.biasCriticalPath(); // A SUnit is ready to top schedule if it has no predecessors. - if (!I->NumPredsLeft) - TopRoots.push_back(SU); + if (!SU.NumPredsLeft) + TopRoots.push_back(&SU); // A SUnit is ready to bottom schedule if it has no successors. - if (!I->NumSuccsLeft) - BotRoots.push_back(SU); + if (!SU.NumSuccsLeft) + BotRoots.push_back(&SU); } ExitSU.biasCriticalPath(); } @@ -822,10 +815,9 @@ void ScheduleDAGMI::initQueues(ArrayRef<SUnit*> TopRoots, // // Nodes with unreleased weak edges can still be roots. // Release top roots in forward order. - for (SmallVectorImpl<SUnit*>::const_iterator - I = TopRoots.begin(), E = TopRoots.end(); I != E; ++I) { - SchedImpl->releaseTopNode(*I); - } + for (SUnit *SU : TopRoots) + SchedImpl->releaseTopNode(SU); + // Release bottom roots in reverse order so the higher priority nodes appear // first. This is more natural and slightly more efficient. 
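Most of the MachineScheduler hunks in this section are mechanical cleanups with no functional change; the recurring pattern is the replacement of explicit iterator loops with range-based for, as in this pair lifted from the hunks above:

    // Before: manual iterator bookkeeping.
    for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
         I != E; ++I)
      releaseSucc(SU, &*I);

    // After: the element type (SDep) is named directly and the loop body
    // cannot mis-increment or forget to dereference the iterator.
    for (SDep &Succ : SU->Succs)
      releaseSucc(SU, &Succ);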
for (SmallVectorImpl<SUnit*>::const_reverse_iterator @@ -1029,9 +1021,9 @@ void ScheduleDAGMILive::initRegPressure() { } } DEBUG(dbgs() << "Excess PSets: "; - for (unsigned i = 0, e = RegionCriticalPSets.size(); i != e; ++i) + for (const PressureChange &RCPS : RegionCriticalPSets) dbgs() << TRI->getRegPressureSetName( - RegionCriticalPSets[i].getPSet()) << " "; + RCPS.getPSet()) << " "; dbgs() << "\n"); } @@ -1040,11 +1032,10 @@ updateScheduledPressure(const SUnit *SU, const std::vector<unsigned> &NewMaxPressure) { const PressureDiff &PDiff = getPressureDiff(SU); unsigned CritIdx = 0, CritEnd = RegionCriticalPSets.size(); - for (PressureDiff::const_iterator I = PDiff.begin(), E = PDiff.end(); - I != E; ++I) { - if (!I->isValid()) + for (const PressureChange &PC : PDiff) { + if (!PC.isValid()) break; - unsigned ID = I->getPSet(); + unsigned ID = PC.getPSet(); while (CritIdx != CritEnd && RegionCriticalPSets[CritIdx].getPSet() < ID) ++CritIdx; if (CritIdx != CritEnd && RegionCriticalPSets[CritIdx].getPSet() == ID) { @@ -1508,8 +1499,7 @@ createStoreClusterDAGMutation(const TargetInstrInfo *TII, void BaseMemOpClusterMutation::clusterNeighboringMemOps( ArrayRef<SUnit *> MemOps, ScheduleDAGMI *DAG) { SmallVector<MemOpInfo, 32> MemOpRecords; - for (unsigned Idx = 0, End = MemOps.size(); Idx != End; ++Idx) { - SUnit *SU = MemOps[Idx]; + for (SUnit *SU : MemOps) { unsigned BaseReg; int64_t Offset; if (TII->getMemOpBaseRegImmOfs(*SU->getInstr(), BaseReg, Offset, TRI)) @@ -1537,12 +1527,11 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps( // dependent on SUa can prevent load combining due to register reuse. // Predecessor edges do not need to be copied from SUb to SUa since nearby // loads should have effectively the same inputs. - for (SUnit::const_succ_iterator - SI = SUa->Succs.begin(), SE = SUa->Succs.end(); SI != SE; ++SI) { - if (SI->getSUnit() == SUb) + for (const SDep &Succ : SUa->Succs) { + if (Succ.getSUnit() == SUb) continue; - DEBUG(dbgs() << " Copy Succ SU(" << SI->getSUnit()->NodeNum << ")\n"); - DAG->addEdge(SI->getSUnit(), SDep(SUb, SDep::Artificial)); + DEBUG(dbgs() << " Copy Succ SU(" << Succ.getSUnit()->NodeNum << ")\n"); + DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial)); } ++ClusterLength; } else @@ -1559,17 +1548,15 @@ void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAGInstrs) { DenseMap<unsigned, unsigned> StoreChainIDs; // Map each store chain to a set of dependent MemOps. SmallVector<SmallVector<SUnit*,4>, 32> StoreChainDependents; - for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) { - SUnit *SU = &DAG->SUnits[Idx]; - if ((IsLoad && !SU->getInstr()->mayLoad()) || - (!IsLoad && !SU->getInstr()->mayStore())) + for (SUnit &SU : DAG->SUnits) { + if ((IsLoad && !SU.getInstr()->mayLoad()) || + (!IsLoad && !SU.getInstr()->mayStore())) continue; unsigned ChainPredID = DAG->SUnits.size(); - for (SUnit::const_pred_iterator - PI = SU->Preds.begin(), PE = SU->Preds.end(); PI != PE; ++PI) { - if (PI->isCtrl()) { - ChainPredID = PI->getSUnit()->NodeNum; + for (const SDep &Pred : SU.Preds) { + if (Pred.isCtrl()) { + ChainPredID = Pred.getSUnit()->NodeNum; break; } } @@ -1580,12 +1567,12 @@ void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAGInstrs) { StoreChainIDs.insert(std::make_pair(ChainPredID, NumChains)); if (Result.second) StoreChainDependents.resize(NumChains + 1); - StoreChainDependents[Result.first->second].push_back(SU); + StoreChainDependents[Result.first->second].push_back(&SU); } // Iterate over the store chains. 
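The StoreChainIDs bookkeeping in the hunk above leans on the insert-or-lookup idiom: DenseMap::insert returns an (iterator, inserted) pair, so one call both assigns a dense index to a newly seen chain and finds the index of a known one. The same idiom with std::map as a stand-in, runnable in isolation:

    #include <map>
    #include <vector>

    int main() {
      std::map<unsigned, unsigned> ChainIDs;     // chain pred -> dense index
      std::vector<std::vector<unsigned>> Chains; // per-chain members
      for (unsigned Pred : {7u, 3u, 7u}) {
        auto Result =
            ChainIDs.insert({Pred, static_cast<unsigned>(Chains.size())});
        if (Result.second)                  // first sighting of this chain
          Chains.resize(Chains.size() + 1);
        Chains[Result.first->second].push_back(Pred); // both 7s share a bucket
      }
      return 0;
    }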
- for (unsigned Idx = 0, End = StoreChainDependents.size(); Idx != End; ++Idx) - clusterNeighboringMemOps(StoreChainDependents[Idx], DAG); + for (auto &SCD : StoreChainDependents) + clusterNeighboringMemOps(SCD, DAG); } //===----------------------------------------------------------------------===// @@ -1728,16 +1715,14 @@ void CopyConstrain::constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG) { const VNInfo *LastLocalVN = LocalLI->getVNInfoBefore(LocalLI->endIndex()); MachineInstr *LastLocalDef = LIS->getInstructionFromIndex(LastLocalVN->def); SUnit *LastLocalSU = DAG->getSUnit(LastLocalDef); - for (SUnit::const_succ_iterator - I = LastLocalSU->Succs.begin(), E = LastLocalSU->Succs.end(); - I != E; ++I) { - if (I->getKind() != SDep::Data || I->getReg() != LocalReg) + for (const SDep &Succ : LastLocalSU->Succs) { + if (Succ.getKind() != SDep::Data || Succ.getReg() != LocalReg) continue; - if (I->getSUnit() == GlobalSU) + if (Succ.getSUnit() == GlobalSU) continue; - if (!DAG->canAddEdge(GlobalSU, I->getSUnit())) + if (!DAG->canAddEdge(GlobalSU, Succ.getSUnit())) return; - LocalUses.push_back(I->getSUnit()); + LocalUses.push_back(Succ.getSUnit()); } // Open the top of the GlobalLI hole by constraining any earlier global uses // to precede the start of LocalLI. @@ -1745,15 +1730,14 @@ void CopyConstrain::constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG) { MachineInstr *FirstLocalDef = LIS->getInstructionFromIndex(LocalLI->beginIndex()); SUnit *FirstLocalSU = DAG->getSUnit(FirstLocalDef); - for (SUnit::const_pred_iterator - I = GlobalSU->Preds.begin(), E = GlobalSU->Preds.end(); I != E; ++I) { - if (I->getKind() != SDep::Anti || I->getReg() != GlobalReg) + for (const SDep &Pred : GlobalSU->Preds) { + if (Pred.getKind() != SDep::Anti || Pred.getReg() != GlobalReg) continue; - if (I->getSUnit() == FirstLocalSU) + if (Pred.getSUnit() == FirstLocalSU) continue; - if (!DAG->canAddEdge(FirstLocalSU, I->getSUnit())) + if (!DAG->canAddEdge(FirstLocalSU, Pred.getSUnit())) return; - GlobalUses.push_back(I->getSUnit()); + GlobalUses.push_back(Pred.getSUnit()); } DEBUG(dbgs() << "Constraining copy SU(" << CopySU->NodeNum << ")\n"); // Add the weak edges. 
@@ -1784,12 +1768,11 @@ void CopyConstrain::apply(ScheduleDAGInstrs *DAGInstrs) { RegionEndIdx = DAG->getLIS()->getInstructionIndex( *priorNonDebug(DAG->end(), DAG->begin())); - for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) { - SUnit *SU = &DAG->SUnits[Idx]; - if (!SU->getInstr()->isCopy()) + for (SUnit &SU : DAG->SUnits) { + if (!SU.getInstr()->isCopy()) continue; - constrainLocalCopy(SU, static_cast<ScheduleDAGMILive*>(DAG)); + constrainLocalCopy(&SU, static_cast<ScheduleDAGMILive*>(DAG)); } } @@ -1840,10 +1823,9 @@ init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel) { if (!SchedModel->hasInstrSchedModel()) return; RemainingCounts.resize(SchedModel->getNumProcResourceKinds()); - for (std::vector<SUnit>::iterator - I = DAG->SUnits.begin(), E = DAG->SUnits.end(); I != E; ++I) { - const MCSchedClassDesc *SC = DAG->getSchedClass(&*I); - RemIssueCount += SchedModel->getNumMicroOps(I->getInstr(), SC) + for (SUnit &SU : DAG->SUnits) { + const MCSchedClassDesc *SC = DAG->getSchedClass(&SU); + RemIssueCount += SchedModel->getNumMicroOps(SU.getInstr(), SC) * SchedModel->getMicroOpFactor(); for (TargetSchedModel::ProcResIter PI = SchedModel->getWriteProcResBegin(SC), @@ -1957,12 +1939,11 @@ unsigned SchedBoundary:: findMaxLatency(ArrayRef<SUnit*> ReadySUs) { SUnit *LateSU = nullptr; unsigned RemLatency = 0; - for (ArrayRef<SUnit*>::iterator I = ReadySUs.begin(), E = ReadySUs.end(); - I != E; ++I) { - unsigned L = getUnscheduledLatency(*I); + for (SUnit *SU : ReadySUs) { + unsigned L = getUnscheduledLatency(SU); if (L > RemLatency) { RemLatency = L; - LateSU = *I; + LateSU = SU; } } if (LateSU) { @@ -2328,7 +2309,7 @@ SUnit *SchedBoundary::pickOnlyChoice() { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) // This is useful information to dump after bumpNode. // Note that the Queue contents are more useful before pickNodeFromQueue. -LLVM_DUMP_METHOD void SchedBoundary::dumpScheduledState() { +LLVM_DUMP_METHOD void SchedBoundary::dumpScheduledState() const { unsigned ResFactor; unsigned ResCount; if (ZoneCritResIdx) { @@ -2667,7 +2648,7 @@ void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin, } } -void GenericScheduler::dumpPolicy() { +void GenericScheduler::dumpPolicy() const { // Cannot completely remove virtual function even in release mode. #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) dbgs() << "GenericScheduler RegionPolicy: " @@ -2719,10 +2700,9 @@ void GenericScheduler::registerRoots() { Rem.CriticalPath = DAG->ExitSU.getDepth(); // Some roots may not feed into ExitSU. Check all of them in case. - for (std::vector<SUnit*>::const_iterator - I = Bot.Available.begin(), E = Bot.Available.end(); I != E; ++I) { - if ((*I)->getDepth() > Rem.CriticalPath) - Rem.CriticalPath = (*I)->getDepth(); + for (const SUnit *SU : Bot.Available) { + if (SU->getDepth() > Rem.CriticalPath) + Rem.CriticalPath = SU->getDepth(); } DEBUG(dbgs() << "Critical Path(GS-RR ): " << Rem.CriticalPath << '\n'); if (DumpCriticalPathLength) { @@ -2969,10 +2949,10 @@ void GenericScheduler::pickNodeFromQueue(SchedBoundary &Zone, RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker); ReadyQueue &Q = Zone.Available; - for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) { + for (SUnit *SU : Q) { SchedCandidate TryCand(ZonePolicy); - initCandidate(TryCand, *I, Zone.isTop(), RPTracker, TempTracker); + initCandidate(TryCand, SU, Zone.isTop(), RPTracker, TempTracker); // Pass SchedBoundary only when comparing nodes from the same boundary. 
SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr; tryCandidate(Cand, TryCand, ZoneArg); @@ -3118,18 +3098,17 @@ void GenericScheduler::reschedulePhysRegCopies(SUnit *SU, bool isTop) { // Find already scheduled copies with a single physreg dependence and move // them just above the scheduled instruction. - for (SmallVectorImpl<SDep>::iterator I = Deps.begin(), E = Deps.end(); - I != E; ++I) { - if (I->getKind() != SDep::Data || !TRI->isPhysicalRegister(I->getReg())) + for (SDep &Dep : Deps) { + if (Dep.getKind() != SDep::Data || !TRI->isPhysicalRegister(Dep.getReg())) continue; - SUnit *DepSU = I->getSUnit(); + SUnit *DepSU = Dep.getSUnit(); if (isTop ? DepSU->Succs.size() > 1 : DepSU->Preds.size() > 1) continue; MachineInstr *Copy = DepSU->getInstr(); if (!Copy->isCopy()) continue; DEBUG(dbgs() << " Rescheduling physreg copy "; - I->getSUnit()->dump(DAG)); + Dep.getSUnit()->dump(DAG)); DAG->moveInstruction(Copy, InsertPos); } } @@ -3204,10 +3183,9 @@ void PostGenericScheduler::registerRoots() { Rem.CriticalPath = DAG->ExitSU.getDepth(); // Some roots may not feed into ExitSU. Check all of them in case. - for (SmallVectorImpl<SUnit*>::const_iterator - I = BotRoots.begin(), E = BotRoots.end(); I != E; ++I) { - if ((*I)->getDepth() > Rem.CriticalPath) - Rem.CriticalPath = (*I)->getDepth(); + for (const SUnit *SU : BotRoots) { + if (SU->getDepth() > Rem.CriticalPath) + Rem.CriticalPath = SU->getDepth(); } DEBUG(dbgs() << "Critical Path: (PGS-RR) " << Rem.CriticalPath << '\n'); if (DumpCriticalPathLength) { @@ -3260,9 +3238,9 @@ void PostGenericScheduler::tryCandidate(SchedCandidate &Cand, void PostGenericScheduler::pickNodeFromQueue(SchedCandidate &Cand) { ReadyQueue &Q = Top.Available; - for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) { + for (SUnit *SU : Q) { SchedCandidate TryCand(Cand.Policy); - TryCand.SU = *I; + TryCand.SU = SU; TryCand.AtTop = true; TryCand.initResourceDelta(DAG, SchedModel); tryCandidate(Cand, TryCand); diff --git a/lib/CodeGen/MacroFusion.cpp b/lib/CodeGen/MacroFusion.cpp new file mode 100644 index 0000000000000..45ea0e4c39ab4 --- /dev/null +++ b/lib/CodeGen/MacroFusion.cpp @@ -0,0 +1,150 @@ +//===- MacroFusion.cpp - Macro Fusion ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file This file contains the implementation of the DAG scheduling mutation +/// to pair instructions back to back. +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/MacroFusion.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Target/TargetInstrInfo.h" + +#define DEBUG_TYPE "misched" + +STATISTIC(NumFused, "Number of instr pairs fused"); + +using namespace llvm; + +static cl::opt<bool> EnableMacroFusion("misched-fusion", cl::Hidden, + cl::desc("Enable scheduling for macro fusion."), cl::init(true)); + +namespace { + +static void fuseInstructionPair(ScheduleDAGMI &DAG, SUnit &FirstSU, + SUnit &SecondSU) { + // Create a single weak edge between the adjacent instrs. The only effect is + // to cause bottom-up scheduling to heavily prioritize the clustered instrs. + DAG.addEdge(&SecondSU, SDep(&FirstSU, SDep::Cluster)); + + // Adjust the latency between the anchor instr and its + // predecessors. 
+ for (SDep &IDep : SecondSU.Preds)
+ if (IDep.getSUnit() == &FirstSU)
+ IDep.setLatency(0);
+
+ // Adjust the latency between the dependent instr and its
+ // successors.
+ for (SDep &IDep : FirstSU.Succs)
+ if (IDep.getSUnit() == &SecondSU)
+ IDep.setLatency(0);
+
+ DEBUG(dbgs() << DAG.MF.getName() << "(): Macro fuse ";
+ FirstSU.print(dbgs(), &DAG); dbgs() << " - ";
+ SecondSU.print(dbgs(), &DAG); dbgs() << " / ";
+ dbgs() << DAG.TII->getName(FirstSU.getInstr()->getOpcode()) << " - " <<
+ DAG.TII->getName(SecondSU.getInstr()->getOpcode()) << '\n'; );
+
+ if (&SecondSU != &DAG.ExitSU)
+ // Make instructions dependent on FirstSU also dependent on SecondSU to
+ // prevent them from being scheduled between FirstSU and SecondSU.
+ for (const SDep &SI : FirstSU.Succs) {
+ if (SI.getSUnit() == &SecondSU)
+ continue;
+ DEBUG(dbgs() << " Copy Succ ";
+ SI.getSUnit()->print(dbgs(), &DAG); dbgs() << '\n';);
+ DAG.addEdge(SI.getSUnit(), SDep(&SecondSU, SDep::Artificial));
+ }
+
+ ++NumFused;
+}
+
+
+/// \brief Post-process the DAG to create cluster edges between instrs that may
+/// be fused by the processor into a single operation.
+class MacroFusion : public ScheduleDAGMutation {
+ ShouldSchedulePredTy shouldScheduleAdjacent;
+ bool FuseBlock;
+ bool scheduleAdjacentImpl(ScheduleDAGMI &DAG, SUnit &AnchorSU);
+
+public:
+ MacroFusion(ShouldSchedulePredTy shouldScheduleAdjacent, bool FuseBlock)
+ : shouldScheduleAdjacent(shouldScheduleAdjacent), FuseBlock(FuseBlock) {}
+
+ void apply(ScheduleDAGInstrs *DAGInstrs) override;
+};
+
+void MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) {
+ ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
+
+ if (FuseBlock)
+ // For each of the SUnits in the scheduling block, try to fuse the instr in
+ // it with one in its predecessors.
+ for (SUnit &ISU : DAG->SUnits)
+ scheduleAdjacentImpl(*DAG, ISU);
+
+ if (DAG->ExitSU.getInstr())
+ // Try to fuse the instr in the ExitSU with one in its predecessors.
+ scheduleAdjacentImpl(*DAG, DAG->ExitSU);
+}
+
+/// \brief Implement the fusion of instr pairs in the scheduling DAG,
+/// anchored at the instr in AnchorSU.
+bool MacroFusion::scheduleAdjacentImpl(ScheduleDAGMI &DAG, SUnit &AnchorSU) {
+ const MachineInstr &AnchorMI = *AnchorSU.getInstr();
+ const TargetInstrInfo &TII = *DAG.TII;
+ const TargetSubtargetInfo &ST = DAG.MF.getSubtarget();
+
+ // Check if the anchor instr may be fused.
+ if (!shouldScheduleAdjacent(TII, ST, nullptr, AnchorMI))
+ return false;
+
+ // Explore fusion candidates among the dependencies of the anchor instr.
+ for (SDep &Dep : AnchorSU.Preds) {
+ // Ignore dependencies that don't enforce ordering.
+ if (Dep.getKind() == SDep::Anti || Dep.getKind() == SDep::Output ||
+ Dep.isWeak())
+ continue;
+
+ SUnit &DepSU = *Dep.getSUnit();
+ if (DepSU.isBoundaryNode())
+ continue;
+
+ const MachineInstr *DepMI = DepSU.getInstr();
+ if (!shouldScheduleAdjacent(TII, ST, DepMI, AnchorMI))
+ continue;
+
+ fuseInstructionPair(DAG, DepSU, AnchorSU);
+ return true;
+ }
+
+ return false;
+}
+
+} // end anonymous namespace
+
+
+namespace llvm {
+
+std::unique_ptr<ScheduleDAGMutation>
+createMacroFusionDAGMutation(ShouldSchedulePredTy shouldScheduleAdjacent) {
+ if (EnableMacroFusion)
+ return llvm::make_unique<MacroFusion>(shouldScheduleAdjacent, true);
+ return nullptr;
+}
+
+std::unique_ptr<ScheduleDAGMutation>
+createBranchMacroFusionDAGMutation(ShouldSchedulePredTy shouldScheduleAdjacent) {
+ if (EnableMacroFusion)
+ return llvm::make_unique<MacroFusion>(shouldScheduleAdjacent, false);
+ return nullptr;
+}
+
+} // end namespace llvm
diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp
index 1aed58c36e17d..05e641d9489d9 100644
--- a/lib/CodeGen/RegisterScavenging.cpp
+++ b/lib/CodeGen/RegisterScavenging.cpp
@@ -35,6 +35,7 @@
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
#include <cassert>
#include <iterator>
#include <limits>
@@ -260,6 +261,14 @@ void RegScavenger::backward() {
const MachineInstr &MI = *MBBI;
LiveUnits.stepBackward(MI);
+ // Expire scavenge spill frameindex uses.
+ for (ScavengedInfo &I : Scavenged) {
+ if (I.Restore == &MI) {
+ I.Reg = 0;
+ I.Restore = nullptr;
+ }
+ }
+
if (MBBI == MBB->begin()) {
MBBI = MachineBasicBlock::iterator(nullptr);
Tracking = false;
@@ -356,6 +365,80 @@ unsigned RegScavenger::findSurvivorReg(MachineBasicBlock::iterator StartMI,
return Survivor;
}
+/// Given the bitvector \p Available of free register units at position
+/// \p From, search backwards to find a register that is part of \p
+/// Candidates and not used/clobbered until the point \p To. If there are
+/// multiple candidates, continue searching and pick the one that is not used/
+/// clobbered for the longest time.
+/// Returns the register and the earliest position we know it to be free, or
+/// the position MBB.end() if no register is available.
+static std::pair<MCPhysReg, MachineBasicBlock::iterator>
+findSurvivorBackwards(const MachineRegisterInfo &MRI,
+ MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
+ const LiveRegUnits &LiveOut, ArrayRef<MCPhysReg> AllocationOrder) {
+ bool FoundTo = false;
+ MCPhysReg Survivor = 0;
+ MachineBasicBlock::iterator Pos;
+ MachineBasicBlock &MBB = *From->getParent();
+ unsigned InstrLimit = 25;
+ unsigned InstrCountDown = InstrLimit;
+ const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+ LiveRegUnits Used(TRI);
+
+ for (MachineBasicBlock::iterator I = From;; --I) {
+ const MachineInstr &MI = *I;
+
+ Used.accumulateBackward(MI);
+
+ if (I == To) {
+ // See if one of the registers in RC wasn't used so far.
+ for (MCPhysReg Reg : AllocationOrder) {
+ if (!MRI.isReserved(Reg) && Used.available(Reg) &&
+ LiveOut.available(Reg))
+ return std::make_pair(Reg, MBB.end());
+ }
+ // Otherwise we will continue up to InstrLimit instructions to find
+ // the register which is not defined/used for the longest time.
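The two factories at the end of the new MacroFusion.cpp are the target-facing entry points: a backend supplies a pairing predicate and registers the returned mutation with its scheduler. A hedged sketch of that wiring; only the predicate signature and the factory come from this file, while the predicate body and the registration site are hypothetical and vary by target:

    #include "llvm/CodeGen/MacroFusion.h"
    #include "llvm/Target/TargetInstrInfo.h"

    // Hypothetical target predicate. FirstMI is nullptr when the anchor
    // instruction is pre-screened on its own (see scheduleAdjacentImpl above).
    static bool myTargetShouldScheduleAdjacent(const TargetInstrInfo &TII,
                                               const TargetSubtargetInfo &STI,
                                               const MachineInstr *FirstMI,
                                               const MachineInstr &SecondMI) {
      if (!FirstMI)
        return true; // accept the anchor tentatively; decide per concrete pair
      return false;  // target-specific pairing rules would go here
    }

    // Registration, typically while the target constructs its ScheduleDAGMI:
    //   DAG->addMutation(createMacroFusionDAGMutation(
    //       myTargetShouldScheduleAdjacent));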
+ FoundTo = true;
+ Pos = To;
+ }
+ if (FoundTo) {
+ if (Survivor == 0 || !Used.available(Survivor)) {
+ MCPhysReg AvailableReg = 0;
+ for (MCPhysReg Reg : AllocationOrder) {
+ if (!MRI.isReserved(Reg) && Used.available(Reg)) {
+ AvailableReg = Reg;
+ break;
+ }
+ }
+ if (AvailableReg == 0)
+ break;
+ Survivor = AvailableReg;
+ }
+ if (--InstrCountDown == 0)
+ break;
+
+ // Keep searching when we find a vreg since the spilled register will
+ // be useful for this other vreg as well later.
+ bool FoundVReg = false;
+ for (const MachineOperand &MO : MI.operands()) {
+ if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+ FoundVReg = true;
+ break;
+ }
+ }
+ if (FoundVReg) {
+ InstrCountDown = InstrLimit;
+ Pos = I;
+ }
+ if (I == MBB.begin())
+ break;
+ }
+ }
+
+ return std::make_pair(Survivor, Pos);
+}
+
static unsigned getFrameIndexOperandNum(MachineInstr &MI) {
unsigned i = 0;
while (!MI.getOperand(i).isFI()) {
@@ -365,44 +448,16 @@ static unsigned getFrameIndexOperandNum(MachineInstr &MI) {
return i;
}
-unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC,
- MachineBasicBlock::iterator I,
- int SPAdj) {
- MachineInstr &MI = *I;
- const MachineFunction &MF = *MI.getParent()->getParent();
- // Consider all allocatable registers in the register class initially
- BitVector Candidates = TRI->getAllocatableSet(MF, RC);
-
- // Exclude all the registers being used by the instruction.
- for (const MachineOperand &MO : MI.operands()) {
- if (MO.isReg() && MO.getReg() != 0 && !(MO.isUse() && MO.isUndef()) &&
- !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
- for (MCRegAliasIterator AI(MO.getReg(), TRI, true); AI.isValid(); ++AI)
- Candidates.reset(*AI);
- }
-
- // Try to find a register that's unused if there is one, as then we won't
- // have to spill.
- BitVector Available = getRegsAvailable(RC);
- Available &= Candidates;
- if (Available.any())
- Candidates = Available;
-
- // Find the register whose use is furthest away.
- MachineBasicBlock::iterator UseMI;
- unsigned SReg = findSurvivorReg(I, Candidates, 25, UseMI);
-
- // If we found an unused register there is no reason to spill it.
- if (!isRegUsed(SReg)) {
- DEBUG(dbgs() << "Scavenged register: " << TRI->getName(SReg) << "\n");
- return SReg;
- }
-
+RegScavenger::ScavengedInfo &
+RegScavenger::spill(unsigned Reg, const TargetRegisterClass &RC, int SPAdj,
+ MachineBasicBlock::iterator Before,
+ MachineBasicBlock::iterator &UseMI) {
// Find an available scavenging slot with size and alignment matching
// the requirements of the class RC.
+ const MachineFunction &MF = *Before->getParent()->getParent();
const MachineFrameInfo &MFI = MF.getFrameInfo();
- unsigned NeedSize = TRI->getSpillSize(*RC);
- unsigned NeedAlign = TRI->getSpillAlignment(*RC);
+ unsigned NeedSize = TRI->getSpillSize(RC);
+ unsigned NeedAlign = TRI->getSpillAlignment(RC);
unsigned SI = Scavenged.size(), Diff = std::numeric_limits<unsigned>::max();
int FIB = MFI.getObjectIndexBegin(), FIE = MFI.getObjectIndexEnd();
@@ -437,39 +492,72 @@ unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC,
}
// Avoid infinite regress
- Scavenged[SI].Reg = SReg;
+ Scavenged[SI].Reg = Reg;
// If the target knows how to save/restore the register, let it do so;
// otherwise, use the emergency stack spill slot.
- if (!TRI->saveScavengerRegister(*MBB, I, UseMI, RC, SReg)) {
- // Spill the scavenged register before I.
+ if (!TRI->saveScavengerRegister(*MBB, Before, UseMI, &RC, Reg)) {
+ // Spill the scavenged register before \p Before.
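The search in findSurvivorBackwards above is worth restating: register units touched by instructions are accumulated while walking backwards, and once the scan passes To it keeps going (up to 25 instructions) so that, among the candidates, the one that stays untouched longest wins. A toy model of just the register choice, ignoring the spill-point bookkeeping (plain unsigneds stand in for register units):

    #include <set>
    #include <vector>

    using Instr = std::vector<unsigned>; // registers read or written

    // RevInstrs is the instruction sequence already reversed; Candidates is
    // the allocation order. Returns 0 when every candidate gets clobbered.
    static unsigned pickSurvivor(const std::vector<Instr> &RevInstrs,
                                 const std::vector<unsigned> &Candidates) {
      std::set<unsigned> Used;
      unsigned Survivor = 0;
      for (const Instr &I : RevInstrs) {
        Used.insert(I.begin(), I.end());
        if (Survivor == 0 || Used.count(Survivor)) {
          Survivor = 0; // current choice clobbered; pick a fresh candidate
          for (unsigned C : Candidates)
            if (!Used.count(C)) { Survivor = C; break; }
          if (Survivor == 0)
            break; // all candidates used; the caller must spill here
        }
      }
      return Survivor;
    }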
int FI = Scavenged[SI].FrameIndex; if (FI < FIB || FI >= FIE) { std::string Msg = std::string("Error while trying to spill ") + - TRI->getName(SReg) + " from class " + TRI->getRegClassName(RC) + + TRI->getName(Reg) + " from class " + TRI->getRegClassName(&RC) + ": Cannot scavenge register without an emergency spill slot!"; report_fatal_error(Msg.c_str()); } - TII->storeRegToStackSlot(*MBB, I, SReg, true, Scavenged[SI].FrameIndex, - RC, TRI); - MachineBasicBlock::iterator II = std::prev(I); + TII->storeRegToStackSlot(*MBB, Before, Reg, true, Scavenged[SI].FrameIndex, + &RC, TRI); + MachineBasicBlock::iterator II = std::prev(Before); unsigned FIOperandNum = getFrameIndexOperandNum(*II); TRI->eliminateFrameIndex(II, SPAdj, FIOperandNum, this); // Restore the scavenged register before its use (or first terminator). - TII->loadRegFromStackSlot(*MBB, UseMI, SReg, Scavenged[SI].FrameIndex, - RC, TRI); + TII->loadRegFromStackSlot(*MBB, UseMI, Reg, Scavenged[SI].FrameIndex, + &RC, TRI); II = std::prev(UseMI); FIOperandNum = getFrameIndexOperandNum(*II); TRI->eliminateFrameIndex(II, SPAdj, FIOperandNum, this); } + return Scavenged[SI]; +} - Scavenged[SI].Restore = &*std::prev(UseMI); +unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC, + MachineBasicBlock::iterator I, + int SPAdj) { + MachineInstr &MI = *I; + const MachineFunction &MF = *MI.getParent()->getParent(); + // Consider all allocatable registers in the register class initially + BitVector Candidates = TRI->getAllocatableSet(MF, RC); - // Doing this here leads to infinite regress. - // Scavenged[SI].Reg = SReg; + // Exclude all the registers being used by the instruction. + for (const MachineOperand &MO : MI.operands()) { + if (MO.isReg() && MO.getReg() != 0 && !(MO.isUse() && MO.isUndef()) && + !TargetRegisterInfo::isVirtualRegister(MO.getReg())) + for (MCRegAliasIterator AI(MO.getReg(), TRI, true); AI.isValid(); ++AI) + Candidates.reset(*AI); + } + + // Try to find a register that's unused if there is one, as then we won't + // have to spill. + BitVector Available = getRegsAvailable(RC); + Available &= Candidates; + if (Available.any()) + Candidates = Available; + + // Find the register whose use is furthest away. + MachineBasicBlock::iterator UseMI; + unsigned SReg = findSurvivorReg(I, Candidates, 25, UseMI); + + // If we found an unused register there is no reason to spill it. + if (!isRegUsed(SReg)) { + DEBUG(dbgs() << "Scavenged register: " << TRI->getName(SReg) << "\n"); + return SReg; + } + + ScavengedInfo &Scavenged = spill(SReg, *RC, SPAdj, I, UseMI); + Scavenged.Restore = &*std::prev(UseMI); DEBUG(dbgs() << "Scavenged register (with spill): " << TRI->getName(SReg) << "\n"); @@ -477,85 +565,195 @@ unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC, return SReg; } -void llvm::scavengeFrameVirtualRegs(MachineFunction &MF, RegScavenger &RS) { - // FIXME: Iterating over the instruction stream is unnecessary. We can simply - // iterate over the vreg use list, which at this point only contains machine - // operands for which eliminateFrameIndex need a new scratch reg. +unsigned RegScavenger::scavengeRegisterBackwards(const TargetRegisterClass &RC, + MachineBasicBlock::iterator To, + bool RestoreAfter, int SPAdj) { + const MachineBasicBlock &MBB = *To->getParent(); + const MachineFunction &MF = *MBB.getParent(); - // Run through the instructions and find any virtual registers. 
- MachineRegisterInfo &MRI = MF.getRegInfo();
- for (MachineBasicBlock &MBB : MF) {
- RS.enterBasicBlock(MBB);
-
- int SPAdj = 0;
-
- // The instruction stream may change in the loop, so check MBB.end()
- // directly.
- for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ) {
- // We might end up here again with a NULL iterator if we scavenged a
- // register for which we inserted spill code for definition by what was
- // originally the first instruction in MBB.
- if (I == MachineBasicBlock::iterator(nullptr))
- I = MBB.begin();
-
- const MachineInstr &MI = *I;
- MachineBasicBlock::iterator J = std::next(I);
- MachineBasicBlock::iterator P =
- I == MBB.begin() ? MachineBasicBlock::iterator(nullptr)
- : std::prev(I);
-
- // RS should process this instruction before we might scavenge at this
- // location. This is because we might be replacing a virtual register
- // defined by this instruction, and if so, registers killed by this
- // instruction are available, and defined registers are not.
- RS.forward(I);
+ // Find the register whose use is furthest away.
+ MachineBasicBlock::iterator UseMI;
+ ArrayRef<MCPhysReg> AllocationOrder = RC.getRawAllocationOrder(MF);
+ std::pair<MCPhysReg, MachineBasicBlock::iterator> P =
+ findSurvivorBackwards(*MRI, MBBI, To, LiveUnits, AllocationOrder);
+ MCPhysReg Reg = P.first;
+ MachineBasicBlock::iterator SpillBefore = P.second;
+ assert(Reg != 0 && "No register left to scavenge!");
+ // Found an available register?
+ if (SpillBefore != MBB.end()) {
+ MachineBasicBlock::iterator ReloadAfter =
+ RestoreAfter ? std::next(MBBI) : MBBI;
+ MachineBasicBlock::iterator ReloadBefore = std::next(ReloadAfter);
+ DEBUG(dbgs() << "Reload before: " << *ReloadBefore << '\n');
+ ScavengedInfo &Scavenged = spill(Reg, RC, SPAdj, SpillBefore, ReloadBefore);
+ Scavenged.Restore = &*std::prev(SpillBefore);
+ LiveUnits.removeReg(Reg);
+ DEBUG(dbgs() << "Scavenged register with spill: " << PrintReg(Reg, TRI)
+ << " until " << *SpillBefore);
+ } else {
+ DEBUG(dbgs() << "Scavenged free register: " << PrintReg(Reg, TRI) << '\n');
+ }
+ return Reg;
+}
- for (const MachineOperand &MO : MI.operands()) {
+/// Allocate a register for the virtual register \p VReg. The last use of
+/// \p VReg is around the current position of the register scavenger \p RS.
+/// \p ReserveAfter controls whether the scavenged register needs to be reserved
+/// after the current instruction; otherwise it will only be reserved before the
+/// current instruction.
+static unsigned scavengeVReg(MachineRegisterInfo &MRI, RegScavenger &RS,
+ unsigned VReg, bool ReserveAfter) {
+ const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+#ifndef NDEBUG
+ // Verify that all definitions and uses are in the same basic block.
+ const MachineBasicBlock *CommonMBB = nullptr;
+ // Real definition for the reg; re-definitions are not considered.
+ const MachineInstr *RealDef = nullptr;
+ for (MachineOperand &MO : MRI.reg_nodbg_operands(VReg)) {
+ MachineBasicBlock *MBB = MO.getParent()->getParent();
+ if (CommonMBB == nullptr)
+ CommonMBB = MBB;
+ assert(MBB == CommonMBB && "All defs+uses must be in the same basic block");
+ if (MO.isDef()) {
+ const MachineInstr &MI = *MO.getParent();
+ if (!MI.readsRegister(VReg, &TRI)) {
+ assert((!RealDef || RealDef == &MI) &&
+ "Can have at most one definition which is not a redefinition");
+ RealDef = &MI;
+ }
+ }
+ }
+ assert(RealDef != nullptr && "Must have at least 1 Def");
+#endif
+
+ // We should only have one definition of the register.
However, to accommodate
+ // the requirements of two-address code, we also allow definitions in
+ // subsequent instructions provided they also read the register. That way
+ // we get a single contiguous lifetime.
+ //
+ // Definitions in MRI.def_begin() are unordered; search for the first.
+ MachineRegisterInfo::def_iterator FirstDef =
+ std::find_if(MRI.def_begin(VReg), MRI.def_end(),
+ [VReg, &TRI](const MachineOperand &MO) {
+ return !MO.getParent()->readsRegister(VReg, &TRI);
+ });
+ assert(FirstDef != MRI.def_end() &&
+ "Must have one definition that does not redefine vreg");
+ MachineInstr &DefMI = *FirstDef->getParent();
+
+ // The register scavenger will report a free register, inserting an emergency
+ // spill/reload if necessary.
+ int SPAdj = 0;
+ const TargetRegisterClass &RC = *MRI.getRegClass(VReg);
+ unsigned SReg = RS.scavengeRegisterBackwards(RC, DefMI.getIterator(),
+ ReserveAfter, SPAdj);
+ MRI.replaceRegWith(VReg, SReg);
+ ++NumScavengedRegs;
+ return SReg;
+}
+
+/// Allocate (scavenge) vregs inside a single basic block.
+/// Returns true if the target spill callback created new vregs and a 2nd pass
+/// is necessary.
+static bool scavengeFrameVirtualRegsInBlock(MachineRegisterInfo &MRI,
+ RegScavenger &RS,
+ MachineBasicBlock &MBB) {
+ const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+ RS.enterBasicBlockEnd(MBB);
+
+ unsigned InitialNumVirtRegs = MRI.getNumVirtRegs();
+ bool NextInstructionReadsVReg = false;
+ for (MachineBasicBlock::iterator I = MBB.end(); I != MBB.begin(); ) {
+ --I;
+ // Move RegScavenger to the position between *I and *std::next(I).
+ RS.backward(I);
+
+ // Look for unassigned vregs in the uses of *std::next(I).
+ if (NextInstructionReadsVReg) {
+ MachineBasicBlock::iterator N = std::next(I);
+ const MachineInstr &NMI = *N;
+ for (const MachineOperand &MO : NMI.operands()) {
if (!MO.isReg())
continue;
unsigned Reg = MO.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ // We only care about virtual registers and ignore virtual registers
+ // created by the target callbacks in the process (those will be handled
+ // in a scavenging round).
+ if (!TargetRegisterInfo::isVirtualRegister(Reg) ||
+ TargetRegisterInfo::virtReg2Index(Reg) >= InitialNumVirtRegs)
+ continue;
+ if (!MO.readsReg())
continue;
- // When we first encounter a new virtual register, it
- // must be a definition.
- assert(MO.isDef() && "frame index virtual missing def!");
- // Scavenge a new scratch register
- const TargetRegisterClass *RC = MRI.getRegClass(Reg);
- unsigned ScratchReg = RS.scavengeRegister(RC, J, SPAdj);
+ unsigned SReg = scavengeVReg(MRI, RS, Reg, true);
+ N->addRegisterKilled(SReg, &TRI, false);
+ RS.setRegUsed(SReg);
+ }
+ }
- ++NumScavengedRegs;
+ // Look for unassigned vregs in the defs of *I.
+ NextInstructionReadsVReg = false;
+ const MachineInstr &MI = *I;
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ // Only vregs, no newly created vregs (see above).
+ if (!TargetRegisterInfo::isVirtualRegister(Reg) ||
+ TargetRegisterInfo::virtReg2Index(Reg) >= InitialNumVirtRegs)
+ continue;
+ // We have to look at all operands anyway so we can precalculate here
+ // whether there is a reading operand. This allows us to skip the use
+ // step in the next iteration if there was none.
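The control flow of scavengeFrameVirtualRegsInBlock above is easy to lose in the diff: the walk is backwards, defs of the current instruction are assigned with ReserveAfter=false, and uses are only handled one step later, when the scavenger already sits in front of the instruction that reads them. The skeleton, with the operand loops collapsed into hypothetical helpers:

    // Illustrative skeleton only; assignUsesOf/assignDefsOf/readsAnyVReg are
    // hypothetical stand-ins for the operand loops in the hunk above.
    static void assignUsesOf(MachineInstr &MI);      // reserve across the instr
    static void assignDefsOf(MachineInstr &MI);      // reserve only before it
    static bool readsAnyVReg(const MachineInstr &MI);

    static void scavengeBlockBackwards(MachineBasicBlock &MBB, RegScavenger &RS) {
      bool NextInstructionReadsVReg = false;
      for (MachineBasicBlock::iterator I = MBB.end(); I != MBB.begin();) {
        --I;
        RS.backward(I); // scavenger state is now between *I and *std::next(I)
        if (NextInstructionReadsVReg)
          assignUsesOf(*std::next(I));
        NextInstructionReadsVReg = readsAnyVReg(*I);
        assignDefsOf(*I);
      }
    }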
+ assert(!MO.isInternalRead() && "Cannot assign inside bundles");
+ assert((!MO.isUndef() || MO.isDef()) && "Cannot handle undef uses");
+ if (MO.readsReg()) {
+ NextInstructionReadsVReg = true;
+ }
+ if (MO.isDef()) {
+ unsigned SReg = scavengeVReg(MRI, RS, Reg, false);
+ I->addRegisterDead(SReg, &TRI, false);
+ }
+ }
+ }
+#ifndef NDEBUG
+ for (const MachineOperand &MO : MBB.front().operands()) {
+ if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ continue;
+ assert(!MO.isInternalRead() && "Cannot assign inside bundles");
+ assert((!MO.isUndef() || MO.isDef()) && "Cannot handle undef uses");
+ assert(!MO.readsReg() && "Vreg use in first instruction not allowed");
+ }
+#endif
-
- // Replace this reference to the virtual register with the
- // scratch register.
- assert(ScratchReg && "Missing scratch register!");
- MRI.replaceRegWith(Reg, ScratchReg);
+ return MRI.getNumVirtRegs() != InitialNumVirtRegs;
+}
- // Because this instruction was processed by the RS before this
- // register was allocated, make sure that the RS now records the
- // register as being used.
- RS.setRegUsed(ScratchReg);
- }
+void llvm::scavengeFrameVirtualRegs(MachineFunction &MF, RegScavenger &RS) {
+ // FIXME: Iterating over the instruction stream is unnecessary. We can simply
+ // iterate over the vreg use list, which at this point only contains machine
+ // operands for which eliminateFrameIndex needs a new scratch reg.
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ // Shortcut.
+ if (MRI.getNumVirtRegs() == 0) {
+ MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs);
+ return;
+ }
+
+ // Run through the instructions and find any virtual registers.
+ for (MachineBasicBlock &MBB : MF) {
+ if (MBB.empty())
+ continue;
- // If the scavenger needed to use one of its spill slots, the
- // spill code will have been inserted in between I and J. This is a
- // problem because we need the spill code before I: Move I to just
- // prior to J.
- if (I != std::prev(J)) {
- MBB.splice(J, &MBB, I);
-
- // Before we move I, we need to prepare the RS to visit I again.
- // Specifically, RS will assert if it sees uses of registers that
- // it believes are undefined. Because we have already processed
- // register kills in I, when it visits I again, it will believe that
- // those registers are undefined. To avoid this situation, unprocess
- // the instruction I.
- assert(RS.getCurrentPosition() == I &&
- "The register scavenger has an unexpected position");
- I = P;
- RS.unprocess(P);
- } else
- ++I;
+ bool Again = scavengeFrameVirtualRegsInBlock(MRI, RS, MBB);
+ if (Again) {
+ DEBUG(dbgs() << "Warning: Required two scavenging passes for block "
+ << MBB.getName() << '\n');
+ Again = scavengeFrameVirtualRegsInBlock(MRI, RS, MBB);
+ // The target required a 2nd run (because it created new vregs while
+ // spilling). Refuse to do another pass to keep compile time in check.
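The retry policy at the tail of this function (concluded just below) exists because a target's spill callback may itself create new virtual registers; one repeat round is allowed, and a third requirement is treated as a bug rather than looped on. In isolation, with hypothetical helpers:

    // scavengeOnce() returns true when it created new work; fatal() aborts.
    // Both are hypothetical stand-ins for the calls in the hunk above.
    bool Again = scavengeOnce();
    if (Again) {
      Again = scavengeOnce();
      if (Again)
        fatal("scavenging did not converge after two rounds");
    }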
+ if (Again) + report_fatal_error("Incomplete scavenging after 2nd pass"); } } diff --git a/lib/CodeGen/RegisterUsageInfo.cpp b/lib/CodeGen/RegisterUsageInfo.cpp index d7a3ac0808230..30757f070cadb 100644 --- a/lib/CodeGen/RegisterUsageInfo.cpp +++ b/lib/CodeGen/RegisterUsageInfo.cpp @@ -1,4 +1,4 @@ -//===- RegisterUsageInfo.cpp - Register Usage Informartion Storage --------===// +//===- RegisterUsageInfo.cpp - Register Usage Information Storage ---------===// // // The LLVM Compiler Infrastructure // @@ -38,7 +38,7 @@ static cl::opt<bool> DumpRegUsage( cl::desc("print register usage details collected for analysis.")); INITIALIZE_PASS(PhysicalRegisterUsageInfo, "reg-usage-info", - "Register Usage Informartion Stroage", false, true) + "Register Usage Information Storage", false, true) char PhysicalRegisterUsageInfo::ID = 0; diff --git a/lib/CodeGen/SelectionDAG/CMakeLists.txt b/lib/CodeGen/SelectionDAG/CMakeLists.txt index a668ddb7389f1..ae9c5adb03979 100644 --- a/lib/CodeGen/SelectionDAG/CMakeLists.txt +++ b/lib/CodeGen/SelectionDAG/CMakeLists.txt @@ -17,6 +17,7 @@ add_llvm_library(LLVMSelectionDAG ScheduleDAGVLIW.cpp SelectionDAGBuilder.cpp SelectionDAG.cpp + SelectionDAGAddressAnalysis.cpp SelectionDAGDumper.cpp SelectionDAGISel.cpp SelectionDAGPrinter.cpp diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 2d4422d94a172..d02dcb6f4439b 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -25,6 +25,7 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h" #include "llvm/CodeGen/SelectionDAGTargetInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" @@ -469,7 +470,8 @@ namespace { /// \return True if a merged store was created. bool MergeStoresOfConstantsOrVecElts(SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT, unsigned NumStores, - bool IsConstantSrc, bool UseVector); + bool IsConstantSrc, bool UseVector, + bool UseTrunc); /// This is a helper function for MergeConsecutiveStores. /// Stores that may be merged are placed in StoreNodes. @@ -2549,14 +2551,14 @@ SDValue DAGCombiner::visitMUL(SDNode *N) { !DAG.isConstantIntBuildVectorOrConstantInt(N1)) return DAG.getNode(ISD::MUL, SDLoc(N), VT, N1, N0); // fold (mul x, 0) -> 0 - if (N1IsConst && ConstValue1 == 0) + if (N1IsConst && ConstValue1.isNullValue()) return N1; // We require a splat of the entire scalar bit width for non-contiguous // bit patterns. bool IsFullSplat = ConstValue1.getBitWidth() == VT.getScalarSizeInBits(); // fold (mul x, 1) -> x - if (N1IsConst && ConstValue1 == 1 && IsFullSplat) + if (N1IsConst && ConstValue1.isOneValue() && IsFullSplat) return N0; if (SDValue NewSel = foldBinOpIntoSelect(N)) @@ -3685,7 +3687,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) { // fold (and (or x, C), D) -> D if (C & D) == D if (N1C && N0.getOpcode() == ISD::OR) if (ConstantSDNode *ORI = isConstOrConstSplat(N0.getOperand(1))) - if ((ORI->getAPIntValue() & N1C->getAPIntValue()) == N1C->getAPIntValue()) + if (N1C->getAPIntValue().isSubsetOf(ORI->getAPIntValue())) return N1; // fold (and (any_ext V), c) -> (zero_ext V) if 'and' only clears top bits. if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) { @@ -4694,110 +4696,6 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) { } namespace { -/// Helper struct to parse and store a memory address as base + index + offset. 
-/// We ignore sign extensions when it is safe to do so. -/// The following two expressions are not equivalent. To differentiate we need -/// to store whether there was a sign extension involved in the index -/// computation. -/// (load (i64 add (i64 copyfromreg %c) -/// (i64 signextend (add (i8 load %index) -/// (i8 1)))) -/// vs -/// -/// (load (i64 add (i64 copyfromreg %c) -/// (i64 signextend (i32 add (i32 signextend (i8 load %index)) -/// (i32 1))))) -struct BaseIndexOffset { - SDValue Base; - SDValue Index; - int64_t Offset; - bool IsIndexSignExt; - - BaseIndexOffset() : Offset(0), IsIndexSignExt(false) {} - - BaseIndexOffset(SDValue Base, SDValue Index, int64_t Offset, - bool IsIndexSignExt) : - Base(Base), Index(Index), Offset(Offset), IsIndexSignExt(IsIndexSignExt) {} - - bool equalBaseIndex(const BaseIndexOffset &Other) { - return Other.Base == Base && Other.Index == Index && - Other.IsIndexSignExt == IsIndexSignExt; - } - - /// Parses tree in Ptr for base, index, offset addresses. - static BaseIndexOffset match(SDValue Ptr, SelectionDAG &DAG, - int64_t PartialOffset = 0) { - bool IsIndexSignExt = false; - - // Split up a folded GlobalAddress+Offset into its component parts. - if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Ptr)) - if (GA->getOpcode() == ISD::GlobalAddress && GA->getOffset() != 0) { - return BaseIndexOffset(DAG.getGlobalAddress(GA->getGlobal(), - SDLoc(GA), - GA->getValueType(0), - /*Offset=*/PartialOffset, - /*isTargetGA=*/false, - GA->getTargetFlags()), - SDValue(), - GA->getOffset(), - IsIndexSignExt); - } - - // We only can pattern match BASE + INDEX + OFFSET. If Ptr is not an ADD - // instruction, then it could be just the BASE or everything else we don't - // know how to handle. Just use Ptr as BASE and give up. - if (Ptr->getOpcode() != ISD::ADD) - return BaseIndexOffset(Ptr, SDValue(), PartialOffset, IsIndexSignExt); - - // We know that we have at least an ADD instruction. Try to pattern match - // the simple case of BASE + OFFSET. - if (isa<ConstantSDNode>(Ptr->getOperand(1))) { - int64_t Offset = cast<ConstantSDNode>(Ptr->getOperand(1))->getSExtValue(); - return match(Ptr->getOperand(0), DAG, Offset + PartialOffset); - } - - // Inside a loop the current BASE pointer is calculated using an ADD and a - // MUL instruction. In this case Ptr is the actual BASE pointer. - // (i64 add (i64 %array_ptr) - // (i64 mul (i64 %induction_var) - // (i64 %element_size))) - if (Ptr->getOperand(1)->getOpcode() == ISD::MUL) - return BaseIndexOffset(Ptr, SDValue(), PartialOffset, IsIndexSignExt); - - // Look at Base + Index + Offset cases. - SDValue Base = Ptr->getOperand(0); - SDValue IndexOffset = Ptr->getOperand(1); - - // Skip signextends. - if (IndexOffset->getOpcode() == ISD::SIGN_EXTEND) { - IndexOffset = IndexOffset->getOperand(0); - IsIndexSignExt = true; - } - - // Either the case of Base + Index (no offset) or something else. - if (IndexOffset->getOpcode() != ISD::ADD) - return BaseIndexOffset(Base, IndexOffset, PartialOffset, IsIndexSignExt); - - // Now we have the case of Base + Index + offset. - SDValue Index = IndexOffset->getOperand(0); - SDValue Offset = IndexOffset->getOperand(1); - - if (!isa<ConstantSDNode>(Offset)) - return BaseIndexOffset(Ptr, SDValue(), PartialOffset, IsIndexSignExt); - - // Ignore signextends. 
- if (Index->getOpcode() == ISD::SIGN_EXTEND) { - Index = Index->getOperand(0); - IsIndexSignExt = true; - } else IsIndexSignExt = false; - - int64_t Off = cast<ConstantSDNode>(Offset)->getSExtValue(); - return BaseIndexOffset(Base, Index, Off + PartialOffset, IsIndexSignExt); - } -}; -} // namespace - -namespace { /// Represents known origin of an individual byte in load combine pattern. The /// value of the byte is either constant zero or comes from memory. struct ByteProvider { @@ -5017,14 +4915,15 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) { return SDValue(); // Loads must share the same base address - BaseIndexOffset Ptr = BaseIndexOffset::match(L->getBasePtr(), DAG); + BaseIndexOffset Ptr = BaseIndexOffset::match(L->getBasePtr()); + int64_t ByteOffsetFromBase = 0; if (!Base) Base = Ptr; - else if (!Base->equalBaseIndex(Ptr)) + else if (!Base->equalBaseIndex(Ptr, DAG, ByteOffsetFromBase)) return SDValue(); // Calculate the offset of the current byte from the base address - int64_t ByteOffsetFromBase = Ptr.Offset + MemoryByteOffset(*P); + ByteOffsetFromBase += MemoryByteOffset(*P); ByteOffsets[i] = ByteOffsetFromBase; // Remember the first byte load @@ -12378,8 +12277,8 @@ SDValue DAGCombiner::getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes, } bool DAGCombiner::MergeStoresOfConstantsOrVecElts( - SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT, - unsigned NumStores, bool IsConstantSrc, bool UseVector) { + SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT, unsigned NumStores, + bool IsConstantSrc, bool UseVector, bool UseTrunc) { // Make sure we have something to merge. if (NumStores < 2) return false; @@ -12464,7 +12363,7 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts( // make sure we use trunc store if it's necessary to be legal. SDValue NewStore; - if (TLI.isTypeLegal(StoredVal.getValueType())) { + if (UseVector || !UseTrunc) { NewStore = DAG.getStore(NewChain, DL, StoredVal, FirstInChain->getBasePtr(), FirstInChain->getPointerInfo(), FirstInChain->getAlignment()); @@ -12495,15 +12394,15 @@ void DAGCombiner::getStoreMergeCandidates( StoreSDNode *St, SmallVectorImpl<MemOpLink> &StoreNodes) { // This holds the base pointer, index, and the offset in bytes from the base // pointer. - BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr(), DAG); + BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr()); EVT MemVT = St->getMemoryVT(); // We must have a base and an offset. - if (!BasePtr.Base.getNode()) + if (!BasePtr.getBase().getNode()) return; // Do not handle stores to undef base pointers. - if (BasePtr.Base.isUndef()) + if (BasePtr.getBase().isUndef()) return; bool IsConstantSrc = isa<ConstantSDNode>(St->getValue()) || @@ -12515,10 +12414,11 @@ void DAGCombiner::getStoreMergeCandidates( BaseIndexOffset LBasePtr; // Match on loadbaseptr if relevant. 
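These DAGCombiner hunks all track the same refactor: the file-local BaseIndexOffset helper (deleted above) moved behind the new SelectionDAGAddressAnalysis.cpp added to the CMake list earlier in this diff, and equalBaseIndex gained a DAG parameter plus an optional out-parameter for the byte offset between two addresses. The resulting usage pattern, assembled from the hunks (St1/St2 are illustrative StoreSDNode pointers, not code from this commit):

    // Signatures as used in this diff; surrounding control flow is
    // illustrative only.
    BaseIndexOffset Ptr1 = BaseIndexOffset::match(St1->getBasePtr());
    BaseIndexOffset Ptr2 = BaseIndexOffset::match(St2->getBasePtr());
    if (!Ptr1.getBase().getNode() || Ptr1.getBase().isUndef())
      return; // no usable base pointer
    int64_t Offset = 0;
    if (Ptr1.equalBaseIndex(Ptr2, DAG, Offset)) {
      // Same base and index; Offset now holds the byte distance between the
      // two addresses, resolved by the shared address analysis.
    }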
if (IsLoadSrc)
- LBasePtr = BaseIndexOffset::match(
- cast<LoadSDNode>(St->getValue())->getBasePtr(), DAG);
+ LBasePtr =
+ BaseIndexOffset::match(cast<LoadSDNode>(St->getValue())->getBasePtr());
- auto CandidateMatch = [&](StoreSDNode *Other, BaseIndexOffset &Ptr) -> bool {
+ auto CandidateMatch = [&](StoreSDNode *Other, BaseIndexOffset &Ptr,
+ int64_t &Offset) -> bool {
if (Other->isVolatile() || Other->isIndexed())
return false;
// We can merge constant floats to equivalent integers
@@ -12529,8 +12429,8 @@ void DAGCombiner::getStoreMergeCandidates(
if (IsLoadSrc) {
// The Load's Base Ptr must also match
if (LoadSDNode *OtherLd = dyn_cast<LoadSDNode>(Other->getValue())) {
- auto LPtr = BaseIndexOffset::match(OtherLd->getBasePtr(), DAG);
- if (!(LBasePtr.equalBaseIndex(LPtr)))
+ auto LPtr = BaseIndexOffset::match(OtherLd->getBasePtr());
+ if (!(LBasePtr.equalBaseIndex(LPtr, DAG)))
return false;
} else
return false;
@@ -12543,8 +12443,8 @@ void DAGCombiner::getStoreMergeCandidates(
if (!(Other->getValue().getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
Other->getValue().getOpcode() == ISD::EXTRACT_SUBVECTOR))
return false;
- Ptr = BaseIndexOffset::match(Other->getBasePtr(), DAG);
- return (Ptr.equalBaseIndex(BasePtr));
+ Ptr = BaseIndexOffset::match(Other->getBasePtr());
+ return (BasePtr.equalBaseIndex(Ptr, DAG, Offset));
};
// We are looking for a root node which is an ancestor to all mergeable
// stores. We search up through a load, to our root and then down
@@ -12572,16 +12472,18 @@ void DAGCombiner::getStoreMergeCandidates(
if (I2.getOperandNo() == 0)
if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(*I2)) {
BaseIndexOffset Ptr;
- if (CandidateMatch(OtherST, Ptr))
- StoreNodes.push_back(MemOpLink(OtherST, Ptr.Offset));
+ int64_t PtrDiff;
+ if (CandidateMatch(OtherST, Ptr, PtrDiff))
+ StoreNodes.push_back(MemOpLink(OtherST, PtrDiff));
}
} else
for (auto I = RootNode->use_begin(), E = RootNode->use_end(); I != E; ++I)
if (I.getOperandNo() == 0)
if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(*I)) {
BaseIndexOffset Ptr;
- if (CandidateMatch(OtherST, Ptr))
- StoreNodes.push_back(MemOpLink(OtherST, Ptr.Offset));
+ int64_t PtrDiff;
+ if (CandidateMatch(OtherST, Ptr, PtrDiff))
+ StoreNodes.push_back(MemOpLink(OtherST, PtrDiff));
}
}
@@ -12721,8 +12623,9 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
unsigned FirstStoreAS = FirstInChain->getAddressSpace();
unsigned FirstStoreAlign = FirstInChain->getAlignment();
- unsigned LastLegalType = 0;
- unsigned LastLegalVectorType = 0;
+ unsigned LastLegalType = 1;
+ unsigned LastLegalVectorType = 1;
+ bool LastIntegerTrunc = false;
bool NonZero = false;
for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
StoreSDNode *ST = cast<StoreSDNode>(StoreNodes[i].MemNode);
@@ -12747,6 +12650,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
TLI.allowsMemoryAccess(Context, DL, StoreTy,
FirstStoreAS, FirstStoreAlign, &IsFast) &&
IsFast) {
+ LastIntegerTrunc = false;
LastLegalType = i + 1;
// Or check whether a truncstore is legal.
} else if (TLI.getTypeAction(Context, StoreTy) == @@ -12758,6 +12662,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy, FirstStoreAS, FirstStoreAlign, &IsFast) && IsFast) { + LastIntegerTrunc = true; LastLegalType = i + 1; } } @@ -12787,8 +12692,8 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors; unsigned NumElem = (UseVector) ? LastLegalVectorType : LastLegalType; - bool Merged = MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem, - true, UseVector); + bool Merged = MergeStoresOfConstantsOrVecElts( + StoreNodes, MemVT, NumElem, true, UseVector, LastIntegerTrunc); if (!Merged) { StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem); continue; @@ -12836,7 +12741,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { } bool Merged = MergeStoresOfConstantsOrVecElts( - StoreNodes, MemVT, NumStoresToMerge, false, true); + StoreNodes, MemVT, NumStoresToMerge, false, true, false); if (!Merged) { StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumStoresToMerge); @@ -12881,11 +12786,12 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { if (Ld->getMemoryVT() != MemVT) break; - BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld->getBasePtr(), DAG); + BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld->getBasePtr()); // If this is not the first ptr that we check. - if (LdBasePtr.Base.getNode()) { + int64_t LdOffset = 0; + if (LdBasePtr.getBase().getNode()) { // The base ptr must be the same. - if (!LdPtr.equalBaseIndex(LdBasePtr)) + if (!LdBasePtr.equalBaseIndex(LdPtr, DAG, LdOffset)) break; } else { // Check that all other base pointers are the same as this one. @@ -12893,7 +12799,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { } // We found a potential memory operand to merge. - LoadNodes.push_back(MemOpLink(Ld, LdPtr.Offset)); + LoadNodes.push_back(MemOpLink(Ld, LdOffset)); } if (LoadNodes.size() < 2) { @@ -12919,10 +12825,11 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { // Scan the memory operations on the chain and find the first // non-consecutive load memory address. These variables hold the index in // the store node array. - unsigned LastConsecutiveLoad = 0; + unsigned LastConsecutiveLoad = 1; // This variable refers to the size and not index in the array. - unsigned LastLegalVectorType = 0; - unsigned LastLegalIntegerType = 0; + unsigned LastLegalVectorType = 1; + unsigned LastLegalIntegerType = 1; + bool DoIntegerTruncate = false; StartAddress = LoadNodes[0].OffsetFromBase; SDValue FirstChain = FirstLoad->getChain(); for (unsigned i = 1; i < LoadNodes.size(); ++i) { @@ -12958,11 +12865,12 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { IsFastSt && TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS, FirstLoadAlign, &IsFastLd) && - IsFastLd) + IsFastLd) { LastLegalIntegerType = i + 1; - // Or check whether a truncstore and extload is legal. - else if (TLI.getTypeAction(Context, StoreTy) == - TargetLowering::TypePromoteInteger) { + DoIntegerTruncate = false; + // Or check whether a truncstore and extload is legal. 
+ } else if (TLI.getTypeAction(Context, StoreTy) == + TargetLowering::TypePromoteInteger) { EVT LegalizedStoredValueTy = TLI.getTypeToTransformTo(Context, StoreTy); if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) && TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValueTy) && @@ -12976,8 +12884,10 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { IsFastSt && TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS, FirstLoadAlign, &IsFastLd) && - IsFastLd) + IsFastLd) { LastLegalIntegerType = i + 1; + DoIntegerTruncate = true; + } } } @@ -13012,17 +12922,31 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { // The merged loads are required to have the same incoming chain, so // using the first's chain is acceptable. - SDValue NewLoad = DAG.getLoad(JointMemOpVT, LoadDL, FirstLoad->getChain(), - FirstLoad->getBasePtr(), - FirstLoad->getPointerInfo(), FirstLoadAlign); SDValue NewStoreChain = getMergeStoreChains(StoreNodes, NumElem); - AddToWorklist(NewStoreChain.getNode()); - SDValue NewStore = DAG.getStore( - NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(), - FirstInChain->getPointerInfo(), FirstStoreAlign); + SDValue NewLoad, NewStore; + if (UseVectorTy || !DoIntegerTruncate) { + NewLoad = DAG.getLoad(JointMemOpVT, LoadDL, FirstLoad->getChain(), + FirstLoad->getBasePtr(), + FirstLoad->getPointerInfo(), FirstLoadAlign); + NewStore = DAG.getStore(NewStoreChain, StoreDL, NewLoad, + FirstInChain->getBasePtr(), + FirstInChain->getPointerInfo(), FirstStoreAlign); + } else { // This must be the truncstore/extload case + EVT ExtendedTy = + TLI.getTypeToTransformTo(*DAG.getContext(), JointMemOpVT); + NewLoad = + DAG.getExtLoad(ISD::EXTLOAD, LoadDL, ExtendedTy, FirstLoad->getChain(), + FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(), + JointMemOpVT, FirstLoadAlign); + NewStore = DAG.getTruncStore(NewStoreChain, StoreDL, NewLoad, + FirstInChain->getBasePtr(), + FirstInChain->getPointerInfo(), JointMemOpVT, + FirstInChain->getAlignment(), + FirstInChain->getMemOperand()->getFlags()); + } // Transfer chain users from old loads to the new load. for (unsigned i = 0; i < NumElem; ++i) { @@ -13285,7 +13209,8 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { // Only perform this optimization before the types are legal, because we // don't want to perform this optimization on every DAGCombine invocation. - if (!LegalTypes) { + if ((TLI.mergeStoresAfterLegalization()) ? Level == AfterLegalizeDAG + : !LegalTypes) { for (;;) { // There can be multiple store sequences on the same chain. // Keep trying to merge store sequences until we are unable to do so @@ -14035,6 +13960,11 @@ SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N, // when we start sorting the vectors by type. return SDValue(); } + } else if (InVT2.getSizeInBits() * 2 == VT.getSizeInBits() && + InVT1.getSizeInBits() == VT.getSizeInBits()) { + SmallVector<SDValue, 2> ConcatOps(2, DAG.getUNDEF(InVT2)); + ConcatOps[0] = VecIn2; + VecIn2 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps); } else { // TODO: Support cases where the length mismatch isn't exactly by a // factor of 2. @@ -16610,11 +16540,11 @@ bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const { unsigned NumBytes1 = Op1->getMemoryVT().getSizeInBits() >> 3; // Check for BaseIndexOffset matching. 
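// (Once both pointers decompose over the same (Base, Index), Op1's bytes
// occupy [PtrDiff, PtrDiff + NumBytes1) relative to Op0's [0, NumBytes0);
// the accesses are disjoint exactly when one interval ends before the
// other begins, which is what the rewritten check tests.)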
- BaseIndexOffset BasePtr0 = BaseIndexOffset::match(Op0->getBasePtr(), DAG); - BaseIndexOffset BasePtr1 = BaseIndexOffset::match(Op1->getBasePtr(), DAG); - if (BasePtr0.equalBaseIndex(BasePtr1)) - return !((BasePtr0.Offset + NumBytes0 <= BasePtr1.Offset) || - (BasePtr1.Offset + NumBytes1 <= BasePtr0.Offset)); + BaseIndexOffset BasePtr0 = BaseIndexOffset::match(Op0->getBasePtr()); + BaseIndexOffset BasePtr1 = BaseIndexOffset::match(Op1->getBasePtr()); + int64_t PtrDiff; + if (BasePtr0.equalBaseIndex(BasePtr1, DAG, PtrDiff)) + return !((NumBytes0 <= PtrDiff) || (PtrDiff + NumBytes1 <= 0)); // FIXME: findBaseOffset and ConstantValue/GlobalValue/FrameIndex analysis // modified to use BaseIndexOffset. @@ -16821,14 +16751,14 @@ SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) { bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) { // This holds the base pointer, index, and the offset in bytes from the base // pointer. - BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr(), DAG); + BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr()); // We must have a base and an offset. - if (!BasePtr.Base.getNode()) + if (!BasePtr.getBase().getNode()) return false; // Do not handle stores to undef base pointers. - if (BasePtr.Base.isUndef()) + if (BasePtr.getBase().isUndef()) return false; SmallVector<StoreSDNode *, 8> ChainedStores; @@ -16847,10 +16777,10 @@ bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) { break; // Find the base pointer and offset for this memory node. - BaseIndexOffset Ptr = BaseIndexOffset::match(Index->getBasePtr(), DAG); + BaseIndexOffset Ptr = BaseIndexOffset::match(Index->getBasePtr()); // Check that the base pointer is the same as the original one. - if (!Ptr.equalBaseIndex(BasePtr)) + if (!BasePtr.equalBaseIndex(Ptr, DAG)) break; // Walk up the chain to find the next store node, ignoring any diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 15e87b7af18dc..873b2bd48f1e0 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -3530,17 +3530,24 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { LC = RTLIB::MUL_I128; assert(LC != RTLIB::UNKNOWN_LIBCALL && "Cannot expand this operation!"); - // The high part is obtained by SRA'ing all but one of the bits of low - // part. - unsigned LoSize = VT.getSizeInBits(); - SDValue HiLHS = - DAG.getNode(ISD::SRA, dl, VT, LHS, - DAG.getConstant(LoSize - 1, dl, - TLI.getPointerTy(DAG.getDataLayout()))); - SDValue HiRHS = - DAG.getNode(ISD::SRA, dl, VT, RHS, - DAG.getConstant(LoSize - 1, dl, - TLI.getPointerTy(DAG.getDataLayout()))); + SDValue HiLHS; + SDValue HiRHS; + if (isSigned) { + // The high part is obtained by SRA'ing all but one of the bits of low + // part. + unsigned LoSize = VT.getSizeInBits(); + HiLHS = + DAG.getNode(ISD::SRA, dl, VT, LHS, + DAG.getConstant(LoSize - 1, dl, + TLI.getPointerTy(DAG.getDataLayout()))); + HiRHS = + DAG.getNode(ISD::SRA, dl, VT, RHS, + DAG.getConstant(LoSize - 1, dl, + TLI.getPointerTy(DAG.getDataLayout()))); + } else { + HiLHS = DAG.getConstant(0, dl, VT); + HiRHS = DAG.getConstant(0, dl, VT); + } // Here we're passing the 2 arguments explicitly as 4 arguments that are // pre-lowered to the correct types. 
This all depends upon WideVT not diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index a3ba52a148ee4..75fec7bd1d485 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -615,9 +615,8 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SETCC(SDNode *N) { SDValue SetCC = DAG.getNode(N->getOpcode(), dl, SVT, LHS, RHS, N->getOperand(2)); - assert(NVT.bitsLE(SVT) && "Integer type overpromoted?"); // Convert to the expected type. - return DAG.getNode(ISD::TRUNCATE, dl, NVT, SetCC); + return DAG.getSExtOrTrunc(SetCC, dl, NVT); } SDValue DAGTypeLegalizer::PromoteIntRes_SHL(SDNode *N) { diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index 593efc5121f90..70b1fa77a0991 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -1861,28 +1861,68 @@ static int checkSpecialNodes(const SUnit *left, const SUnit *right) { /// Smaller number is the higher priority. static unsigned CalcNodeSethiUllmanNumber(const SUnit *SU, std::vector<unsigned> &SUNumbers) { - unsigned &SethiUllmanNumber = SUNumbers[SU->NodeNum]; - if (SethiUllmanNumber != 0) - return SethiUllmanNumber; - - unsigned Extra = 0; - for (const SDep &Pred : SU->Preds) { - if (Pred.isCtrl()) continue; // ignore chain preds - SUnit *PredSU = Pred.getSUnit(); - unsigned PredSethiUllman = CalcNodeSethiUllmanNumber(PredSU, SUNumbers); - if (PredSethiUllman > SethiUllmanNumber) { - SethiUllmanNumber = PredSethiUllman; - Extra = 0; - } else if (PredSethiUllman == SethiUllmanNumber) - ++Extra; - } + if (SUNumbers[SU->NodeNum] != 0) + return SUNumbers[SU->NodeNum]; + + // Use WorkList to avoid stack overflow on excessively large IRs. + struct WorkState { + WorkState(const SUnit *SU) : SU(SU) {} + const SUnit *SU; + unsigned PredsProcessed = 0; + }; - SethiUllmanNumber += Extra; + SmallVector<WorkState, 16> WorkList; + WorkList.push_back(SU); + while (!WorkList.empty()) { + auto &Temp = WorkList.back(); + auto *TempSU = Temp.SU; + bool AllPredsKnown = true; + // Try to find a non-evaluated pred and push it into the processing stack. + for (unsigned P = Temp.PredsProcessed; P < TempSU->Preds.size(); ++P) { + auto &Pred = TempSU->Preds[P]; + if (Pred.isCtrl()) continue; // ignore chain preds + SUnit *PredSU = Pred.getSUnit(); + if (SUNumbers[PredSU->NodeNum] == 0) { +#ifndef NDEBUG + // In debug mode, check that we don't have such element in the stack. + for (auto It : WorkList) + assert(It.SU != PredSU && "Trying to push an element twice?"); +#endif + // Next time start processing this one starting from the next pred. + Temp.PredsProcessed = P + 1; + WorkList.push_back(PredSU); + AllPredsKnown = false; + break; + } + } - if (SethiUllmanNumber == 0) - SethiUllmanNumber = 1; + if (!AllPredsKnown) + continue; - return SethiUllmanNumber; + // Once all preds are known, we can calculate the answer for this one. 
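+    // (That answer is the maximum Sethi-Ullman number over all preds, plus
+    // one for every additional pred that ties that maximum.)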
+ unsigned SethiUllmanNumber = 0; + unsigned Extra = 0; + for (const SDep &Pred : TempSU->Preds) { + if (Pred.isCtrl()) continue; // ignore chain preds + SUnit *PredSU = Pred.getSUnit(); + unsigned PredSethiUllman = SUNumbers[PredSU->NodeNum]; + assert(PredSethiUllman > 0 && "We should have evaluated this pred!"); + if (PredSethiUllman > SethiUllmanNumber) { + SethiUllmanNumber = PredSethiUllman; + Extra = 0; + } else if (PredSethiUllman == SethiUllmanNumber) + ++Extra; + } + + SethiUllmanNumber += Extra; + if (SethiUllmanNumber == 0) + SethiUllmanNumber = 1; + SUNumbers[TempSU->NodeNum] = SethiUllmanNumber; + WorkList.pop_back(); + } + + assert(SUNumbers[SU->NodeNum] > 0 && "SethiUllman should never be zero!"); + return SUNumbers[SU->NodeNum]; } /// CalculateSethiUllmanNumbers - Calculate Sethi-Ullman numbers of all diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 7abdc76cb004f..98553152117d1 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -4897,6 +4897,8 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, // TODO: In the AlwaysInline case, if the size is big then generate a loop // rather than maybe a humongous number of loads and stores. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + const DataLayout &DL = DAG.getDataLayout(); + LLVMContext &C = *DAG.getContext(); std::vector<EVT> MemOps; bool DstAlignCanChange = false; MachineFunction &MF = DAG.getMachineFunction(); @@ -4923,15 +4925,15 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, return SDValue(); if (DstAlignCanChange) { - Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext()); - unsigned NewAlign = (unsigned)DAG.getDataLayout().getABITypeAlignment(Ty); + Type *Ty = MemOps[0].getTypeForEVT(C); + unsigned NewAlign = (unsigned)DL.getABITypeAlignment(Ty); // Don't promote to an alignment that would require dynamic stack // realignment. const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); if (!TRI->needsStackRealignment(MF)) while (NewAlign > Align && - DAG.getDataLayout().exceedsNaturalStackAlignment(NewAlign)) + DL.exceedsNaturalStackAlignment(NewAlign)) NewAlign /= 2; if (NewAlign > Align) { @@ -4991,12 +4993,19 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, // thing to do is generate a LoadExt/StoreTrunc pair. These simplify // to Load/Store if NVT==VT. // FIXME does the case above also need this? - EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); + EVT NVT = TLI.getTypeToTransformTo(C, VT); assert(NVT.bitsGE(VT)); + + bool isDereferenceable = + SrcPtrInfo.getWithOffset(SrcOff).isDereferenceable(VTSize, C, DL); + MachineMemOperand::Flags SrcMMOFlags = MMOFlags; + if (isDereferenceable) + SrcMMOFlags |= MachineMemOperand::MODereferenceable; + Value = DAG.getExtLoad(ISD::EXTLOAD, dl, NVT, Chain, DAG.getMemBasePlusOffset(Src, SrcOff, dl), SrcPtrInfo.getWithOffset(SrcOff), VT, - MinAlign(SrcAlign, SrcOff), MMOFlags); + MinAlign(SrcAlign, SrcOff), SrcMMOFlags); OutChains.push_back(Value.getValue(1)); Store = DAG.getTruncStore( Chain, dl, Value, DAG.getMemBasePlusOffset(Dst, DstOff, dl), @@ -5024,6 +5033,8 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, // Expand memmove to a series of load and store ops if the size operand falls // below a certain threshold. 
const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + const DataLayout &DL = DAG.getDataLayout(); + LLVMContext &C = *DAG.getContext(); std::vector<EVT> MemOps; bool DstAlignCanChange = false; MachineFunction &MF = DAG.getMachineFunction(); @@ -5046,8 +5057,8 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, return SDValue(); if (DstAlignCanChange) { - Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext()); - unsigned NewAlign = (unsigned)DAG.getDataLayout().getABITypeAlignment(Ty); + Type *Ty = MemOps[0].getTypeForEVT(C); + unsigned NewAlign = (unsigned)DL.getABITypeAlignment(Ty); if (NewAlign > Align) { // Give the stack frame object a larger alignment if needed. if (MFI.getObjectAlignment(FI->getIndex()) < NewAlign) @@ -5068,9 +5079,15 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, unsigned VTSize = VT.getSizeInBits() / 8; SDValue Value; + bool isDereferenceable = + SrcPtrInfo.getWithOffset(SrcOff).isDereferenceable(VTSize, C, DL); + MachineMemOperand::Flags SrcMMOFlags = MMOFlags; + if (isDereferenceable) + SrcMMOFlags |= MachineMemOperand::MODereferenceable; + Value = DAG.getLoad(VT, dl, Chain, DAG.getMemBasePlusOffset(Src, SrcOff, dl), - SrcPtrInfo.getWithOffset(SrcOff), SrcAlign, MMOFlags); + SrcPtrInfo.getWithOffset(SrcOff), SrcAlign, SrcMMOFlags); LoadValues.push_back(Value); LoadChains.push_back(Value.getValue(1)); SrcOff += VTSize; diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp new file mode 100644 index 0000000000000..d2e0dbbf88ecd --- /dev/null +++ b/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp @@ -0,0 +1,95 @@ +//===-- llvm/CodeGen/SelectionDAGAddressAnalysis.cpp ------- DAG Address +//Analysis ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// + +#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h" +#include "llvm/CodeGen/ISDOpcodes.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" + +namespace llvm { + +bool BaseIndexOffset::equalBaseIndex(BaseIndexOffset &Other, + const SelectionDAG &DAG, int64_t &Off) { + // Obvious equivalent + Off = Other.Offset - Offset; + if (Other.Base == Base && Other.Index == Index && + Other.IsIndexSignExt == IsIndexSignExt) + return true; + + // Match GlobalAddresses + if (Index == Other.Index) + if (GlobalAddressSDNode *A = dyn_cast<GlobalAddressSDNode>(Base)) + if (GlobalAddressSDNode *B = dyn_cast<GlobalAddressSDNode>(Other.Base)) + if (A->getGlobal() == B->getGlobal()) { + Off += B->getOffset() - A->getOffset(); + return true; + } + + // TODO: we should be able to add FrameIndex analysis improvements here. + + return false; +} + +/// Parses tree in Ptr for base, index, offset addresses. +BaseIndexOffset BaseIndexOffset::match(SDValue Ptr) { + // (((B + I*M) + c)) + c ... 
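+  // i.e. a run of constant adds wrapped around an optional (base + index)
+  // pair: constant offsets fold into Offset, and sign extensions of the
+  // index are stripped and recorded in IsIndexSignExt.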
+ SDValue Base = Ptr; + SDValue Index = SDValue(); + int64_t Offset = 0; + bool IsIndexSignExt = false; + + // Consume constant adds + while (Base->getOpcode() == ISD::ADD && + isa<ConstantSDNode>(Base->getOperand(1))) { + int64_t POffset = cast<ConstantSDNode>(Base->getOperand(1))->getSExtValue(); + Offset += POffset; + Base = Base->getOperand(0); + } + + if (Base->getOpcode() == ISD::ADD) { + // TODO: The following code appears to be needless as it just + // bails on some Ptrs early, reducing the cases where we + // find equivalence. We should be able to remove this. + // Inside a loop the current BASE pointer is calculated using an ADD and a + // MUL instruction. In this case Base is the actual BASE pointer. + // (i64 add (i64 %array_ptr) + // (i64 mul (i64 %induction_var) + // (i64 %element_size))) + if (Base->getOperand(1)->getOpcode() == ISD::MUL) + return BaseIndexOffset(Base, Index, Offset, IsIndexSignExt); + + // Look at Base + Index + Offset cases. + Index = Base->getOperand(1); + SDValue PotentialBase = Base->getOperand(0); + + // Skip signextends. + if (Index->getOpcode() == ISD::SIGN_EXTEND) { + Index = Index->getOperand(0); + IsIndexSignExt = true; + } + + // Check if Index Offset pattern + if (Index->getOpcode() != ISD::ADD || + !isa<ConstantSDNode>(Index->getOperand(1))) + return BaseIndexOffset(PotentialBase, Index, Offset, IsIndexSignExt); + + Offset += cast<ConstantSDNode>(Index->getOperand(1))->getSExtValue(); + Index = Index->getOperand(0); + if (Index->getOpcode() == ISD::SIGN_EXTEND) { + Index = Index->getOperand(0); + IsIndexSignExt = true; + } else + IsIndexSignExt = false; + Base = PotentialBase; + } + return BaseIndexOffset(Base, Index, Offset, IsIndexSignExt); +} +} // end namespace llvm diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index dcccd17bb98ea..f711ca71f79fe 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -337,12 +337,13 @@ void SelectionDAGISel::getAnalysisUsage(AnalysisUsage &AU) const { /// SplitCriticalSideEffectEdges - Look for critical edges with a PHI value that /// may trap on it. In this case we have to split the edge so that the path /// through the predecessor block that doesn't go to the phi block doesn't -/// execute the possibly trapping instruction. If available, we pass a -/// dominator tree to be updated when we split critical edges. This is because -/// SelectionDAGISel preserves the DominatorTree. +/// execute the possibly trapping instruction. If available, we pass domtree +/// and loop info to be updated when we split critical edges. This is because +/// SelectionDAGISel preserves these analyses. /// This is required for correctness, so it must be done at -O0. /// -static void SplitCriticalSideEffectEdges(Function &Fn, DominatorTree *DT) { +static void SplitCriticalSideEffectEdges(Function &Fn, DominatorTree *DT, + LoopInfo *LI) { // Loop for blocks with phi nodes. for (BasicBlock &BB : Fn) { PHINode *PN = dyn_cast<PHINode>(BB.begin()); @@ -368,7 +369,7 @@ static void SplitCriticalSideEffectEdges(Function &Fn, DominatorTree *DT) { // Okay, we have to split this edge. 
SplitCriticalEdge( Pred->getTerminator(), GetSuccessorNumber(Pred, &BB), - CriticalEdgeSplittingOptions(DT).setMergeIdenticalEdges()); + CriticalEdgeSplittingOptions(DT, LI).setMergeIdenticalEdges()); goto ReprocessBlock; } } @@ -406,10 +407,12 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { ORE = make_unique<OptimizationRemarkEmitter>(&Fn); auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr; + auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>(); + LoopInfo *LI = LIWP ? &LIWP->getLoopInfo() : nullptr; DEBUG(dbgs() << "\n\n\n=== " << Fn.getName() << "\n"); - SplitCriticalSideEffectEdges(const_cast<Function &>(Fn), DT); + SplitCriticalSideEffectEdges(const_cast<Function &>(Fn), DT, LI); CurDAG->init(*MF, *ORE); FuncInfo->set(Fn, *MF, CurDAG); diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp index cfda0fffd031a..8652df7bbd706 100644 --- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -365,10 +365,10 @@ bool TargetLowering::ShrinkDemandedConstant(SDValue Op, const APInt &Demanded, // If this is a 'not' op, don't touch it because that's a canonical form. const APInt &C = Op1C->getAPIntValue(); - if (Opcode == ISD::XOR && (C | ~Demanded).isAllOnesValue()) + if (Opcode == ISD::XOR && Demanded.isSubsetOf(C)) return false; - if (C.intersects(~Demanded)) { + if (!C.isSubsetOf(Demanded)) { EVT VT = Op.getValueType(); SDValue NewC = DAG.getConstant(Demanded & C, DL, VT); SDValue NewOp = DAG.getNode(Opcode, DL, VT, Op.getOperand(0), NewC); @@ -919,7 +919,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, // always convert this into a logical shr, even if the shift amount is // variable. The low bit of the shift cannot be an input sign bit unless // the shift amount is >= the size of the datatype, which is undefined. - if (NewMask == 1) + if (NewMask.isOneValue()) return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, Op.getValueType(), Op.getOperand(0), Op.getOperand(1))); @@ -1349,7 +1349,7 @@ bool TargetLowering::isConstTrueVal(const SDNode *N) const { case UndefinedBooleanContent: return CVal[0]; case ZeroOrOneBooleanContent: - return CVal == 1; + return CVal.isOneValue(); case ZeroOrNegativeOneBooleanContent: return CVal.isAllOnesValue(); } @@ -1506,7 +1506,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, // If the LHS is '(srl (ctlz x), 5)', the RHS is 0/1, and this is an // equality comparison, then we're just comparing whether X itself is // zero. 
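// (The shift amount is log2 of the bit width, so for i32 the srl yields 1
// exactly when ctlz returned 32, i.e. when every bit of X is clear.)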
- if (N0.getOpcode() == ISD::SRL && (C1 == 0 || C1 == 1) && + if (N0.getOpcode() == ISD::SRL && (C1.isNullValue() || C1.isOneValue()) && N0.getOperand(0).getOpcode() == ISD::CTLZ && N0.getOperand(1).getOpcode() == ISD::Constant) { const APInt &ShAmt @@ -1666,7 +1666,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, for (unsigned width = origWidth / 2; width>=8; width /= 2) { APInt newMask = APInt::getLowBitsSet(maskWidth, width); for (unsigned offset=0; offset<origWidth/width; offset++) { - if ((newMask & Mask) == Mask) { + if (Mask.isSubsetOf(newMask)) { if (DAG.getDataLayout().isLittleEndian()) bestOffset = (uint64_t)offset * (width/8); else @@ -1785,12 +1785,12 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ExtSrcTyBits), dl, ExtDstTy), Cond); - } else if ((N1C->isNullValue() || N1C->getAPIntValue() == 1) && + } else if ((N1C->isNullValue() || N1C->isOne()) && (Cond == ISD::SETEQ || Cond == ISD::SETNE)) { // SETCC (SETCC), [0|1], [EQ|NE] -> SETCC if (N0.getOpcode() == ISD::SETCC && isTypeLegal(VT) && VT.bitsLE(N0.getValueType())) { - bool TrueWhenTrue = (Cond == ISD::SETEQ) ^ (N1C->getAPIntValue() != 1); + bool TrueWhenTrue = (Cond == ISD::SETEQ) ^ (!N1C->isOne()); if (TrueWhenTrue) return DAG.getNode(ISD::TRUNCATE, dl, VT, N0); // Invert the condition. @@ -1807,7 +1807,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, N0.getOperand(0).getOpcode() == ISD::XOR && N0.getOperand(1) == N0.getOperand(0).getOperand(1))) && isa<ConstantSDNode>(N0.getOperand(1)) && - cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue() == 1) { + cast<ConstantSDNode>(N0.getOperand(1))->isOne()) { // If this is (X^1) == 0/1, swap the RHS and eliminate the xor. We // can only do this if the top bits are known zero. unsigned BitWidth = N0.getValueSizeInBits(); @@ -1830,7 +1830,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, return DAG.getSetCC(dl, VT, Val, N1, Cond == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ); } - } else if (N1C->getAPIntValue() == 1 && + } else if (N1C->isOne() && (VT == MVT::i1 || getBooleanContents(N0->getValueType(0)) == ZeroOrOneBooleanContent)) { @@ -1848,7 +1848,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, } if (Op0.getOpcode() == ISD::AND && isa<ConstantSDNode>(Op0.getOperand(1)) && - cast<ConstantSDNode>(Op0.getOperand(1))->getAPIntValue() == 1) { + cast<ConstantSDNode>(Op0.getOperand(1))->isOne()) { // If this is (X&1) == / != 1, normalize it to (X&1) != / == 0. if (Op0.getValueType().bitsGT(VT)) Op0 = DAG.getNode(ISD::AND, dl, VT, @@ -2482,7 +2482,7 @@ void TargetLowering::LowerAsmOperandForConstraint(SDValue Op, // gcc prints these as sign extended. Sign extend value to 64 bits // now; without this it would get ZExt'd later in // ScheduleDAGSDNodes::EmitNode, which is very generic. 
-      Ops.push_back(DAG.getTargetConstant(C->getAPIntValue().getSExtValue(),
+      Ops.push_back(DAG.getTargetConstant(C->getSExtValue(),
                                           SDLoc(C), MVT::i64));
     }
     return;
diff --git a/lib/CodeGen/TargetRegisterInfo.cpp b/lib/CodeGen/TargetRegisterInfo.cpp
index c8537ad2f3130..eeb00a784b0d9 100644
--- a/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/lib/CodeGen/TargetRegisterInfo.cpp
@@ -1,4 +1,4 @@
-//===- TargetRegisterInfo.cpp - Target Register Information Implementation ===//
+//==- TargetRegisterInfo.cpp - Target Register Information Implementation --==//
 //
 // The LLVM Compiler Infrastructure
 //
@@ -11,17 +11,27 @@
 //
 //===----------------------------------------------------------------------===//
-#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
 #include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/IR/Attributes.h"
 #include "llvm/IR/Function.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/Format.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Printable.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <cassert>
+#include <utility>
 #define DEBUG_TYPE "target-reg-info"
@@ -38,7 +48,7 @@ TargetRegisterInfo::TargetRegisterInfo(const TargetRegisterInfoDesc *ID,
     CoveringLanes(SRICoveringLanes) {
 }
-TargetRegisterInfo::~TargetRegisterInfo() {}
+TargetRegisterInfo::~TargetRegisterInfo() = default;
 void TargetRegisterInfo::markSuperRegs(BitVector &RegisterSet,
                                        unsigned Reg) const {
@@ -126,7 +136,7 @@ Printable PrintVRegOrUnit(unsigned Unit, const TargetRegisterInfo *TRI) {
   });
 }
-} // End of llvm namespace
+} // end namespace llvm
 /// getAllocatableClass - Return the maximal subclass of the given register
 /// class that is allocatable, or NULL.
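The TargetLowering.cpp hunks above swap hand-rolled mask arithmetic for APInt's set-style predicates. The rewrites rely on two identities: (C | ~Demanded) is all-ones exactly when every demanded bit is already set in C, i.e. Demanded.isSubsetOf(C); and C.intersects(~Demanded) holds exactly when C has a bit outside Demanded, i.e. !C.isSubsetOf(Demanded). A minimal standalone sketch (not part of the patch; it only assumes LLVM's APInt header) checking both equivalences:

#include "llvm/ADT/APInt.h"
#include <cassert>

int main() {
  using llvm::APInt;
  // Demanded = 0b00001111, C = 0b00001011: C stays inside the demanded
  // bits, but does not cover all of them.
  APInt Demanded(8, 0x0F), C(8, 0x0B);
  // Old spelling on the left, new spelling on the right.
  assert((C | ~Demanded).isAllOnesValue() == Demanded.isSubsetOf(C));
  assert(C.intersects(~Demanded) == !C.isSubsetOf(Demanded));
  return 0;
}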
diff --git a/lib/CodeGen/TargetSubtargetInfo.cpp b/lib/CodeGen/TargetSubtargetInfo.cpp index 82e85bab14747..f6d5bc80ddffb 100644 --- a/lib/CodeGen/TargetSubtargetInfo.cpp +++ b/lib/CodeGen/TargetSubtargetInfo.cpp @@ -1,4 +1,4 @@ -//===-- TargetSubtargetInfo.cpp - General Target Information ---------------==// +//===- TargetSubtargetInfo.cpp - General Target Information ----------------==// // // The LLVM Compiler Infrastructure // @@ -11,15 +11,17 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Target/TargetSubtargetInfo.h" +#include "llvm/ADT/Optional.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/TargetSchedule.h" +#include "llvm/MC/MCInst.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" +#include <string> + using namespace llvm; -//--------------------------------------------------------------------------- -// TargetSubtargetInfo Class -// TargetSubtargetInfo::TargetSubtargetInfo( const Triple &TT, StringRef CPU, StringRef FS, ArrayRef<SubtargetFeatureKV> PF, ArrayRef<SubtargetFeatureKV> PD, @@ -29,7 +31,7 @@ TargetSubtargetInfo::TargetSubtargetInfo( : MCSubtargetInfo(TT, CPU, FS, PF, PD, ProcSched, WPR, WL, RA, IS, OC, FP) { } -TargetSubtargetInfo::~TargetSubtargetInfo() {} +TargetSubtargetInfo::~TargetSubtargetInfo() = default; bool TargetSubtargetInfo::enableAtomicExpand() const { return true; diff --git a/lib/DebugInfo/CodeView/CMakeLists.txt b/lib/DebugInfo/CodeView/CMakeLists.txt index f916695a84392..b94bb0c80c793 100644 --- a/lib/DebugInfo/CodeView/CMakeLists.txt +++ b/lib/DebugInfo/CodeView/CMakeLists.txt @@ -24,11 +24,10 @@ add_llvm_library(LLVMDebugInfoCodeView SymbolRecordMapping.cpp SymbolDumper.cpp SymbolSerializer.cpp - TypeDatabase.cpp - TypeDatabaseVisitor.cpp TypeDumpVisitor.cpp TypeIndex.cpp TypeIndexDiscovery.cpp + TypeName.cpp TypeRecordMapping.cpp TypeSerializer.cpp TypeStreamMerger.cpp diff --git a/lib/DebugInfo/CodeView/CVTypeVisitor.cpp b/lib/DebugInfo/CodeView/CVTypeVisitor.cpp index f0debd9e97023..22f166a2335d6 100644 --- a/lib/DebugInfo/CodeView/CVTypeVisitor.cpp +++ b/lib/DebugInfo/CodeView/CVTypeVisitor.cpp @@ -12,8 +12,6 @@ #include "llvm/ADT/TinyPtrVector.h" #include "llvm/DebugInfo/CodeView/CodeViewError.h" #include "llvm/DebugInfo/CodeView/TypeCollection.h" -#include "llvm/DebugInfo/CodeView/TypeDatabase.h" -#include "llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h" #include "llvm/DebugInfo/CodeView/TypeDeserializer.h" #include "llvm/DebugInfo/CodeView/TypeRecordMapping.h" #include "llvm/DebugInfo/CodeView/TypeServerHandler.h" diff --git a/lib/DebugInfo/CodeView/DebugSubsectionRecord.cpp b/lib/DebugInfo/CodeView/DebugSubsectionRecord.cpp index 334c5e002bbca..d69eca018e0c1 100644 --- a/lib/DebugInfo/CodeView/DebugSubsectionRecord.cpp +++ b/lib/DebugInfo/CodeView/DebugSubsectionRecord.cpp @@ -53,12 +53,16 @@ DebugSubsectionRecordBuilder::DebugSubsectionRecordBuilder( std::shared_ptr<DebugSubsection> Subsection, CodeViewContainer Container) : Subsection(std::move(Subsection)), Container(Container) {} +DebugSubsectionRecordBuilder::DebugSubsectionRecordBuilder( + const DebugSubsectionRecord &Contents, CodeViewContainer Container) + : Contents(Contents), Container(Container) {} + uint32_t DebugSubsectionRecordBuilder::calculateSerializedLength() { - // The length of the entire subsection is always padded to 4 bytes, regardless - // of the container kind. 
- uint32_t Size = sizeof(DebugSubsectionHeader) + - alignTo(Subsection->calculateSerializedSize(), 4); - return Size; + uint32_t DataSize = Subsection ? Subsection->calculateSerializedSize() + : Contents.getRecordData().getLength(); + // The length of the entire subsection is always padded to 4 bytes, + // regardless of the container kind. + return sizeof(DebugSubsectionHeader) + alignTo(DataSize, 4); } Error DebugSubsectionRecordBuilder::commit(BinaryStreamWriter &Writer) const { @@ -66,16 +70,22 @@ Error DebugSubsectionRecordBuilder::commit(BinaryStreamWriter &Writer) const { "Debug Subsection not properly aligned"); DebugSubsectionHeader Header; - Header.Kind = uint32_t(Subsection->kind()); + Header.Kind = uint32_t(Subsection ? Subsection->kind() : Contents.kind()); // The value written into the Header's Length field is only padded to the // container's alignment - Header.Length = - alignTo(Subsection->calculateSerializedSize(), alignOf(Container)); + uint32_t DataSize = Subsection ? Subsection->calculateSerializedSize() + : Contents.getRecordData().getLength(); + Header.Length = alignTo(DataSize, alignOf(Container)); if (auto EC = Writer.writeObject(Header)) return EC; - if (auto EC = Subsection->commit(Writer)) - return EC; + if (Subsection) { + if (auto EC = Subsection->commit(Writer)) + return EC; + } else { + if (auto EC = Writer.writeStreamRef(Contents.getRecordData())) + return EC; + } if (auto EC = Writer.padToAlignment(4)) return EC; diff --git a/lib/DebugInfo/CodeView/EnumTables.cpp b/lib/DebugInfo/CodeView/EnumTables.cpp index 01d8ccf2d31e8..ec00af28395e5 100644 --- a/lib/DebugInfo/CodeView/EnumTables.cpp +++ b/lib/DebugInfo/CodeView/EnumTables.cpp @@ -82,6 +82,13 @@ static const EnumEntry<uint16_t> RegisterNames[] = { CV_ENUM_CLASS_ENT(RegisterId, R15), }; +static const EnumEntry<uint32_t> PublicSymFlagNames[] = { + CV_ENUM_CLASS_ENT(PublicSymFlags, Code), + CV_ENUM_CLASS_ENT(PublicSymFlags, Function), + CV_ENUM_CLASS_ENT(PublicSymFlags, Managed), + CV_ENUM_CLASS_ENT(PublicSymFlags, MSIL), +}; + static const EnumEntry<uint8_t> ProcSymFlagNames[] = { CV_ENUM_CLASS_ENT(ProcSymFlags, HasFP), CV_ENUM_CLASS_ENT(ProcSymFlags, HasIRET), @@ -338,6 +345,9 @@ ArrayRef<EnumEntry<uint16_t>> getRegisterNames() { return makeArrayRef(RegisterNames); } +ArrayRef<EnumEntry<uint32_t>> getPublicSymFlagNames() { + return makeArrayRef(PublicSymFlagNames); +} ArrayRef<EnumEntry<uint8_t>> getProcSymFlagNames() { return makeArrayRef(ProcSymFlagNames); } diff --git a/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp b/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp index 39eb4099ce9e6..20f7e72c3af39 100644 --- a/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp +++ b/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp @@ -11,7 +11,7 @@ #include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" #include "llvm/DebugInfo/CodeView/CodeViewError.h" -#include "llvm/DebugInfo/CodeView/TypeDatabase.h" +#include "llvm/DebugInfo/CodeView/TypeName.h" #include "llvm/DebugInfo/CodeView/TypeServerHandler.h" #include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h" @@ -31,15 +31,13 @@ LazyRandomTypeCollection::LazyRandomTypeCollection(uint32_t RecordCountHint) LazyRandomTypeCollection::LazyRandomTypeCollection( const CVTypeArray &Types, uint32_t RecordCountHint, PartialOffsetArray PartialOffsets) - : Database(RecordCountHint), Types(Types), DatabaseVisitor(Database), - PartialOffsets(PartialOffsets) { - KnownOffsets.resize(Database.capacity()); + : NameStorage(Allocator), Types(Types), 
PartialOffsets(PartialOffsets) { + Records.resize(RecordCountHint); } LazyRandomTypeCollection::LazyRandomTypeCollection(ArrayRef<uint8_t> Data, uint32_t RecordCountHint) : LazyRandomTypeCollection(RecordCountHint) { - reset(Data); } LazyRandomTypeCollection::LazyRandomTypeCollection(StringRef Data, @@ -52,50 +50,88 @@ LazyRandomTypeCollection::LazyRandomTypeCollection(const CVTypeArray &Types, uint32_t NumRecords) : LazyRandomTypeCollection(Types, NumRecords, PartialOffsetArray()) {} -void LazyRandomTypeCollection::reset(StringRef Data) { - reset(makeArrayRef(Data.bytes_begin(), Data.bytes_end())); -} - -void LazyRandomTypeCollection::reset(ArrayRef<uint8_t> Data) { +void LazyRandomTypeCollection::reset(StringRef Data, uint32_t RecordCountHint) { + Count = 0; PartialOffsets = PartialOffsetArray(); BinaryStreamReader Reader(Data, support::little); error(Reader.readArray(Types, Reader.getLength())); - KnownOffsets.resize(Database.capacity()); + // Clear and then resize, to make sure existing data gets destroyed. + Records.clear(); + Records.resize(RecordCountHint); +} + +void LazyRandomTypeCollection::reset(ArrayRef<uint8_t> Data, + uint32_t RecordCountHint) { + reset(toStringRef(Data), RecordCountHint); +} + +uint32_t LazyRandomTypeCollection::getOffsetOfType(TypeIndex Index) { + error(ensureTypeExists(Index)); + assert(contains(Index)); + + return Records[Index.toArrayIndex()].Offset; } CVType LazyRandomTypeCollection::getType(TypeIndex Index) { error(ensureTypeExists(Index)); - return Database.getTypeRecord(Index); + assert(contains(Index)); + + return Records[Index.toArrayIndex()].Type; } StringRef LazyRandomTypeCollection::getTypeName(TypeIndex Index) { - if (!Index.isSimple()) { - // Try to make sure the type exists. Even if it doesn't though, it may be - // because we're dumping a symbol stream with no corresponding type stream - // present, in which case we still want to be able to print <unknown UDT> - // for the type names. - consumeError(ensureTypeExists(Index)); + if (Index.isNoneType() || Index.isSimple()) + return TypeIndex::simpleTypeName(Index); + + // Try to make sure the type exists. Even if it doesn't though, it may be + // because we're dumping a symbol stream with no corresponding type stream + // present, in which case we still want to be able to print <unknown UDT> + // for the type names. 
+ if (auto EC = ensureTypeExists(Index)) { + consumeError(std::move(EC)); + return "<unknown UDT>"; } - return Database.getTypeName(Index); + uint32_t I = Index.toArrayIndex(); + ensureCapacityFor(Index); + if (Records[I].Name.data() == nullptr) { + StringRef Result = NameStorage.save(computeTypeName(*this, Index)); + Records[I].Name = Result; + } + return Records[I].Name; } bool LazyRandomTypeCollection::contains(TypeIndex Index) { - return Database.contains(Index); + if (Records.size() <= Index.toArrayIndex()) + return false; + if (!Records[Index.toArrayIndex()].Type.valid()) + return false; + return true; } -uint32_t LazyRandomTypeCollection::size() { return Database.size(); } +uint32_t LazyRandomTypeCollection::size() { return Count; } -uint32_t LazyRandomTypeCollection::capacity() { return Database.capacity(); } +uint32_t LazyRandomTypeCollection::capacity() { return Records.size(); } Error LazyRandomTypeCollection::ensureTypeExists(TypeIndex TI) { - if (!Database.contains(TI)) { - if (auto EC = visitRangeForType(TI)) - return EC; - } - return Error::success(); + if (contains(TI)) + return Error::success(); + + return visitRangeForType(TI); +} + +void LazyRandomTypeCollection::ensureCapacityFor(TypeIndex Index) { + uint32_t MinSize = Index.toArrayIndex() + 1; + + if (MinSize <= capacity()) + return; + + uint32_t NewCapacity = MinSize * 3 / 2; + + assert(NewCapacity > capacity()); + Records.resize(NewCapacity); } Error LazyRandomTypeCollection::visitRangeForType(TypeIndex TI) { @@ -111,7 +147,7 @@ Error LazyRandomTypeCollection::visitRangeForType(TypeIndex TI) { auto Prev = std::prev(Next); TypeIndex TIB = Prev->Type; - if (Database.contains(TIB)) { + if (contains(TIB)) { // They've asked us to fetch a type index, but the entry we found in the // partial offsets array has already been visited. Since we visit an entire // block every time, that means this record should have been previously @@ -122,13 +158,12 @@ Error LazyRandomTypeCollection::visitRangeForType(TypeIndex TI) { TypeIndex TIE; if (Next == PartialOffsets.end()) { - TIE = TypeIndex::fromArrayIndex(Database.capacity()); + TIE = TypeIndex::fromArrayIndex(capacity()); } else { TIE = Next->Type; } - if (auto EC = visitRange(TIB, Prev->Offset, TIE)) - return EC; + visitRange(TIB, Prev->Offset, TIE); return Error::success(); } @@ -157,34 +192,31 @@ Error LazyRandomTypeCollection::fullScanForType(TypeIndex TI) { assert(PartialOffsets.empty()); TypeIndex CurrentTI = TypeIndex::fromArrayIndex(0); - uint32_t Offset = 0; auto Begin = Types.begin(); - if (!Database.empty()) { + if (Count > 0) { // In the case of type streams which we don't know the number of records of, // it's possible to search for a type index triggering a full scan, but then // later additional records are added since we didn't know how many there // would be until we did a full visitation, then you try to access the new // type triggering another full scan. To avoid this, we assume that if the - // database has some records, this must be what's going on. So we ask the - // database for the largest type index less than the one we're searching for - // and only do the forward scan from there. - auto Prev = Database.largestTypeIndexLessThan(TI); - assert(Prev.hasValue() && "Empty database with valid types?"); - Offset = KnownOffsets[Prev->toArrayIndex()]; - CurrentTI = *Prev; - ++CurrentTI; + // database has some records, this must be what's going on. 
We can also
+    // assume that this index must be larger than the largest type index we've
+    // visited, so we start from there and scan forward.
+    uint32_t Offset = Records[LargestTypeIndex.toArrayIndex()].Offset;
+    CurrentTI = LargestTypeIndex + 1;
     Begin = Types.at(Offset);
     ++Begin;
-    Offset = Begin.offset();
   }
   auto End = Types.end();
   while (Begin != End) {
-    if (auto EC = visitOneRecord(CurrentTI, Offset, *Begin))
-      return EC;
-
-    Offset += Begin.getRecordLength();
+    ensureCapacityFor(CurrentTI);
+    LargestTypeIndex = std::max(LargestTypeIndex, CurrentTI);
+    auto Idx = CurrentTI.toArrayIndex();
+    Records[Idx].Type = *Begin;
+    Records[Idx].Offset = Begin.offset();
+    ++Count;
     ++Begin;
     ++CurrentTI;
   }
@@ -194,36 +226,19 @@ Error LazyRandomTypeCollection::fullScanForType(TypeIndex TI) {
   return Error::success();
 }
-Error LazyRandomTypeCollection::visitRange(TypeIndex Begin,
-                                           uint32_t BeginOffset,
-                                           TypeIndex End) {
-
+void LazyRandomTypeCollection::visitRange(TypeIndex Begin, uint32_t BeginOffset,
+                                          TypeIndex End) {
   auto RI = Types.at(BeginOffset);
   assert(RI != Types.end());
+  ensureCapacityFor(End);
   while (Begin != End) {
-    if (auto EC = visitOneRecord(Begin, BeginOffset, *RI))
-      return EC;
-
-    BeginOffset += RI.getRecordLength();
+    LargestTypeIndex = std::max(LargestTypeIndex, Begin);
+    auto Idx = Begin.toArrayIndex();
+    Records[Idx].Type = *RI;
+    Records[Idx].Offset = RI.offset();
+    ++Count;
     ++Begin;
     ++RI;
   }
-
-  return Error::success();
-}
-
-Error LazyRandomTypeCollection::visitOneRecord(TypeIndex TI, uint32_t Offset,
-                                               CVType &Record) {
-  assert(!Database.contains(TI));
-  if (auto EC = codeview::visitTypeRecord(Record, TI, DatabaseVisitor))
-    return EC;
-  // Keep the KnownOffsets array the same size as the Database's capacity. Since
-  // we don't always know how many records are in the type stream, we need to be
-  // prepared for the database growing and receiving a type index that can't fit
-  // in our current buffer.
- if (KnownOffsets.size() < Database.capacity()) - KnownOffsets.resize(Database.capacity()); - KnownOffsets[TI.toArrayIndex()] = Offset; - return Error::success(); } diff --git a/lib/DebugInfo/CodeView/SymbolDumper.cpp b/lib/DebugInfo/CodeView/SymbolDumper.cpp index 36abafc079edb..b9fa9b6a6ad7e 100644 --- a/lib/DebugInfo/CodeView/SymbolDumper.cpp +++ b/lib/DebugInfo/CodeView/SymbolDumper.cpp @@ -524,7 +524,7 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, PublicSym32 &Public) { DictScope S(W, "PublicSym"); - printTypeIndex("Type", Public.Index); + W.printFlags("Flags", uint32_t(Public.Flags), getPublicSymFlagNames()); W.printNumber("Seg", Public.Segment); W.printNumber("Off", Public.Offset); W.printString("Name", Public.Name); diff --git a/lib/DebugInfo/CodeView/SymbolRecordMapping.cpp b/lib/DebugInfo/CodeView/SymbolRecordMapping.cpp index d731dc1b0a372..923837a45d9fc 100644 --- a/lib/DebugInfo/CodeView/SymbolRecordMapping.cpp +++ b/lib/DebugInfo/CodeView/SymbolRecordMapping.cpp @@ -361,7 +361,7 @@ Error SymbolRecordMapping::visitKnownRecord(CVSymbol &CVR, Error SymbolRecordMapping::visitKnownRecord(CVSymbol &CVR, PublicSym32 &Public) { - error(IO.mapInteger(Public.Index)); + error(IO.mapEnum(Public.Flags)); error(IO.mapInteger(Public.Offset)); error(IO.mapInteger(Public.Segment)); error(IO.mapStringZ(Public.Name)); diff --git a/lib/DebugInfo/CodeView/TypeDatabase.cpp b/lib/DebugInfo/CodeView/TypeDatabase.cpp deleted file mode 100644 index 08f848b36a9d5..0000000000000 --- a/lib/DebugInfo/CodeView/TypeDatabase.cpp +++ /dev/null @@ -1,146 +0,0 @@ -//===- TypeDatabase.cpp --------------------------------------- *- C++ --*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#include "llvm/DebugInfo/CodeView/TypeDatabase.h" - -using namespace llvm; -using namespace llvm::codeview; - -TypeDatabase::TypeDatabase(uint32_t Capacity) : TypeNameStorage(Allocator) { - CVUDTNames.resize(Capacity); - TypeRecords.resize(Capacity); - ValidRecords.resize(Capacity); -} - -TypeIndex TypeDatabase::appendType(StringRef Name, const CVType &Data) { - LargestTypeIndex = getAppendIndex(); - if (LargestTypeIndex.toArrayIndex() >= capacity()) - grow(); - recordType(Name, LargestTypeIndex, Data); - return LargestTypeIndex; -} - -void TypeDatabase::recordType(StringRef Name, TypeIndex Index, - const CVType &Data) { - LargestTypeIndex = empty() ? Index : std::max(Index, LargestTypeIndex); - - if (LargestTypeIndex.toArrayIndex() >= capacity()) - grow(Index); - - uint32_t AI = Index.toArrayIndex(); - - assert(!contains(Index)); - assert(AI < capacity()); - - CVUDTNames[AI] = Name; - TypeRecords[AI] = Data; - ValidRecords.set(AI); - ++Count; -} - -/// Saves the name in a StringSet and creates a stable StringRef. 
-StringRef TypeDatabase::saveTypeName(StringRef TypeName) { - return TypeNameStorage.save(TypeName); -} - -StringRef TypeDatabase::getTypeName(TypeIndex Index) const { - if (Index.isNoneType() || Index.isSimple()) - return TypeIndex::simpleTypeName(Index); - - if (contains(Index)) - return CVUDTNames[Index.toArrayIndex()]; - - return "<unknown UDT>"; -} - -const CVType &TypeDatabase::getTypeRecord(TypeIndex Index) const { - assert(contains(Index)); - return TypeRecords[Index.toArrayIndex()]; -} - -CVType &TypeDatabase::getTypeRecord(TypeIndex Index) { - assert(contains(Index)); - return TypeRecords[Index.toArrayIndex()]; -} - -bool TypeDatabase::contains(TypeIndex Index) const { - uint32_t AI = Index.toArrayIndex(); - if (AI >= capacity()) - return false; - - return ValidRecords.test(AI); -} - -uint32_t TypeDatabase::size() const { return Count; } - -uint32_t TypeDatabase::capacity() const { return TypeRecords.size(); } - -CVType TypeDatabase::getType(TypeIndex Index) { return getTypeRecord(Index); } - -StringRef TypeDatabase::getTypeName(TypeIndex Index) { - return static_cast<const TypeDatabase *>(this)->getTypeName(Index); -} - -bool TypeDatabase::contains(TypeIndex Index) { - return static_cast<const TypeDatabase *>(this)->contains(Index); -} - -uint32_t TypeDatabase::size() { - return static_cast<const TypeDatabase *>(this)->size(); -} - -uint32_t TypeDatabase::capacity() { - return static_cast<const TypeDatabase *>(this)->capacity(); -} - -void TypeDatabase::grow() { grow(LargestTypeIndex + 1); } - -void TypeDatabase::grow(TypeIndex NewIndex) { - uint32_t NewSize = NewIndex.toArrayIndex() + 1; - - if (NewSize <= capacity()) - return; - - uint32_t NewCapacity = NewSize * 3 / 2; - - TypeRecords.resize(NewCapacity); - CVUDTNames.resize(NewCapacity); - ValidRecords.resize(NewCapacity); -} - -bool TypeDatabase::empty() const { return size() == 0; } - -Optional<TypeIndex> TypeDatabase::largestTypeIndexLessThan(TypeIndex TI) const { - uint32_t AI = TI.toArrayIndex(); - int N = ValidRecords.find_prev(AI); - if (N == -1) - return None; - return TypeIndex::fromArrayIndex(N); -} - -TypeIndex TypeDatabase::getAppendIndex() const { - if (empty()) - return TypeIndex::fromArrayIndex(0); - - return LargestTypeIndex + 1; -} - -Optional<TypeIndex> TypeDatabase::getFirst() { - int N = ValidRecords.find_first(); - if (N == -1) - return None; - return TypeIndex::fromArrayIndex(N); -} - -Optional<TypeIndex> TypeDatabase::getNext(TypeIndex Prev) { - int N = ValidRecords.find_next(Prev.toArrayIndex()); - if (N == -1) - return None; - return TypeIndex::fromArrayIndex(N); -} diff --git a/lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp b/lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp deleted file mode 100644 index 8d97f8b1cb401..0000000000000 --- a/lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp +++ /dev/null @@ -1,330 +0,0 @@ -//===- TypeDatabaseVisitor.cpp -------------------------------- *- C++ --*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#include "llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h" - -#include "llvm/ADT/SmallString.h" - -using namespace llvm; - -using namespace llvm::codeview; - -Error TypeDatabaseVisitor::visitTypeBegin(CVType &Record) { - assert(!IsInFieldList); - // Reset Name to the empty string. If the visitor sets it, we know it. 
- Name = ""; - - if (Record.Type == LF_FIELDLIST) { - // Record that we're in a field list so that members do not get assigned - // type indices. - IsInFieldList = true; - } - return Error::success(); -} - -Error TypeDatabaseVisitor::visitTypeBegin(CVType &Record, TypeIndex Index) { - if (auto EC = visitTypeBegin(Record)) - return EC; - - CurrentTypeIndex = Index; - return Error::success(); -} - -StringRef TypeDatabaseVisitor::getTypeName(TypeIndex Index) const { - return TypeDB->getTypeName(Index); -} - -StringRef TypeDatabaseVisitor::saveTypeName(StringRef Name) { - return TypeDB->saveTypeName(Name); -} - -Error TypeDatabaseVisitor::visitTypeEnd(CVType &CVR) { - if (CVR.Type == LF_FIELDLIST) { - assert(IsInFieldList); - IsInFieldList = false; - } - assert(!IsInFieldList); - - // Record every type that is not a field list member, even if Name is empty. - // CVUDTNames is indexed by type index, and must have one entry for every - // type. Field list members are not recorded, and are only referenced by - // their containing field list record. - if (CurrentTypeIndex) - TypeDB->recordType(Name, *CurrentTypeIndex, CVR); - else - TypeDB->appendType(Name, CVR); - - CurrentTypeIndex.reset(); - return Error::success(); -} - -Error TypeDatabaseVisitor::visitMemberBegin(CVMemberRecord &Record) { - assert(IsInFieldList); - // Reset Name to the empty string. If the visitor sets it, we know it. - Name = ""; - return Error::success(); -} - -Error TypeDatabaseVisitor::visitMemberEnd(CVMemberRecord &Record) { - assert(IsInFieldList); - return Error::success(); -} - -Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, - FieldListRecord &FieldList) { - Name = "<field list>"; - return Error::success(); -} - -Error TypeDatabaseVisitor::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, - StringIdRecord &String) { - // Put this in the database so it gets printed with LF_UDT_SRC_LINE. 
- Name = String.getString(); - return Error::success(); -} - -Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, ArgListRecord &Args) { - auto Indices = Args.getIndices(); - uint32_t Size = Indices.size(); - SmallString<256> TypeName("("); - for (uint32_t I = 0; I < Size; ++I) { - StringRef ArgTypeName = getTypeName(Indices[I]); - TypeName.append(ArgTypeName); - if (I + 1 != Size) - TypeName.append(", "); - } - TypeName.push_back(')'); - Name = saveTypeName(TypeName); - return Error::success(); -} - -Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, - StringListRecord &Strings) { - auto Indices = Strings.getIndices(); - uint32_t Size = Indices.size(); - SmallString<256> TypeName("\""); - for (uint32_t I = 0; I < Size; ++I) { - StringRef ArgTypeName = getTypeName(Indices[I]); - TypeName.append(ArgTypeName); - if (I + 1 != Size) - TypeName.append("\" \""); - } - TypeName.push_back('\"'); - Name = saveTypeName(TypeName); - return Error::success(); -} - -Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, ClassRecord &Class) { - Name = Class.getName(); - return Error::success(); -} - -Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, UnionRecord &Union) { - Name = Union.getName(); - return Error::success(); -} - -Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, EnumRecord &Enum) { - Name = Enum.getName(); - return Error::success(); -} - -Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, ArrayRecord &AT) { - Name = AT.getName(); - return Error::success(); -} - -Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, VFTableRecord &VFT) { - Name = VFT.getName(); - return Error::success(); -} - -Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, - MemberFuncIdRecord &Id) { - Name = Id.getName(); - return Error::success(); -} - -Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, - ProcedureRecord &Proc) { - StringRef ReturnTypeName = getTypeName(Proc.getReturnType()); - StringRef ArgListTypeName = getTypeName(Proc.getArgumentList()); - SmallString<256> TypeName(ReturnTypeName); - TypeName.push_back(' '); - TypeName.append(ArgListTypeName); - Name = saveTypeName(TypeName); - return Error::success(); -} - -Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, - MemberFunctionRecord &MF) { - StringRef ReturnTypeName = getTypeName(MF.getReturnType()); - StringRef ClassTypeName = getTypeName(MF.getClassType()); - StringRef ArgListTypeName = getTypeName(MF.getArgumentList()); - SmallString<256> TypeName(ReturnTypeName); - TypeName.push_back(' '); - TypeName.append(ClassTypeName); - TypeName.append("::"); - TypeName.append(ArgListTypeName); - Name = saveTypeName(TypeName); - return Error::success(); -} - -Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, FuncIdRecord &Func) { - Name = Func.getName(); - return Error::success(); -} - -Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, - TypeServer2Record &TS) { - Name = TS.getName(); - return Error::success(); -} - -Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, PointerRecord &Ptr) { - - if (Ptr.isPointerToMember()) { - const MemberPointerInfo &MI = Ptr.getMemberInfo(); - - StringRef PointeeName = getTypeName(Ptr.getReferentType()); - StringRef ClassName = getTypeName(MI.getContainingType()); - SmallString<256> TypeName(PointeeName); - TypeName.push_back(' '); - TypeName.append(ClassName); - TypeName.append("::*"); - Name = saveTypeName(TypeName); - } else { - SmallString<256> TypeName; - if (Ptr.isConst()) - TypeName.append("const "); - if (Ptr.isVolatile()) - 
TypeName.append("volatile "); - if (Ptr.isUnaligned()) - TypeName.append("__unaligned "); - - TypeName.append(getTypeName(Ptr.getReferentType())); - - if (Ptr.getMode() == PointerMode::LValueReference) - TypeName.append("&"); - else if (Ptr.getMode() == PointerMode::RValueReference) - TypeName.append("&&"); - else if (Ptr.getMode() == PointerMode::Pointer) - TypeName.append("*"); - - if (!TypeName.empty()) - Name = saveTypeName(TypeName); - } - return Error::success(); -} - -Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, ModifierRecord &Mod) { - uint16_t Mods = static_cast<uint16_t>(Mod.getModifiers()); - - StringRef ModifiedName = getTypeName(Mod.getModifiedType()); - SmallString<256> TypeName; - if (Mods & uint16_t(ModifierOptions::Const)) - TypeName.append("const "); - if (Mods & uint16_t(ModifierOptions::Volatile)) - TypeName.append("volatile "); - if (Mods & uint16_t(ModifierOptions::Unaligned)) - TypeName.append("__unaligned "); - TypeName.append(ModifiedName); - Name = saveTypeName(TypeName); - return Error::success(); -} - -Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, - VFTableShapeRecord &Shape) { - Name = - saveTypeName("<vftable " + utostr(Shape.getEntryCount()) + " methods>"); - return Error::success(); -} - -Error TypeDatabaseVisitor::visitKnownMember(CVMemberRecord &CVR, - NestedTypeRecord &Nested) { - Name = Nested.getName(); - return Error::success(); -} - -Error TypeDatabaseVisitor::visitKnownMember(CVMemberRecord &CVR, - OneMethodRecord &Method) { - Name = Method.getName(); - return Error::success(); -} - -Error TypeDatabaseVisitor::visitKnownMember(CVMemberRecord &CVR, - OverloadedMethodRecord &Method) { - Name = Method.getName(); - return Error::success(); -} - -Error TypeDatabaseVisitor::visitKnownMember(CVMemberRecord &CVR, - DataMemberRecord &Field) { - Name = Field.getName(); - return Error::success(); -} - -Error TypeDatabaseVisitor::visitKnownMember(CVMemberRecord &CVR, - StaticDataMemberRecord &Field) { - Name = Field.getName(); - return Error::success(); -} - -Error TypeDatabaseVisitor::visitKnownMember(CVMemberRecord &CVR, - EnumeratorRecord &Enum) { - Name = Enum.getName(); - return Error::success(); -} - -Error TypeDatabaseVisitor::visitKnownMember(CVMemberRecord &CVR, - BaseClassRecord &Base) { - return Error::success(); -} - -Error TypeDatabaseVisitor::visitKnownMember(CVMemberRecord &CVR, - VirtualBaseClassRecord &VBase) { - return Error::success(); -} - -Error TypeDatabaseVisitor::visitKnownMember(CVMemberRecord &CVR, - ListContinuationRecord &Cont) { - return Error::success(); -} - -Error TypeDatabaseVisitor::visitKnownRecord( - CVType &CVR, UdtModSourceLineRecord &ModSourceLine) { - return Error::success(); -} - -Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, - UdtSourceLineRecord &SourceLine) { - return Error::success(); -} - -Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, BitFieldRecord &BF) { - return Error::success(); -} - -Error TypeDatabaseVisitor::visitKnownRecord( - CVType &CVR, MethodOverloadListRecord &Overloads) { - return Error::success(); -} - -Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, BuildInfoRecord &BI) { - return Error::success(); -} - -Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, LabelRecord &R) { - return Error::success(); -} - -Error TypeDatabaseVisitor::visitKnownMember(CVMemberRecord &CVR, - VFPtrRecord &VFP) { - return Error::success(); -} diff --git a/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp b/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp index 
04b0384d81902..5899667050151 100644
--- a/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp
+++ b/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp
@@ -13,8 +13,6 @@
 #include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
 #include "llvm/DebugInfo/CodeView/Formatters.h"
 #include "llvm/DebugInfo/CodeView/TypeCollection.h"
-#include "llvm/DebugInfo/CodeView/TypeDatabase.h"
-#include "llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h"
 #include "llvm/DebugInfo/CodeView/TypeIndex.h"
 #include "llvm/DebugInfo/CodeView/TypeRecord.h"
 #include "llvm/Support/BinaryByteStream.h"
diff --git a/lib/DebugInfo/CodeView/TypeIndexDiscovery.cpp b/lib/DebugInfo/CodeView/TypeIndexDiscovery.cpp
index 8704cea607867..1226d5be3f3c4 100644
--- a/lib/DebugInfo/CodeView/TypeIndexDiscovery.cpp
+++ b/lib/DebugInfo/CodeView/TypeIndexDiscovery.cpp
@@ -357,6 +357,82 @@ static void discoverTypeIndices(ArrayRef<uint8_t> Content, TypeLeafKind Kind,
   }
 }
 
+static bool discoverTypeIndices(ArrayRef<uint8_t> Content, SymbolKind Kind,
+                                SmallVectorImpl<TiReference> &Refs) {
+  uint32_t Count;
+  // FIXME: In the future it would be nice if we could avoid hardcoding these
+  // values. One idea is to define some structures representing these types
+  // that would allow the use of offsetof().
+  switch (Kind) {
+  case SymbolKind::S_GPROC32:
+  case SymbolKind::S_LPROC32:
+  case SymbolKind::S_GPROC32_ID:
+  case SymbolKind::S_LPROC32_ID:
+  case SymbolKind::S_LPROC32_DPC:
+  case SymbolKind::S_LPROC32_DPC_ID:
+    Refs.push_back({TiRefKind::IndexRef, 24, 1}); // LF_FUNC_ID
+    break;
+  case SymbolKind::S_UDT:
+    Refs.push_back({TiRefKind::TypeRef, 0, 1}); // UDT
+    break;
+  case SymbolKind::S_GDATA32:
+  case SymbolKind::S_LDATA32:
+    Refs.push_back({TiRefKind::TypeRef, 0, 1}); // Type
+    break;
+  case SymbolKind::S_BUILDINFO:
+    Refs.push_back({TiRefKind::IndexRef, 0, 1}); // Compile flags
+    break;
+  case SymbolKind::S_LOCAL:
+    Refs.push_back({TiRefKind::TypeRef, 0, 1}); // Type
+    break;
+  case SymbolKind::S_CONSTANT:
+    Refs.push_back({TiRefKind::TypeRef, 0, 1}); // Type
+    break;
+  case SymbolKind::S_REGREL32:
+    Refs.push_back({TiRefKind::TypeRef, 4, 1}); // Type
+    break;
+  case SymbolKind::S_CALLSITEINFO:
+    Refs.push_back({TiRefKind::TypeRef, 8, 1}); // Call signature
+    break;
+  case SymbolKind::S_CALLERS:
+  case SymbolKind::S_CALLEES:
+    // The record is a count followed by an array of type indices.
+    Count = *reinterpret_cast<const ulittle32_t *>(Content.data());
+    Refs.push_back({TiRefKind::IndexRef, 4, Count}); // Callees
+    break;
+  case SymbolKind::S_INLINESITE:
+    Refs.push_back({TiRefKind::IndexRef, 8, 1}); // ID of inlinee
+    break;
+
+  // Defranges don't have types, just registers and code offsets.
+  case SymbolKind::S_DEFRANGE_REGISTER:
+  case SymbolKind::S_DEFRANGE_REGISTER_REL:
+  case SymbolKind::S_DEFRANGE_FRAMEPOINTER_REL:
+  case SymbolKind::S_DEFRANGE_FRAMEPOINTER_REL_FULL_SCOPE:
+  case SymbolKind::S_DEFRANGE_SUBFIELD_REGISTER:
+  case SymbolKind::S_DEFRANGE_SUBFIELD:
+    break;
+
+  // No type references.
+  case SymbolKind::S_LABEL32:
+  case SymbolKind::S_OBJNAME:
+  case SymbolKind::S_COMPILE:
+  case SymbolKind::S_COMPILE2:
+  case SymbolKind::S_COMPILE3:
+  case SymbolKind::S_BLOCK32:
+  case SymbolKind::S_FRAMEPROC:
+    break;
+  // Scope ending symbols.
+  case SymbolKind::S_END:
+  case SymbolKind::S_INLINESITE_END:
+  case SymbolKind::S_PROC_ID_END:
+    break;
+  default:
+    return false; // Unknown symbol.
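A minimal consumer sketch (illustrative only, not part of this patch; it assumes the {Kind, Offset, Count} layout of TiReference used above and 4-byte little-endian type indices):

  for (const TiReference &Ref : Refs)
    for (uint32_t I = 0; I != Ref.Count; ++I) {
      uint32_t ByteOffset = Ref.Offset + I * sizeof(TypeIndex);
      // A TypeRef slot names a type record, an IndexRef slot an id record;
      // the 32-bit index itself sits at Sym.content().data() + ByteOffset.
    }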
+  }
+  return true;
+}
+
 void llvm::codeview::discoverTypeIndices(const CVType &Type,
                                          SmallVectorImpl<TiReference> &Refs) {
   ::discoverTypeIndices(Type.content(), Type.kind(), Refs);
@@ -369,3 +445,9 @@ void llvm::codeview::discoverTypeIndices(ArrayRef<uint8_t> RecordData,
   TypeLeafKind K = static_cast<TypeLeafKind>(uint16_t(P->RecordKind));
   ::discoverTypeIndices(RecordData.drop_front(sizeof(RecordPrefix)), K, Refs);
 }
+
+bool llvm::codeview::discoverTypeIndices(const CVSymbol &Sym,
+                                         SmallVectorImpl<TiReference> &Refs) {
+  SymbolKind K = Sym.kind();
+  return ::discoverTypeIndices(Sym.content(), K, Refs);
+}
diff --git a/lib/DebugInfo/CodeView/TypeName.cpp b/lib/DebugInfo/CodeView/TypeName.cpp
new file mode 100644
index 0000000000000..2eb8b81862f3c
--- /dev/null
+++ b/lib/DebugInfo/CodeView/TypeName.cpp
@@ -0,0 +1,243 @@
+//===- TypeName.cpp ---------------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/TypeName.h"
+
+#include "llvm/ADT/SmallString.h"
+#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
+#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h"
+#include "llvm/Support/FormatVariadic.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+namespace {
+class TypeNameComputer : public TypeVisitorCallbacks {
+  /// The type collection. Used to calculate names of nested types.
+  TypeCollection &Types;
+  TypeIndex CurrentTypeIndex = TypeIndex::None();
+
+  /// Name of the current type. Only valid before visitTypeEnd.
+  SmallString<256> Name;
+
+public:
+  explicit TypeNameComputer(TypeCollection &Types) : Types(Types) {}
+
+  StringRef name() const { return Name; }
+
+  /// Paired begin/end actions for all types. Receives all record data,
+  /// including the fixed-length record prefix.
+  Error visitTypeBegin(CVType &Record) override;
+  Error visitTypeBegin(CVType &Record, TypeIndex Index) override;
+  Error visitTypeEnd(CVType &Record) override;
+
+#define TYPE_RECORD(EnumName, EnumVal, Name)                                   \
+  Error visitKnownRecord(CVType &CVR, Name##Record &Record) override;
+#define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName)
+#define MEMBER_RECORD(EnumName, EnumVal, Name)
+#include "llvm/DebugInfo/CodeView/CodeViewTypes.def"
+};
+} // namespace
+
+Error TypeNameComputer::visitTypeBegin(CVType &Record) {
+  llvm_unreachable("Must call visitTypeBegin with a TypeIndex!");
+  return Error::success();
+}
+
+Error TypeNameComputer::visitTypeBegin(CVType &Record, TypeIndex Index) {
+  // Reset Name to the empty string. If the visitor sets it, we know it.
+ Name = ""; + CurrentTypeIndex = Index; + return Error::success(); +} + +Error TypeNameComputer::visitTypeEnd(CVType &CVR) { return Error::success(); } + +Error TypeNameComputer::visitKnownRecord(CVType &CVR, + FieldListRecord &FieldList) { + Name = "<field list>"; + return Error::success(); +} + +Error TypeNameComputer::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, + StringIdRecord &String) { + Name = String.getString(); + return Error::success(); +} + +Error TypeNameComputer::visitKnownRecord(CVType &CVR, ArgListRecord &Args) { + auto Indices = Args.getIndices(); + uint32_t Size = Indices.size(); + Name = "("; + for (uint32_t I = 0; I < Size; ++I) { + assert(Indices[I] < CurrentTypeIndex); + + Name.append(Types.getTypeName(Indices[I])); + if (I + 1 != Size) + Name.append(", "); + } + Name.push_back(')'); + return Error::success(); +} + +Error TypeNameComputer::visitKnownRecord(CVType &CVR, + StringListRecord &Strings) { + auto Indices = Strings.getIndices(); + uint32_t Size = Indices.size(); + Name = "\""; + for (uint32_t I = 0; I < Size; ++I) { + Name.append(Types.getTypeName(Indices[I])); + if (I + 1 != Size) + Name.append("\" \""); + } + Name.push_back('\"'); + return Error::success(); +} + +Error TypeNameComputer::visitKnownRecord(CVType &CVR, ClassRecord &Class) { + Name = Class.getName(); + return Error::success(); +} + +Error TypeNameComputer::visitKnownRecord(CVType &CVR, UnionRecord &Union) { + Name = Union.getName(); + return Error::success(); +} + +Error TypeNameComputer::visitKnownRecord(CVType &CVR, EnumRecord &Enum) { + Name = Enum.getName(); + return Error::success(); +} + +Error TypeNameComputer::visitKnownRecord(CVType &CVR, ArrayRecord &AT) { + Name = AT.getName(); + return Error::success(); +} + +Error TypeNameComputer::visitKnownRecord(CVType &CVR, VFTableRecord &VFT) { + Name = VFT.getName(); + return Error::success(); +} + +Error TypeNameComputer::visitKnownRecord(CVType &CVR, MemberFuncIdRecord &Id) { + Name = Id.getName(); + return Error::success(); +} + +Error TypeNameComputer::visitKnownRecord(CVType &CVR, ProcedureRecord &Proc) { + StringRef Ret = Types.getTypeName(Proc.getReturnType()); + StringRef Params = Types.getTypeName(Proc.getArgumentList()); + Name = formatv("{0} {1}", Ret, Params).sstr<256>(); + return Error::success(); +} + +Error TypeNameComputer::visitKnownRecord(CVType &CVR, + MemberFunctionRecord &MF) { + StringRef Ret = Types.getTypeName(MF.getReturnType()); + StringRef Class = Types.getTypeName(MF.getClassType()); + StringRef Params = Types.getTypeName(MF.getArgumentList()); + Name = formatv("{0} {1}::{2}", Ret, Class, Params).sstr<256>(); + return Error::success(); +} + +Error TypeNameComputer::visitKnownRecord(CVType &CVR, FuncIdRecord &Func) { + Name = Func.getName(); + return Error::success(); +} + +Error TypeNameComputer::visitKnownRecord(CVType &CVR, TypeServer2Record &TS) { + Name = TS.getName(); + return Error::success(); +} + +Error TypeNameComputer::visitKnownRecord(CVType &CVR, PointerRecord &Ptr) { + + if (Ptr.isPointerToMember()) { + const MemberPointerInfo &MI = Ptr.getMemberInfo(); + + StringRef Pointee = Types.getTypeName(Ptr.getReferentType()); + StringRef Class = Types.getTypeName(MI.getContainingType()); + Name = formatv("{0} {1}::*", Pointee, Class); + } else { + if (Ptr.isConst()) + Name.append("const "); + if (Ptr.isVolatile()) + Name.append("volatile "); + if (Ptr.isUnaligned()) + Name.append("__unaligned "); + + Name.append(Types.getTypeName(Ptr.getReferentType())); + + if (Ptr.getMode() == PointerMode::LValueReference) 
+      Name.append("&");
+    else if (Ptr.getMode() == PointerMode::RValueReference)
+      Name.append("&&");
+    else if (Ptr.getMode() == PointerMode::Pointer)
+      Name.append("*");
+  }
+  return Error::success();
+}
+
+Error TypeNameComputer::visitKnownRecord(CVType &CVR, ModifierRecord &Mod) {
+  uint16_t Mods = static_cast<uint16_t>(Mod.getModifiers());
+
+  if (Mods & uint16_t(ModifierOptions::Const))
+    Name.append("const ");
+  if (Mods & uint16_t(ModifierOptions::Volatile))
+    Name.append("volatile ");
+  if (Mods & uint16_t(ModifierOptions::Unaligned))
+    Name.append("__unaligned ");
+  Name.append(Types.getTypeName(Mod.getModifiedType()));
+  return Error::success();
+}
+
+Error TypeNameComputer::visitKnownRecord(CVType &CVR,
+                                         VFTableShapeRecord &Shape) {
+  Name = formatv("<vftable {0} methods>", Shape.getEntryCount());
+  return Error::success();
+}
+
+Error TypeNameComputer::visitKnownRecord(
+    CVType &CVR, UdtModSourceLineRecord &ModSourceLine) {
+  return Error::success();
+}
+
+Error TypeNameComputer::visitKnownRecord(CVType &CVR,
+                                         UdtSourceLineRecord &SourceLine) {
+  return Error::success();
+}
+
+Error TypeNameComputer::visitKnownRecord(CVType &CVR, BitFieldRecord &BF) {
+  return Error::success();
+}
+
+Error TypeNameComputer::visitKnownRecord(CVType &CVR,
+                                         MethodOverloadListRecord &Overloads) {
+  return Error::success();
+}
+
+Error TypeNameComputer::visitKnownRecord(CVType &CVR, BuildInfoRecord &BI) {
+  return Error::success();
+}
+
+Error TypeNameComputer::visitKnownRecord(CVType &CVR, LabelRecord &R) {
+  return Error::success();
+}
+
+std::string llvm::codeview::computeTypeName(TypeCollection &Types,
+                                            TypeIndex Index) {
+  TypeNameComputer Computer(Types);
+  CVType Record = Types.getType(Index);
+  if (auto EC = visitTypeRecord(Record, Index, Computer)) {
+    consumeError(std::move(EC));
+    return "<unknown UDT>";
+  }
+  return Computer.name();
+}
diff --git a/lib/DebugInfo/CodeView/TypeTableCollection.cpp b/lib/DebugInfo/CodeView/TypeTableCollection.cpp
index 8d974d522f283..4eca5aeaa0ae3 100644
--- a/lib/DebugInfo/CodeView/TypeTableCollection.cpp
+++ b/lib/DebugInfo/CodeView/TypeTableCollection.cpp
@@ -10,7 +10,7 @@
 #include "llvm/DebugInfo/CodeView/TypeTableCollection.h"
 
 #include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
-#include "llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h"
+#include "llvm/DebugInfo/CodeView/TypeName.h"
 #include "llvm/DebugInfo/CodeView/TypeTableBuilder.h"
 #include "llvm/Support/BinaryByteStream.h"
 #include "llvm/Support/BinaryStreamReader.h"
@@ -18,14 +18,10 @@
 using namespace llvm;
 using namespace llvm::codeview;
 
-static void error(Error &&EC) {
-  assert(!static_cast<bool>(EC));
-  if (EC)
-    consumeError(std::move(EC));
-}
-
 TypeTableCollection::TypeTableCollection(ArrayRef<ArrayRef<uint8_t>> Records)
-    : Records(Records), Database(Records.size()) {}
+    : NameStorage(Allocator), Records(Records) {
+  Names.resize(Records.size());
+}
 
 Optional<TypeIndex> TypeTableCollection::getFirst() {
   if (empty())
@@ -34,50 +30,38 @@ Optional<TypeIndex> TypeTableCollection::getFirst() {
 }
 
 Optional<TypeIndex> TypeTableCollection::getNext(TypeIndex Prev) {
+  assert(contains(Prev));
   ++Prev;
-  assert(Prev.toArrayIndex() <= size());
   if (Prev.toArrayIndex() == size())
     return None;
   return Prev;
 }
 
-void TypeTableCollection::ensureTypeExists(TypeIndex Index) {
-  assert(hasCapacityFor(Index));
-
-  if (Database.contains(Index))
-    return;
-
-  BinaryByteStream Bytes(Records[Index.toArrayIndex()], support::little);
-
-  CVType Type;
-  uint32_t Len;
-
VarStreamArrayExtractor<CVType> Extract; - error(Extract(Bytes, Len, Type)); - - TypeDatabaseVisitor DBV(Database); - error(codeview::visitTypeRecord(Type, Index, DBV)); - assert(Database.contains(Index)); -} - CVType TypeTableCollection::getType(TypeIndex Index) { - ensureTypeExists(Index); - return Database.getTypeRecord(Index); + assert(Index.toArrayIndex() < Records.size()); + ArrayRef<uint8_t> Bytes = Records[Index.toArrayIndex()]; + const RecordPrefix *Prefix = + reinterpret_cast<const RecordPrefix *>(Bytes.data()); + TypeLeafKind Kind = static_cast<TypeLeafKind>(uint16_t(Prefix->RecordKind)); + return CVType(Kind, Bytes); } StringRef TypeTableCollection::getTypeName(TypeIndex Index) { - if (!Index.isSimple()) - ensureTypeExists(Index); - return Database.getTypeName(Index); + if (Index.isNoneType() || Index.isSimple()) + return TypeIndex::simpleTypeName(Index); + + uint32_t I = Index.toArrayIndex(); + if (Names[I].data() == nullptr) { + StringRef Result = NameStorage.save(computeTypeName(*this, Index)); + Names[I] = Result; + } + return Names[I]; } bool TypeTableCollection::contains(TypeIndex Index) { - return Database.contains(Index); + return Index.toArrayIndex() <= size(); } uint32_t TypeTableCollection::size() { return Records.size(); } uint32_t TypeTableCollection::capacity() { return Records.size(); } - -bool TypeTableCollection::hasCapacityFor(TypeIndex Index) const { - return Index.toArrayIndex() < Records.size(); -} diff --git a/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp b/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp index 57eac91f8c192..bb475a669efb2 100644 --- a/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp +++ b/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp @@ -65,46 +65,52 @@ DWARFAbbreviationDeclaration::extract(DataExtractor Data, if (A && F) { Optional<int64_t> V; bool IsImplicitConst = (F == DW_FORM_implicit_const); - if (IsImplicitConst) + if (IsImplicitConst) { V = Data.getSLEB128(OffsetPtr); - else if (auto Size = DWARFFormValue::getFixedByteSize(F)) - V = *Size; - AttributeSpecs.push_back(AttributeSpec(A, F, V)); - if (IsImplicitConst) + AttributeSpecs.push_back(AttributeSpec(A, F, V)); continue; + } // If this abbrevation still has a fixed byte size, then update the // FixedAttributeSize as needed. - if (FixedAttributeSize) { - if (V) - FixedAttributeSize->NumBytes += *V; - else { - switch (F) { - case DW_FORM_addr: - ++FixedAttributeSize->NumAddrs; - break; - - case DW_FORM_ref_addr: - ++FixedAttributeSize->NumRefAddrs; - break; - - case DW_FORM_strp: - case DW_FORM_GNU_ref_alt: - case DW_FORM_GNU_strp_alt: - case DW_FORM_line_strp: - case DW_FORM_sec_offset: - case DW_FORM_strp_sup: - ++FixedAttributeSize->NumDwarfOffsets; - break; - - default: - // Indicate we no longer have a fixed byte size for this - // abbreviation by clearing the FixedAttributeSize optional value - // so it doesn't have a value. - FixedAttributeSize.reset(); - break; - } + switch (F) { + case DW_FORM_addr: + if (FixedAttributeSize) + ++FixedAttributeSize->NumAddrs; + break; + + case DW_FORM_ref_addr: + if (FixedAttributeSize) + ++FixedAttributeSize->NumRefAddrs; + break; + + case DW_FORM_strp: + case DW_FORM_GNU_ref_alt: + case DW_FORM_GNU_strp_alt: + case DW_FORM_line_strp: + case DW_FORM_sec_offset: + case DW_FORM_strp_sup: + if (FixedAttributeSize) + ++FixedAttributeSize->NumDwarfOffsets; + break; + + default: + // The form has a byte size that doesn't depend on Params. + // If it's a fixed size, keep track of it. 
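// Worked example (not from the patch): the param-dependent forms
// (DW_FORM_addr, DW_FORM_ref_addr, the offset-sized forms) were handled
// above, so a default-constructed DWARFFormParams is safe at this step.
// DW_FORM_data1/2/4/8 report 1/2/4/8 bytes, while LEB128-encoded forms
// such as DW_FORM_udata report None and fall through to the reset below.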
+ if (auto Size = + DWARFFormValue::getFixedByteSize(F, DWARFFormParams())) { + V = *Size; + if (FixedAttributeSize) + FixedAttributeSize->NumBytes += *V; + break; } + // Indicate we no longer have a fixed byte size for this + // abbreviation by clearing the FixedAttributeSize optional value + // so it doesn't have a value. + FixedAttributeSize.reset(); + break; } + // Record this attribute and its fixed size if it has one. + AttributeSpecs.push_back(AttributeSpec(A, F, V)); } else if (A == 0 && F == 0) { // We successfully reached the end of this abbreviation declaration // since both attribute and form are zero. @@ -186,7 +192,8 @@ Optional<DWARFFormValue> DWARFAbbreviationDeclaration::getAttributeValue( if (auto FixedSize = Spec.getByteSize(U)) Offset += *FixedSize; else - DWARFFormValue::skipValue(Spec.Form, DebugInfoData, &Offset, &U); + DWARFFormValue::skipValue(Spec.Form, DebugInfoData, &Offset, + U.getFormParams()); ++AttrIndex; } return None; @@ -211,7 +218,8 @@ Optional<int64_t> DWARFAbbreviationDeclaration::AttributeSpec::getByteSize( if (ByteSizeOrValue) return ByteSizeOrValue; Optional<int64_t> S; - auto FixedByteSize = DWARFFormValue::getFixedByteSize(Form, &U); + auto FixedByteSize = + DWARFFormValue::getFixedByteSize(Form, U.getFormParams()); if (FixedByteSize) S = *FixedByteSize; return S; diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp index 9bafcde57f0ae..3814794617503 100644 --- a/lib/DebugInfo/DWARF/DWARFContext.cpp +++ b/lib/DebugInfo/DWARF/DWARFContext.cpp @@ -13,6 +13,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/BinaryFormat/Dwarf.h" #include "llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h" #include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h" #include "llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h" @@ -36,7 +37,6 @@ #include "llvm/Object/RelocVisitor.h" #include "llvm/Support/Casting.h" #include "llvm/Support/DataExtractor.h" -#include "llvm/Support/Debug.h" #include "llvm/Support/Error.h" #include "llvm/Support/Format.h" #include "llvm/Support/MemoryBuffer.h" @@ -44,8 +44,8 @@ #include <algorithm> #include <cstdint> #include <map> -#include <set> #include <string> +#include <tuple> #include <utility> #include <vector> @@ -55,9 +55,9 @@ using namespace object; #define DEBUG_TYPE "dwarf" -typedef DWARFDebugLine::LineTable DWARFLineTable; -typedef DILineInfoSpecifier::FileLineInfoKind FileLineInfoKind; -typedef DILineInfoSpecifier::FunctionNameKind FunctionNameKind; +using DWARFLineTable = DWARFDebugLine::LineTable; +using FileLineInfoKind = DILineInfoSpecifier::FileLineInfoKind; +using FunctionNameKind = DILineInfoSpecifier::FunctionNameKind; uint64_t llvm::getRelocatedValue(const DataExtractor &Data, uint32_t Size, uint32_t *Off, const RelocAddrMap *Relocs, @@ -201,8 +201,7 @@ static void dumpStringOffsetsSection(raw_ostream &OS, StringRef SectionName, } } -void DWARFContext::dump(raw_ostream &OS, DIDumpOptions DumpOpts){ - +void DWARFContext::dump(raw_ostream &OS, DIDumpOptions DumpOpts) { DIDumpType DumpType = DumpOpts.DumpType; bool DumpEH = DumpOpts.DumpEH; bool SummarizeTypes = DumpOpts.SummarizeTypes; @@ -1068,7 +1067,7 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, errs() << "error: failed to compute relocation: " << Name << "\n"; continue; } - llvm::RelocAddrEntry Rel = {SymInfoOrErr->SectionIndex, Val}; + RelocAddrEntry Rel = {SymInfoOrErr->SectionIndex, Val}; Map->insert({Reloc.getOffset(), Rel}); } } diff --git 
a/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp b/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp index cf9fec2b3254c..475cf25b781b4 100644 --- a/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp +++ b/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp @@ -8,7 +8,6 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/DWARF/DWARFDebugFrame.h" - #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Optional.h" @@ -71,7 +70,7 @@ protected: /// An entry may contain CFI instructions. An instruction consists of an /// opcode and an optional sequence of operands. - typedef std::vector<uint64_t> Operands; + using Operands = std::vector<uint64_t>; struct Instruction { Instruction(uint8_t Opcode) : Opcode(Opcode) @@ -518,14 +517,13 @@ static uint64_t readPointer(const DataExtractor &Data, uint32_t &Offset, // noreturn attribute usage in lambdas. Once the support for those // compilers are phased out, we can remove this and return back to // a ReportError lambda: [StartOffset](const char *ErrorMsg). -#define ReportError(ErrorMsg) ReportErrorImpl(StartOffset,ErrorMsg) -static void LLVM_ATTRIBUTE_NORETURN -ReportErrorImpl(uint32_t StartOffset, const char *ErrorMsg) { - std::string Str; - raw_string_ostream OS(Str); - OS << format(ErrorMsg, StartOffset); - OS.flush(); - report_fatal_error(Str); +static void LLVM_ATTRIBUTE_NORETURN ReportError(uint32_t StartOffset, + const char *ErrorMsg) { + std::string Str; + raw_string_ostream OS(Str); + OS << format(ErrorMsg, StartOffset); + OS.flush(); + report_fatal_error(Str); } void DWARFDebugFrame::parse(DataExtractor Data) { @@ -590,13 +588,15 @@ void DWARFDebugFrame::parse(DataExtractor Data) { for (unsigned i = 0, e = AugmentationString.size(); i != e; ++i) { switch (AugmentationString[i]) { default: - ReportError("Unknown augmentation character in entry at %lx"); + ReportError(StartOffset, + "Unknown augmentation character in entry at %lx"); case 'L': LSDAPointerEncoding = Data.getU8(&Offset); break; case 'P': { if (Personality) - ReportError("Duplicate personality in entry at %lx"); + ReportError(StartOffset, + "Duplicate personality in entry at %lx"); PersonalityEncoding = Data.getU8(&Offset); Personality = readPointer(Data, Offset, *PersonalityEncoding); break; @@ -606,7 +606,8 @@ void DWARFDebugFrame::parse(DataExtractor Data) { break; case 'z': if (i) - ReportError("'z' must be the first character at %lx"); + ReportError(StartOffset, + "'z' must be the first character at %lx"); // Parse the augmentation length first. We only parse it if // the string contains a 'z'. AugmentationLength = Data.getULEB128(&Offset); @@ -618,7 +619,7 @@ void DWARFDebugFrame::parse(DataExtractor Data) { if (AugmentationLength.hasValue()) { if (Offset != EndAugmentationOffset) - ReportError("Parsing augmentation data at %lx failed"); + ReportError(StartOffset, "Parsing augmentation data at %lx failed"); AugmentationData = Data.getData().slice(StartAugmentationOffset, EndAugmentationOffset); @@ -645,7 +646,8 @@ void DWARFDebugFrame::parse(DataExtractor Data) { if (IsEH) { // The address size is encoded in the CIE we reference. 
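// (Each FDE inherits its pointer encodings from its parent CIE: the 'L'
// and 'P' augmentation letters parsed above record the LSDA and
// personality encodings. Without a CIE, the InitialLocation read below
// cannot be decoded at all.)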
      if (!Cie)
-        ReportError("Parsing FDE data at %lx failed due to missing CIE");
+        ReportError(StartOffset,
+                    "Parsing FDE data at %lx failed due to missing CIE");
 
       InitialLocation = readPointer(Data, Offset,
                                     Cie->getFDEPointerEncoding());
@@ -665,7 +667,7 @@ void DWARFDebugFrame::parse(DataExtractor Data) {
             readPointer(Data, Offset, Cie->getLSDAPointerEncoding());
 
         if (Offset != EndAugmentationOffset)
-          ReportError("Parsing augmentation data at %lx failed");
+          ReportError(StartOffset, "Parsing augmentation data at %lx failed");
       }
     } else {
       InitialLocation = Data.getAddress(&Offset);
@@ -680,7 +682,7 @@ void DWARFDebugFrame::parse(DataExtractor Data) {
     Entries.back()->parseInstructions(Data, &Offset, EndStructureOffset);
 
     if (Offset != EndStructureOffset)
-      ReportError("Parsing entry instructions at %lx failed");
+      ReportError(StartOffset, "Parsing entry instructions at %lx failed");
   }
 }
diff --git a/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp b/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp
index dbcc64fc0832f..1551974b822ac 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp
@@ -59,7 +59,7 @@ bool DWARFDebugInfoEntry::extractFast(const DWARFUnit &U, uint32_t *OffsetPtr,
         // Attribute byte size if fixed, just add the size to the offset.
         *OffsetPtr += *FixedSize;
       } else if (!DWARFFormValue::skipValue(AttrSpec.Form, DebugInfoData,
-                                            OffsetPtr, &U)) {
+                                            OffsetPtr, U.getFormParams())) {
         // We failed to skip this attribute's value, restore the original offset
         // and return the failure status.
         *OffsetPtr = Offset;
diff --git a/lib/DebugInfo/DWARF/DWARFDebugLine.cpp b/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
index cda3e75fbc3e7..ad5647f3e03d8 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
@@ -9,6 +9,8 @@
 #include "llvm/DebugInfo/DWARF/DWARFDebugLine.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/DebugInfo/DWARF/DWARFContext.h"
 #include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
@@ -26,23 +28,27 @@
 using namespace llvm;
 using namespace dwarf;
 
-typedef DILineInfoSpecifier::FileLineInfoKind FileLineInfoKind;
+using FileLineInfoKind = DILineInfoSpecifier::FileLineInfoKind;
+
 namespace {
+
 struct ContentDescriptor {
   dwarf::LineNumberEntryFormat Type;
   dwarf::Form Form;
 };
-typedef SmallVector<ContentDescriptor, 4> ContentDescriptors;
+
+using ContentDescriptors = SmallVector<ContentDescriptor, 4>;
+
 } // end anonymous namespace
 
 DWARFDebugLine::Prologue::Prologue() { clear(); }
 
 void DWARFDebugLine::Prologue::clear() {
-  TotalLength = Version = PrologueLength = 0;
-  AddressSize = SegSelectorSize = 0;
+  TotalLength = PrologueLength = 0;
+  SegSelectorSize = 0;
   MinInstLength = MaxOpsPerInst = DefaultIsStmt = LineBase = LineRange = 0;
   OpcodeBase = 0;
-  IsDWARF64 = false;
+  FormParams = DWARFFormParams({0, 0, DWARF32});
   StandardOpcodeLengths.clear();
   IncludeDirectories.clear();
   FileNames.clear();
@@ -51,12 +57,13 @@ void DWARFDebugLine::Prologue::clear() {
 void DWARFDebugLine::Prologue::dump(raw_ostream &OS) const {
   OS << "Line table prologue:\n"
      << format(" total_length: 0x%8.8" PRIx64 "\n", TotalLength)
-     << format(" version: %u\n", Version)
-     << format(Version >= 5 ? " address_size: %u\n" : "", AddressSize)
-     << format(Version >= 5 ?
" seg_select_size: %u\n" : "", SegSelectorSize) - << format(" prologue_length: 0x%8.8" PRIx64 "\n", PrologueLength) + << format(" version: %u\n", getVersion()); + if (getVersion() >= 5) + OS << format(" address_size: %u\n", getAddressSize()) + << format(" seg_select_size: %u\n", SegSelectorSize); + OS << format(" prologue_length: 0x%8.8" PRIx64 "\n", PrologueLength) << format(" min_inst_length: %u\n", MinInstLength) - << format(Version >= 4 ? "max_ops_per_inst: %u\n" : "", MaxOpsPerInst) + << format(getVersion() >= 4 ? "max_ops_per_inst: %u\n" : "", MaxOpsPerInst) << format(" default_is_stmt: %u\n", DefaultIsStmt) << format(" line_base: %i\n", LineBase) << format(" line_range: %u\n", LineRange) @@ -137,6 +144,7 @@ parseV5EntryFormat(DataExtractor DebugLineData, uint32_t *OffsetPtr, static bool parseV5DirFileTables(DataExtractor DebugLineData, uint32_t *OffsetPtr, uint64_t EndPrologueOffset, + const DWARFFormParams &FormParams, std::vector<StringRef> &IncludeDirectories, std::vector<DWARFDebugLine::FileNameEntry> &FileNames) { // Get the directory entry description. @@ -159,7 +167,7 @@ parseV5DirFileTables(DataExtractor DebugLineData, uint32_t *OffsetPtr, IncludeDirectories.push_back(Value.getAsCString().getValue()); break; default: - if (!Value.skipValue(DebugLineData, OffsetPtr, nullptr)) + if (!Value.skipValue(DebugLineData, OffsetPtr, FormParams)) return false; } } @@ -211,24 +219,26 @@ bool DWARFDebugLine::Prologue::parse(DataExtractor DebugLineData, clear(); TotalLength = DebugLineData.getU32(OffsetPtr); if (TotalLength == UINT32_MAX) { - IsDWARF64 = true; + FormParams.Format = dwarf::DWARF64; TotalLength = DebugLineData.getU64(OffsetPtr); - } else if (TotalLength > 0xffffff00) { + } else if (TotalLength >= 0xffffff00) { return false; } - Version = DebugLineData.getU16(OffsetPtr); - if (Version < 2) + FormParams.Version = DebugLineData.getU16(OffsetPtr); + if (getVersion() < 2) return false; - if (Version >= 5) { - AddressSize = DebugLineData.getU8(OffsetPtr); + if (getVersion() >= 5) { + FormParams.AddrSize = DebugLineData.getU8(OffsetPtr); + assert(getAddressSize() == DebugLineData.getAddressSize() && + "Line table header and data extractor disagree"); SegSelectorSize = DebugLineData.getU8(OffsetPtr); } PrologueLength = DebugLineData.getUnsigned(OffsetPtr, sizeofPrologueLength()); const uint64_t EndPrologueOffset = PrologueLength + *OffsetPtr; MinInstLength = DebugLineData.getU8(OffsetPtr); - if (Version >= 4) + if (getVersion() >= 4) MaxOpsPerInst = DebugLineData.getU8(OffsetPtr); DefaultIsStmt = DebugLineData.getU8(OffsetPtr); LineBase = DebugLineData.getU8(OffsetPtr); @@ -241,9 +251,9 @@ bool DWARFDebugLine::Prologue::parse(DataExtractor DebugLineData, StandardOpcodeLengths.push_back(OpLen); } - if (Version >= 5) { + if (getVersion() >= 5) { if (!parseV5DirFileTables(DebugLineData, OffsetPtr, EndPrologueOffset, - IncludeDirectories, FileNames)) { + getFormParams(), IncludeDirectories, FileNames)) { fprintf(stderr, "warning: parsing line table prologue at 0x%8.8" PRIx64 " found an invalid directory or file table description at" @@ -333,7 +343,7 @@ void DWARFDebugLine::LineTable::clear() { } DWARFDebugLine::ParsingState::ParsingState(struct LineTable *LT) - : LineTable(LT), RowNumber(0) { + : LineTable(LT) { resetRowAndSequence(); } diff --git a/lib/DebugInfo/DWARF/DWARFFormValue.cpp b/lib/DebugInfo/DWARF/DWARFFormValue.cpp index ed1f5f46dcfb8..861114bde1f2b 100644 --- a/lib/DebugInfo/DWARF/DWARFFormValue.cpp +++ b/lib/DebugInfo/DWARF/DWARFFormValue.cpp @@ -59,48 +59,13 @@ static 
const DWARFFormValue::FormClass DWARF4FormClasses[] = { DWARFFormValue::FC_Flag, // 0x19 DW_FORM_flag_present }; -namespace { - -/// A helper class that can be used in DWARFFormValue.cpp functions that need -/// to know the byte size of DW_FORM values that vary in size depending on the -/// DWARF version, address byte size, or DWARF32 or DWARF64. -class FormSizeHelper { - uint16_t Version; - uint8_t AddrSize; - llvm::dwarf::DwarfFormat Format; - -public: - FormSizeHelper(uint16_t V, uint8_t A, llvm::dwarf::DwarfFormat F) - : Version(V), AddrSize(A), Format(F) {} - - uint8_t getAddressByteSize() const { return AddrSize; } - - uint8_t getRefAddrByteSize() const { - if (Version == 2) - return AddrSize; - return getDwarfOffsetByteSize(); - } - - uint8_t getDwarfOffsetByteSize() const { - switch (Format) { - case dwarf::DwarfFormat::DWARF32: - return 4; - case dwarf::DwarfFormat::DWARF64: - return 8; - } - llvm_unreachable("Invalid Format value"); - } -}; - -} // end anonymous namespace - -template <class T> -static Optional<uint8_t> getFixedByteSize(dwarf::Form Form, const T *U) { +Optional<uint8_t> +DWARFFormValue::getFixedByteSize(dwarf::Form Form, + const DWARFFormParams Params) { switch (Form) { case DW_FORM_addr: - if (U) - return U->getAddressByteSize(); - return None; + assert(Params.Version && Params.AddrSize && "Invalid Params for form"); + return Params.AddrSize; case DW_FORM_block: // ULEB128 length L followed by L bytes. case DW_FORM_block1: // 1 byte length L followed by L bytes. @@ -121,9 +86,8 @@ static Optional<uint8_t> getFixedByteSize(dwarf::Form Form, const T *U) { return None; case DW_FORM_ref_addr: - if (U) - return U->getRefAddrByteSize(); - return None; + assert(Params.Version && Params.AddrSize && "Invalid Params for form"); + return Params.getRefAddrByteSize(); case DW_FORM_flag: case DW_FORM_data1: @@ -138,6 +102,9 @@ static Optional<uint8_t> getFixedByteSize(dwarf::Form Form, const T *U) { case DW_FORM_addrx2: return 2; + case DW_FORM_strx3: + return 3; + case DW_FORM_data4: case DW_FORM_ref4: case DW_FORM_ref_sup4: @@ -151,9 +118,8 @@ static Optional<uint8_t> getFixedByteSize(dwarf::Form Form, const T *U) { case DW_FORM_line_strp: case DW_FORM_sec_offset: case DW_FORM_strp_sup: - if (U) - return U->getDwarfOffsetByteSize(); - return None; + assert(Params.Version && Params.AddrSize && "Invalid Params for form"); + return Params.getDwarfOffsetByteSize(); case DW_FORM_data8: case DW_FORM_ref8: @@ -178,9 +144,9 @@ static Optional<uint8_t> getFixedByteSize(dwarf::Form Form, const T *U) { return None; } -template <class T> -static bool skipFormValue(dwarf::Form Form, const DataExtractor &DebugInfoData, - uint32_t *OffsetPtr, const T *U) { +bool DWARFFormValue::skipValue(dwarf::Form Form, DataExtractor DebugInfoData, + uint32_t *OffsetPtr, + const DWARFFormParams Params) { bool Indirect = false; do { switch (Form) { @@ -240,7 +206,8 @@ static bool skipFormValue(dwarf::Form Form, const DataExtractor &DebugInfoData, case DW_FORM_line_strp: case DW_FORM_GNU_ref_alt: case DW_FORM_GNU_strp_alt: - if (Optional<uint8_t> FixedSize = ::getFixedByteSize(Form, U)) { + if (Optional<uint8_t> FixedSize = + DWARFFormValue::getFixedByteSize(Form, Params)) { *OffsetPtr += *FixedSize; return true; } @@ -274,19 +241,6 @@ static bool skipFormValue(dwarf::Form Form, const DataExtractor &DebugInfoData, return true; } -Optional<uint8_t> DWARFFormValue::getFixedByteSize(dwarf::Form Form, - const DWARFUnit *U) { - return ::getFixedByteSize(Form, U); -} - -Optional<uint8_t> 
-DWARFFormValue::getFixedByteSize(dwarf::Form Form, uint16_t Version, - uint8_t AddrSize, - llvm::dwarf::DwarfFormat Format) { - FormSizeHelper FSH(Version, AddrSize, Format); - return ::getFixedByteSize(Form, &FSH); -} - bool DWARFFormValue::isFormClass(DWARFFormValue::FormClass FC) const { // First, check DWARF4 form classes. if (Form < makeArrayRef(DWARF4FormClasses).size() && @@ -302,6 +256,10 @@ bool DWARFFormValue::isFormClass(DWARFFormValue::FormClass FC) const { case DW_FORM_GNU_str_index: case DW_FORM_GNU_strp_alt: case DW_FORM_strx: + case DW_FORM_strx1: + case DW_FORM_strx2: + case DW_FORM_strx3: + case DW_FORM_strx4: return (FC == FC_String); case DW_FORM_implicit_const: return (FC == FC_Constant); @@ -368,6 +326,9 @@ bool DWARFFormValue::extractValue(const DataExtractor &Data, case DW_FORM_addrx2: Value.uval = Data.getU16(OffsetPtr); break; + case DW_FORM_strx3: + Value.uval = Data.getU24(OffsetPtr); + break; case DW_FORM_data4: case DW_FORM_ref4: case DW_FORM_ref_sup4: @@ -438,24 +399,6 @@ bool DWARFFormValue::extractValue(const DataExtractor &Data, return true; } -bool DWARFFormValue::skipValue(DataExtractor DebugInfoData, uint32_t *OffsetPtr, - const DWARFUnit *U) const { - return DWARFFormValue::skipValue(Form, DebugInfoData, OffsetPtr, U); -} - -bool DWARFFormValue::skipValue(dwarf::Form Form, DataExtractor DebugInfoData, - uint32_t *OffsetPtr, const DWARFUnit *U) { - return skipFormValue(Form, DebugInfoData, OffsetPtr, U); -} - -bool DWARFFormValue::skipValue(dwarf::Form Form, DataExtractor DebugInfoData, - uint32_t *OffsetPtr, uint16_t Version, - uint8_t AddrSize, - llvm::dwarf::DwarfFormat Format) { - FormSizeHelper FSH(Version, AddrSize, Format); - return skipFormValue(Form, DebugInfoData, OffsetPtr, &FSH); -} - void DWARFFormValue::dump(raw_ostream &OS) const { uint64_t UValue = Value.uval; bool CURelativeOffset = false; @@ -545,6 +488,10 @@ void DWARFFormValue::dump(raw_ostream &OS) const { dumpString(OS); break; case DW_FORM_strx: + case DW_FORM_strx1: + case DW_FORM_strx2: + case DW_FORM_strx3: + case DW_FORM_strx4: case DW_FORM_GNU_str_index: OS << format(" indexed (%8.8x) string = ", (uint32_t)UValue); dumpString(OS); @@ -623,7 +570,9 @@ Optional<const char *> DWARFFormValue::getAsCString() const { if (Form == DW_FORM_GNU_strp_alt || U == nullptr) return None; uint32_t Offset = Value.uval; - if (Form == DW_FORM_GNU_str_index || Form == DW_FORM_strx) { + if (Form == DW_FORM_GNU_str_index || Form == DW_FORM_strx || + Form == DW_FORM_strx1 || Form == DW_FORM_strx2 || Form == DW_FORM_strx3 || + Form == DW_FORM_strx4) { uint64_t StrOffset; if (!U->getStringOffsetSectionItem(Offset, StrOffset)) return None; diff --git a/lib/DebugInfo/DWARF/DWARFUnit.cpp b/lib/DebugInfo/DWARF/DWARFUnit.cpp index 09e6a292e5fe1..fd9c7c2b1d46c 100644 --- a/lib/DebugInfo/DWARF/DWARFUnit.cpp +++ b/lib/DebugInfo/DWARF/DWARFUnit.cpp @@ -1,4 +1,4 @@ -//===-- DWARFUnit.cpp -----------------------------------------------------===// +//===- DWARFUnit.cpp ------------------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -8,7 +8,6 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/DWARF/DWARFUnit.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringRef.h" #include "llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h" @@ -17,8 +16,6 @@ #include "llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h" #include "llvm/DebugInfo/DWARF/DWARFDie.h" #include 
"llvm/DebugInfo/DWARF/DWARFFormValue.h" -#include "llvm/Object/ObjectFile.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/DataExtractor.h" #include "llvm/Support/Path.h" #include <algorithm> @@ -26,6 +23,7 @@ #include <cstddef> #include <cstdint> #include <cstdio> +#include <utility> #include <vector> using namespace llvm; @@ -55,8 +53,8 @@ DWARFUnit::DWARFUnit(DWARFContext &DC, const DWARFSection &Section, const DWARFUnitIndex::Entry *IndexEntry) : Context(DC), InfoSection(Section), Abbrev(DA), RangeSection(RS), LineSection(LS), StringSection(SS), StringOffsetSection(SOS), - StringOffsetSectionBase(0), AddrOffsetSection(AOS), isLittleEndian(LE), - isDWO(IsDWO), UnitSection(UnitSection), IndexEntry(IndexEntry) { + AddrOffsetSection(AOS), isLittleEndian(LE), isDWO(IsDWO), + UnitSection(UnitSection), IndexEntry(IndexEntry) { clear(); } @@ -64,11 +62,13 @@ DWARFUnit::~DWARFUnit() = default; bool DWARFUnit::getAddrOffsetSectionItem(uint32_t Index, uint64_t &Result) const { - uint32_t Offset = AddrOffsetSectionBase + Index * AddrSize; - if (AddrOffsetSection->Data.size() < Offset + AddrSize) + uint32_t Offset = AddrOffsetSectionBase + Index * getAddressByteSize(); + if (AddrOffsetSection->Data.size() < Offset + getAddressByteSize()) return false; - DataExtractor DA(AddrOffsetSection->Data, isLittleEndian, AddrSize); - Result = getRelocatedValue(DA, AddrSize, &Offset, &AddrOffsetSection->Relocs); + DataExtractor DA(AddrOffsetSection->Data, isLittleEndian, + getAddressByteSize()); + Result = getRelocatedValue(DA, getAddressByteSize(), &Offset, + &AddrOffsetSection->Relocs); return true; } @@ -94,15 +94,17 @@ uint64_t DWARFUnit::getStringOffsetSectionRelocation(uint32_t Index) const { bool DWARFUnit::extractImpl(DataExtractor debug_info, uint32_t *offset_ptr) { Length = debug_info.getU32(offset_ptr); - Version = debug_info.getU16(offset_ptr); + // FIXME: Support DWARF64. + FormParams.Format = DWARF32; + FormParams.Version = debug_info.getU16(offset_ptr); uint64_t AbbrOffset; - if (Version >= 5) { + if (FormParams.Version >= 5) { UnitType = debug_info.getU8(offset_ptr); - AddrSize = debug_info.getU8(offset_ptr); + FormParams.AddrSize = debug_info.getU8(offset_ptr); AbbrOffset = debug_info.getU32(offset_ptr); } else { AbbrOffset = debug_info.getU32(offset_ptr); - AddrSize = debug_info.getU8(offset_ptr); + FormParams.AddrSize = debug_info.getU8(offset_ptr); } if (IndexEntry) { if (AbbrOffset) @@ -117,14 +119,14 @@ bool DWARFUnit::extractImpl(DataExtractor debug_info, uint32_t *offset_ptr) { } bool LengthOK = debug_info.isValidOffset(getNextUnitOffset() - 1); - bool VersionOK = DWARFContext::isSupportedVersion(Version); - bool AddrSizeOK = AddrSize == 4 || AddrSize == 8; + bool VersionOK = DWARFContext::isSupportedVersion(getVersion()); + bool AddrSizeOK = getAddressByteSize() == 4 || getAddressByteSize() == 8; if (!LengthOK || !VersionOK || !AddrSizeOK) return false; // Keep track of the highest DWARF version we encounter across all units. - Context.setMaxVersionIfGreater(Version); + Context.setMaxVersionIfGreater(getVersion()); Abbrevs = Abbrev->getAbbreviationDeclarationSet(AbbrOffset); return Abbrevs != nullptr; @@ -150,7 +152,8 @@ bool DWARFUnit::extractRangeList(uint32_t RangeListOffset, DWARFDebugRangeList &RangeList) const { // Require that compile unit is extracted. 
assert(!DieArray.empty()); - DataExtractor RangesData(RangeSection->Data, isLittleEndian, AddrSize); + DataExtractor RangesData(RangeSection->Data, isLittleEndian, + getAddressByteSize()); uint32_t ActualRangeListOffset = RangeSectionBase + RangeListOffset; return RangeList.extract(RangesData, &ActualRangeListOffset, RangeSection->Relocs); @@ -159,9 +162,8 @@ bool DWARFUnit::extractRangeList(uint32_t RangeListOffset, void DWARFUnit::clear() { Offset = 0; Length = 0; - Version = 0; Abbrevs = nullptr; - AddrSize = 0; + FormParams = DWARFFormParams({0, 0, DWARF32}); BaseAddr = 0; RangeSectionBase = 0; AddrOffsetSectionBase = 0; diff --git a/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/lib/DebugInfo/DWARF/DWARFVerifier.cpp index a6240fb60143c..41907e5705637 100644 --- a/lib/DebugInfo/DWARF/DWARFVerifier.cpp +++ b/lib/DebugInfo/DWARF/DWARFVerifier.cpp @@ -279,7 +279,6 @@ bool DWARFVerifier::handleDebugLine() { bool DWARFVerifier::handleAppleNames() { NumAppleNamesErrors = 0; - OS << "Verifying .apple_names...\n"; DataExtractor AppleNamesSection(DCtx.getAppleNamesSection().Data, DCtx.isLittleEndian(), 0); @@ -288,10 +287,11 @@ bool DWARFVerifier::handleAppleNames() { DCtx.getAppleNamesSection().Relocs); if (!AppleNames.extract()) { - OS << "error: cannot extract .apple_names accelerator table\n"; - return false; + return true; } + OS << "Verifying .apple_names...\n"; + // Verify that all buckets have a valid hash index or are empty uint32_t NumBuckets = AppleNames.getNumBuckets(); uint32_t NumHashes = AppleNames.getNumHashes(); diff --git a/lib/DebugInfo/MSF/MSFBuilder.cpp b/lib/DebugInfo/MSF/MSFBuilder.cpp index 5b1b5d8dc4d55..0f4f785abf55a 100644 --- a/lib/DebugInfo/MSF/MSFBuilder.cpp +++ b/lib/DebugInfo/MSF/MSFBuilder.cpp @@ -1,3 +1,4 @@ +//===- MSFBuilder.cpp -----------------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -6,22 +7,30 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/ArrayRef.h" #include "llvm/DebugInfo/MSF/MSFBuilder.h" #include "llvm/DebugInfo/MSF/MSFError.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/Error.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <cstring> +#include <memory> +#include <utility> +#include <vector> using namespace llvm; using namespace llvm::msf; using namespace llvm::support; -namespace { -const uint32_t kSuperBlockBlock = 0; -const uint32_t kFreePageMap0Block = 1; -const uint32_t kFreePageMap1Block = 2; -const uint32_t kNumReservedPages = 3; +static const uint32_t kSuperBlockBlock = 0; +static const uint32_t kFreePageMap0Block = 1; +static const uint32_t kFreePageMap1Block = 2; +static const uint32_t kNumReservedPages = 3; -const uint32_t kDefaultFreePageMap = kFreePageMap0Block; -const uint32_t kDefaultBlockMapAddr = kNumReservedPages; -} +static const uint32_t kDefaultFreePageMap = kFreePageMap0Block; +static const uint32_t kDefaultBlockMapAddr = kNumReservedPages; MSFBuilder::MSFBuilder(uint32_t BlockSize, uint32_t MinBlockCount, bool CanGrow, BumpPtrAllocator &Allocator) @@ -263,7 +272,7 @@ Expected<MSFLayout> MSFBuilder::build() { // The stream sizes should be re-allocated as a stable pointer and the stream // map should have each of its entries allocated as a separate stable pointer. 
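// "Stable" here means allocated from the caller-supplied BumpPtrAllocator,
// so the ArrayRefs handed out in the returned MSFLayout remain valid after
// the builder itself is destroyed.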
- if (StreamData.size() > 0) { + if (!StreamData.empty()) { ulittle32_t *Sizes = Allocator.Allocate<ulittle32_t>(StreamData.size()); L.StreamSizes = ArrayRef<ulittle32_t>(Sizes, StreamData.size()); L.StreamMap.resize(StreamData.size()); diff --git a/lib/DebugInfo/MSF/MSFCommon.cpp b/lib/DebugInfo/MSF/MSFCommon.cpp index fdab7884646ec..1facf5efb4bbb 100644 --- a/lib/DebugInfo/MSF/MSFCommon.cpp +++ b/lib/DebugInfo/MSF/MSFCommon.cpp @@ -1,4 +1,4 @@ -//===- MSFCommon.cpp - Common types and functions for MSF files -*- C++ -*-===// +//===- MSFCommon.cpp - Common types and functions for MSF files -----------===// // // The LLVM Compiler Infrastructure // @@ -9,6 +9,10 @@ #include "llvm/DebugInfo/MSF/MSFCommon.h" #include "llvm/DebugInfo/MSF/MSFError.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/Error.h" +#include <cstdint> +#include <cstring> using namespace llvm; using namespace llvm::msf; diff --git a/lib/DebugInfo/MSF/MappedBlockStream.cpp b/lib/DebugInfo/MSF/MappedBlockStream.cpp index faf2442bc94bb..e45f4ae0ed940 100644 --- a/lib/DebugInfo/MSF/MappedBlockStream.cpp +++ b/lib/DebugInfo/MSF/MappedBlockStream.cpp @@ -8,23 +8,33 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/MSF/MappedBlockStream.h" - -#include "llvm/DebugInfo/MSF/IMSFFile.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/DebugInfo/MSF/MSFCommon.h" #include "llvm/DebugInfo/MSF/MSFStreamLayout.h" -#include "llvm/Support/BinaryStreamError.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/MathExtras.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <cstring> +#include <utility> +#include <vector> using namespace llvm; using namespace llvm::msf; namespace { + template <typename Base> class MappedBlockStreamImpl : public Base { public: template <typename... Args> MappedBlockStreamImpl(Args &&... Params) : Base(std::forward<Args>(Params)...) 
{} }; -} + +} // end anonymous namespace static void initializeFpmStreamLayout(const MSFLayout &Layout, MSFStreamLayout &FpmLayout) { @@ -39,7 +49,8 @@ static void initializeFpmStreamLayout(const MSFLayout &Layout, FpmLayout.Length = msf::getFullFpmByteSize(Layout); } -typedef std::pair<uint32_t, uint32_t> Interval; +using Interval = std::pair<uint32_t, uint32_t>; + static Interval intersect(const Interval &I1, const Interval &I2) { return std::make_pair(std::max(I1.first, I2.first), std::min(I1.second, I2.second)); @@ -214,7 +225,7 @@ bool MappedBlockStream::tryReadContiguously(uint32_t Offset, uint32_t Size, uint32_t OffsetInBlock = Offset % BlockSize; uint32_t BytesFromFirstBlock = std::min(Size, BlockSize - OffsetInBlock); uint32_t NumAdditionalBlocks = - llvm::alignTo(Size - BytesFromFirstBlock, BlockSize) / BlockSize; + alignTo(Size - BytesFromFirstBlock, BlockSize) / BlockSize; uint32_t RequiredContiguousBlocks = NumAdditionalBlocks + 1; uint32_t E = StreamLayout.Blocks[BlockNum]; diff --git a/lib/DebugInfo/PDB/DIA/DIASession.cpp b/lib/DebugInfo/PDB/DIA/DIASession.cpp index ef47b92b4f2f3..ef9390cda3127 100644 --- a/lib/DebugInfo/PDB/DIA/DIASession.cpp +++ b/lib/DebugInfo/PDB/DIA/DIASession.cpp @@ -151,7 +151,7 @@ void DIASession::setLoadAddress(uint64_t Address) { Session->put_loadAddress(Address); } -std::unique_ptr<PDBSymbolExe> DIASession::getGlobalScope() const { +std::unique_ptr<PDBSymbolExe> DIASession::getGlobalScope() { CComPtr<IDiaSymbol> GlobalScope; if (S_OK != Session->get_globalScope(&GlobalScope)) return nullptr; diff --git a/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp b/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp index 81a9d3eeec619..745dd742aadc3 100644 --- a/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp +++ b/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp @@ -51,6 +51,7 @@ DbiModuleDescriptorBuilder::DbiModuleDescriptorBuilder(StringRef ModuleName, uint32_t ModIndex, msf::MSFBuilder &Msf) : MSF(Msf), ModuleName(ModuleName) { + ::memset(&Layout, 0, sizeof(Layout)); Layout.Mod = ModIndex; } @@ -102,6 +103,7 @@ template <typename T> struct Foo { template <typename T> Foo<T> makeFoo(T &&t) { return Foo<T>(std::move(t)); } void DbiModuleDescriptorBuilder::finalize() { + Layout.SC.ModuleIndex = Layout.Mod; Layout.FileNameOffs = 0; // TODO: Fix this Layout.Flags = 0; // TODO: Fix this Layout.C11Bytes = 0; @@ -182,3 +184,9 @@ void DbiModuleDescriptorBuilder::addDebugSubsection( C13Builders.push_back(llvm::make_unique<DebugSubsectionRecordBuilder>( std::move(Subsection), CodeViewContainer::Pdb)); } + +void DbiModuleDescriptorBuilder::addDebugSubsection( + const DebugSubsectionRecord &SubsectionContents) { + C13Builders.push_back(llvm::make_unique<DebugSubsectionRecordBuilder>( + SubsectionContents, CodeViewContainer::Pdb)); +} diff --git a/lib/DebugInfo/PDB/Native/DbiStream.cpp b/lib/DebugInfo/PDB/Native/DbiStream.cpp index 24322d942facc..a1f0671dec3e6 100644 --- a/lib/DebugInfo/PDB/Native/DbiStream.cpp +++ b/lib/DebugInfo/PDB/Native/DbiStream.cpp @@ -99,29 +99,27 @@ Error DbiStream::reload() { return make_error<RawError>(raw_error_code::corrupt_file, "DBI type server substream not aligned."); - BinaryStreamRef ModInfoSubstream; - BinaryStreamRef FileInfoSubstream; - if (auto EC = - Reader.readStreamRef(ModInfoSubstream, Header->ModiSubstreamSize)) + if (auto EC = Reader.readSubstream(ModiSubstream, Header->ModiSubstreamSize)) return EC; - if (auto EC = Reader.readStreamRef(SecContrSubstream, + if (auto EC = 
Reader.readSubstream(SecContrSubstream, Header->SecContrSubstreamSize)) return EC; - if (auto EC = Reader.readStreamRef(SecMapSubstream, Header->SectionMapSize)) + if (auto EC = Reader.readSubstream(SecMapSubstream, Header->SectionMapSize)) return EC; - if (auto EC = Reader.readStreamRef(FileInfoSubstream, Header->FileInfoSize)) + if (auto EC = Reader.readSubstream(FileInfoSubstream, Header->FileInfoSize)) return EC; if (auto EC = - Reader.readStreamRef(TypeServerMapSubstream, Header->TypeServerSize)) + Reader.readSubstream(TypeServerMapSubstream, Header->TypeServerSize)) return EC; - if (auto EC = Reader.readStreamRef(ECSubstream, Header->ECSubstreamSize)) + if (auto EC = Reader.readSubstream(ECSubstream, Header->ECSubstreamSize)) return EC; if (auto EC = Reader.readArray( DbgStreams, Header->OptionalDbgHdrSize / sizeof(ulittle16_t))) return EC; - if (auto EC = Modules.initialize(ModInfoSubstream, FileInfoSubstream)) + if (auto EC = Modules.initialize(ModiSubstream.StreamData, + FileInfoSubstream.StreamData)) return EC; if (auto EC = initializeSectionContributionData()) @@ -137,8 +135,8 @@ Error DbiStream::reload() { return make_error<RawError>(raw_error_code::corrupt_file, "Found unexpected bytes in DBI Stream."); - if (ECSubstream.getLength() > 0) { - BinaryStreamReader ECReader(ECSubstream); + if (!ECSubstream.empty()) { + BinaryStreamReader ECReader(ECSubstream.StreamData); if (auto EC = ECNames.reload(ECReader)) return EC; } @@ -228,10 +226,10 @@ void DbiStream::visitSectionContributions( } Error DbiStream::initializeSectionContributionData() { - if (SecContrSubstream.getLength() == 0) + if (SecContrSubstream.empty()) return Error::success(); - BinaryStreamReader SCReader(SecContrSubstream); + BinaryStreamReader SCReader(SecContrSubstream.StreamData); if (auto EC = SCReader.readEnum(SectionContribVersion)) return EC; @@ -302,11 +300,33 @@ Error DbiStream::initializeFpoRecords() { return Error::success(); } +BinarySubstreamRef DbiStream::getSectionContributionData() const { + return SecContrSubstream; +} + +BinarySubstreamRef DbiStream::getSecMapSubstreamData() const { + return SecMapSubstream; +} + +BinarySubstreamRef DbiStream::getModiSubstreamData() const { + return ModiSubstream; +} + +BinarySubstreamRef DbiStream::getFileInfoSubstreamData() const { + return FileInfoSubstream; +} + +BinarySubstreamRef DbiStream::getTypeServerMapSubstreamData() const { + return TypeServerMapSubstream; +} + +BinarySubstreamRef DbiStream::getECSubstreamData() const { return ECSubstream; } + Error DbiStream::initializeSectionMapData() { - if (SecMapSubstream.getLength() == 0) + if (SecMapSubstream.empty()) return Error::success(); - BinaryStreamReader SMReader(SecMapSubstream); + BinaryStreamReader SMReader(SecMapSubstream.StreamData); const SecMapHeader *Header; if (auto EC = SMReader.readObject(Header)) return EC; diff --git a/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp b/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp index e7304b444f23f..aad247ea185f2 100644 --- a/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp +++ b/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp @@ -90,10 +90,14 @@ Error DbiStreamBuilder::addModuleSourceFile(StringRef Module, StringRef File) { if (ModIter == ModiMap.end()) return make_error<RawError>(raw_error_code::no_entry, "The specified module was not found"); + return addModuleSourceFile(*ModIter->second, File); +} + +Error DbiStreamBuilder::addModuleSourceFile(DbiModuleDescriptorBuilder &Module, + StringRef File) { uint32_t Index = SourceFileNames.size(); 
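// Note that insert() leaves an existing entry untouched, so a file that is
// added twice keeps the index from its first insertion; the new Index is
// only consumed for names not seen before.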
SourceFileNames.insert(std::make_pair(File, Index)); - auto &ModEntry = *ModIter; - ModEntry.second->addSourceFile(File); + Module.addSourceFile(File); return Error::success(); } @@ -233,6 +237,7 @@ Error DbiStreamBuilder::finalize() { return EC; DbiStreamHeader *H = Allocator.Allocate<DbiStreamHeader>(); + ::memset(H, 0, sizeof(DbiStreamHeader)); H->VersionHeader = *VerHeader; H->VersionSignature = -1; H->Age = Age; diff --git a/lib/DebugInfo/PDB/Native/InfoStream.cpp b/lib/DebugInfo/PDB/Native/InfoStream.cpp index a3979d480bf45..21b66b3e7bcff 100644 --- a/lib/DebugInfo/PDB/Native/InfoStream.cpp +++ b/lib/DebugInfo/PDB/Native/InfoStream.cpp @@ -57,6 +57,10 @@ Error InfoStream::reload() { uint32_t NewOffset = Reader.getOffset(); NamedStreamMapByteSize = NewOffset - Offset; + Reader.setOffset(Offset); + if (auto EC = Reader.readSubstream(SubNamedStreams, NamedStreamMapByteSize)) + return EC; + bool Stop = false; while (!Stop && !Reader.empty()) { PdbRaw_FeatureSig Sig; @@ -129,3 +133,7 @@ ArrayRef<PdbRaw_FeatureSig> InfoStream::getFeatureSignatures() const { const NamedStreamMap &InfoStream::getNamedStreams() const { return NamedStreams; } + +BinarySubstreamRef InfoStream::getNamedStreamsBuffer() const { + return SubNamedStreams; +} diff --git a/lib/DebugInfo/PDB/Native/ModuleDebugStream.cpp b/lib/DebugInfo/PDB/Native/ModuleDebugStream.cpp index 4186f2eb6ba01..83c56574a16e5 100644 --- a/lib/DebugInfo/PDB/Native/ModuleDebugStream.cpp +++ b/lib/DebugInfo/PDB/Native/ModuleDebugStream.cpp @@ -47,15 +47,19 @@ Error ModuleDebugStreamRef::reload() { if (auto EC = Reader.readInteger(Signature)) return EC; - if (auto EC = Reader.readArray(SymbolsSubstream, SymbolSize - 4)) + if (auto EC = Reader.readSubstream(SymbolsSubstream, SymbolSize - 4)) return EC; - - if (auto EC = Reader.readStreamRef(C11LinesSubstream, C11Size)) + if (auto EC = Reader.readSubstream(C11LinesSubstream, C11Size)) + return EC; + if (auto EC = Reader.readSubstream(C13LinesSubstream, C13Size)) return EC; - if (auto EC = Reader.readStreamRef(C13LinesSubstream, C13Size)) + + BinaryStreamReader SymbolReader(SymbolsSubstream.StreamData); + if (auto EC = + SymbolReader.readArray(SymbolArray, SymbolReader.bytesRemaining())) return EC; - BinaryStreamReader SubsectionsReader(C13LinesSubstream); + BinaryStreamReader SubsectionsReader(C13LinesSubstream.StreamData); if (auto EC = SubsectionsReader.readArray(Subsections, SubsectionsReader.bytesRemaining())) return EC; @@ -63,7 +67,7 @@ Error ModuleDebugStreamRef::reload() { uint32_t GlobalRefsSize; if (auto EC = Reader.readInteger(GlobalRefsSize)) return EC; - if (auto EC = Reader.readStreamRef(GlobalRefsSubstream, GlobalRefsSize)) + if (auto EC = Reader.readSubstream(GlobalRefsSubstream, GlobalRefsSize)) return EC; if (Reader.bytesRemaining() > 0) return make_error<RawError>(raw_error_code::corrupt_file, @@ -72,9 +76,25 @@ Error ModuleDebugStreamRef::reload() { return Error::success(); } +BinarySubstreamRef ModuleDebugStreamRef::getSymbolsSubstream() const { + return SymbolsSubstream; +} + +BinarySubstreamRef ModuleDebugStreamRef::getC11LinesSubstream() const { + return C11LinesSubstream; +} + +BinarySubstreamRef ModuleDebugStreamRef::getC13LinesSubstream() const { + return C13LinesSubstream; +} + +BinarySubstreamRef ModuleDebugStreamRef::getGlobalRefsSubstream() const { + return GlobalRefsSubstream; +} + iterator_range<codeview::CVSymbolArray::Iterator> ModuleDebugStreamRef::symbols(bool *HadError) const { - return make_range(SymbolsSubstream.begin(HadError), SymbolsSubstream.end()); + 
return make_range(SymbolArray.begin(HadError), SymbolArray.end()); } llvm::iterator_range<ModuleDebugStreamRef::DebugSubsectionIterator> @@ -83,7 +103,7 @@ ModuleDebugStreamRef::subsections() const { } bool ModuleDebugStreamRef::hasDebugSubsections() const { - return C13LinesSubstream.getLength() > 0; + return !C13LinesSubstream.empty(); } Error ModuleDebugStreamRef::commit() { return Error::success(); } diff --git a/lib/DebugInfo/PDB/Native/NamedStreamMap.cpp b/lib/DebugInfo/PDB/Native/NamedStreamMap.cpp index c7ba32b82bc6b..4f90cd9cd8ac0 100644 --- a/lib/DebugInfo/PDB/Native/NamedStreamMap.cpp +++ b/lib/DebugInfo/PDB/Native/NamedStreamMap.cpp @@ -23,6 +23,14 @@ using namespace llvm; using namespace llvm::pdb; +// FIXME: This shouldn't be necessary, but if we insert the strings in any +// other order, cvdump cannot read the generated name map. This suggests that +// we may be using the wrong hash function. A closer inspection of the cvdump +// source code may reveal something, but for now this at least makes us work, +// even if only by accident. +static constexpr const char *OrderedStreamNames[] = {"/LinkInfo", "/names", + "/src/headerblock"}; + NamedStreamMap::NamedStreamMap() = default; Error NamedStreamMap::load(BinaryStreamReader &Stream) { @@ -73,9 +81,10 @@ Error NamedStreamMap::commit(BinaryStreamWriter &Writer) const { if (auto EC = Writer.writeInteger(FinalizedInfo->StringDataBytes)) return EC; - // Now all of the string data itself. - for (const auto &Item : Mapping) { - if (auto EC = Writer.writeCString(Item.getKey())) + for (const auto &Name : OrderedStreamNames) { + auto Item = Mapping.find(Name); + assert(Item != Mapping.end()); + if (auto EC = Writer.writeCString(Item->getKey())) return EC; } @@ -93,9 +102,12 @@ uint32_t NamedStreamMap::finalize() { // Build the finalized hash table. FinalizedHashTable.clear(); FinalizedInfo.emplace(); - for (const auto &Item : Mapping) { - FinalizedHashTable.set(FinalizedInfo->StringDataBytes, Item.getValue()); - FinalizedInfo->StringDataBytes += Item.getKeyLength() + 1; + + for (const auto &Name : OrderedStreamNames) { + auto Item = Mapping.find(Name); + assert(Item != Mapping.end()); + FinalizedHashTable.set(FinalizedInfo->StringDataBytes, Item->getValue()); + FinalizedInfo->StringDataBytes += Item->getKeyLength() + 1; } // Number of bytes of string data. 
diff --git a/lib/DebugInfo/PDB/Native/NativeCompilandSymbol.cpp b/lib/DebugInfo/PDB/Native/NativeCompilandSymbol.cpp
index 77f832582f824..180c169ec209c 100644
--- a/lib/DebugInfo/PDB/Native/NativeCompilandSymbol.cpp
+++ b/lib/DebugInfo/PDB/Native/NativeCompilandSymbol.cpp
@@ -9,17 +9,24 @@
 #include "llvm/DebugInfo/PDB/Native/NativeCompilandSymbol.h"
 
+#include "llvm/ADT/STLExtras.h"
+
 namespace llvm {
 namespace pdb {
 
 NativeCompilandSymbol::NativeCompilandSymbol(NativeSession &Session,
+                                             uint32_t SymbolId,
                                             DbiModuleDescriptor MI)
-    : NativeRawSymbol(Session), Module(MI) {}
+    : NativeRawSymbol(Session, SymbolId), Module(MI) {}
 
 PDB_SymType NativeCompilandSymbol::getSymTag() const {
   return PDB_SymType::Compiland;
 }
 
+std::unique_ptr<NativeRawSymbol> NativeCompilandSymbol::clone() const {
+  return llvm::make_unique<NativeCompilandSymbol>(Session, SymbolId, Module);
+}
+
 bool NativeCompilandSymbol::isEditAndContinueEnabled() const {
   return Module.hasECInfo();
 }
diff --git a/lib/DebugInfo/PDB/Native/NativeEnumModules.cpp b/lib/DebugInfo/PDB/Native/NativeEnumModules.cpp
index 97319fd77d117..c23120041164a 100644
--- a/lib/DebugInfo/PDB/Native/NativeEnumModules.cpp
+++ b/lib/DebugInfo/PDB/Native/NativeEnumModules.cpp
@@ -34,7 +34,7 @@ NativeEnumModules::getChildAtIndex(uint32_t Index) const {
    return nullptr;
   return std::unique_ptr<PDBSymbol>(new PDBSymbolCompiland(
      Session, std::unique_ptr<IPDBRawSymbol>(new NativeCompilandSymbol(
-                   Session, Modules.getModuleDescriptor(Index)))));
+                   Session, 0, Modules.getModuleDescriptor(Index)))));
 }
 
 std::unique_ptr<PDBSymbol> NativeEnumModules::getNext() {
diff --git a/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp b/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp
index bb52560be167a..6206155b9fb64 100644
--- a/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp
+++ b/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp
@@ -9,6 +9,7 @@
 #include "llvm/DebugInfo/PDB/Native/NativeExeSymbol.h"
 
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/DebugInfo/PDB/Native/DbiStream.h"
 #include "llvm/DebugInfo/PDB/Native/InfoStream.h"
 #include "llvm/DebugInfo/PDB/Native/NativeEnumModules.h"
@@ -17,8 +18,12 @@
 namespace llvm {
 namespace pdb {
 
-NativeExeSymbol::NativeExeSymbol(NativeSession &Session)
-    : NativeRawSymbol(Session), File(Session.getPDBFile()) {}
+NativeExeSymbol::NativeExeSymbol(NativeSession &Session, uint32_t SymbolId)
+    : NativeRawSymbol(Session, SymbolId), File(Session.getPDBFile()) {}
+
+std::unique_ptr<NativeRawSymbol> NativeExeSymbol::clone() const {
+  return llvm::make_unique<NativeExeSymbol>(Session, SymbolId);
+}
 
 std::unique_ptr<IPDBEnumSymbols>
 NativeExeSymbol::findChildren(PDB_SymType Type) const {
diff --git a/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp b/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp
index 70968d4330b07..ed6db63edbabf 100644
--- a/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp
+++ b/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp
@@ -22,8 +22,8 @@
 using namespace llvm;
 using namespace llvm::pdb;
 
-NativeRawSymbol::NativeRawSymbol(NativeSession &PDBSession)
-    : Session(PDBSession) {}
+NativeRawSymbol::NativeRawSymbol(NativeSession &PDBSession, uint32_t SymbolId)
+    : Session(PDBSession), SymbolId(SymbolId) {}
 
 void NativeRawSymbol::dump(raw_ostream &OS, int Indent) const {}
 
@@ -253,9 +253,7 @@ uint32_t NativeRawSymbol::getSubTypeId() const {
 
 std::string NativeRawSymbol::getSymbolsFileName() const { return ""; }
 
-uint32_t NativeRawSymbol::getSymIndexId() const {
-  return 0;
-}
+uint32_t NativeRawSymbol::getSymIndexId() const { return SymbolId; }
 
 uint32_t
 NativeRawSymbol::getTargetOffset() const {
   return 0;
diff --git a/lib/DebugInfo/PDB/Native/NativeSession.cpp b/lib/DebugInfo/PDB/Native/NativeSession.cpp
index 7e6843bceb7db..3ab381e76e628 100644
--- a/lib/DebugInfo/PDB/Native/NativeSession.cpp
+++ b/lib/DebugInfo/PDB/Native/NativeSession.cpp
@@ -70,12 +70,11 @@ uint64_t NativeSession::getLoadAddress() const { return 0; }
 
 void NativeSession::setLoadAddress(uint64_t Address) {}
 
-std::unique_ptr<PDBSymbolExe> NativeSession::getGlobalScope() const {
-  auto RawSymbol =
-      llvm::make_unique<NativeExeSymbol>(const_cast<NativeSession &>(*this));
+std::unique_ptr<PDBSymbolExe> NativeSession::getGlobalScope() {
+  auto RawSymbol = llvm::make_unique<NativeExeSymbol>(*this, 0);
   auto PdbSymbol(PDBSymbol::create(*this, std::move(RawSymbol)));
   std::unique_ptr<PDBSymbolExe> ExeSymbol(
-       static_cast<PDBSymbolExe *>(PdbSymbol.release()));
+      static_cast<PDBSymbolExe *>(PdbSymbol.release()));
   return ExeSymbol;
 }
diff --git a/lib/DebugInfo/PDB/Native/PDBFile.cpp b/lib/DebugInfo/PDB/Native/PDBFile.cpp
index a9597cdf4c4d3..4f6ebb0cb3428 100644
--- a/lib/DebugInfo/PDB/Native/PDBFile.cpp
+++ b/lib/DebugInfo/PDB/Native/PDBFile.cpp
@@ -230,6 +230,14 @@ ArrayRef<support::ulittle32_t> PDBFile::getDirectoryBlockArray() const {
   return ContainerLayout.DirectoryBlocks;
 }
 
+MSFStreamLayout PDBFile::getStreamLayout(uint32_t StreamIdx) const {
+  MSFStreamLayout Result;
+  auto Blocks = getStreamBlockList(StreamIdx);
+  Result.Blocks.assign(Blocks.begin(), Blocks.end());
+  Result.Length = getStreamByteSize(StreamIdx);
+  return Result;
+}
+
 Expected<GlobalsStream &> PDBFile::getPDBGlobalsStream() {
   if (!Globals) {
    auto DbiS = getPDBDbiStream();
diff --git a/lib/DebugInfo/PDB/Native/TpiStream.cpp b/lib/DebugInfo/PDB/Native/TpiStream.cpp
index 67c803d3124ec..f917ef91f6396 100644
--- a/lib/DebugInfo/PDB/Native/TpiStream.cpp
+++ b/lib/DebugInfo/PDB/Native/TpiStream.cpp
@@ -66,7 +66,13 @@ Error TpiStream::reload() {
                            "TPI Stream Invalid number of hash buckets.");
 
   // The actual type records themselves come from this stream
-  if (auto EC = Reader.readArray(TypeRecords, Header->TypeRecordBytes))
+  if (auto EC =
+          Reader.readSubstream(TypeRecordsSubstream, Header->TypeRecordBytes))
+    return EC;
+
+  BinaryStreamReader RecordReader(TypeRecordsSubstream.StreamData);
+  if (auto EC =
+          RecordReader.readArray(TypeRecords, TypeRecordsSubstream.size()))
    return EC;
 
   // Hash indices, hash values, etc come from the hash stream.
@@ -135,6 +141,10 @@ uint16_t TpiStream::getTypeHashStreamAuxIndex() const {
 uint32_t TpiStream::getNumHashBuckets() const { return Header->NumHashBuckets; }
 uint32_t TpiStream::getHashKeySize() const { return Header->HashKeySize; }
 
+BinarySubstreamRef TpiStream::getTypeRecordsSubstream() const {
+  return TypeRecordsSubstream;
+}
+
 FixedStreamArray<support::ulittle32_t> TpiStream::getHashValues() const {
   return HashValues;
 }
diff --git a/lib/ExecutionEngine/Orc/OrcCBindings.cpp b/lib/ExecutionEngine/Orc/OrcCBindings.cpp
index 8dcd49aaab5bb..5fe259f80b6fb 100644
--- a/lib/ExecutionEngine/Orc/OrcCBindings.cpp
+++ b/lib/ExecutionEngine/Orc/OrcCBindings.cpp
@@ -12,6 +12,24 @@
 
 using namespace llvm;
 
+LLVMSharedModuleRef LLVMOrcMakeSharedModule(LLVMModuleRef Mod) {
+  return wrap(new std::shared_ptr<Module>(unwrap(Mod)));
+}
+
+void LLVMOrcDisposeSharedModuleRef(LLVMSharedModuleRef SharedMod) {
+  delete unwrap(SharedMod);
+}
+
+LLVMSharedObjectBufferRef
+LLVMOrcMakeSharedObjectBuffer(LLVMMemoryBufferRef ObjBuffer) {
+  return wrap(new std::shared_ptr<MemoryBuffer>(unwrap(ObjBuffer)));
+}
+
+void
+LLVMOrcDisposeSharedObjectBufferRef(LLVMSharedObjectBufferRef SharedObjBuffer) {
+  delete unwrap(SharedObjBuffer);
+}
+
 LLVMOrcJITStackRef LLVMOrcCreateInstance(LLVMTargetMachineRef TM) {
   TargetMachine *TM2(unwrap(TM));
 
@@ -65,21 +83,23 @@ LLVMOrcErrorCode LLVMOrcSetIndirectStubPointer(LLVMOrcJITStackRef JITStack,
 }
 
 LLVMOrcModuleHandle
-LLVMOrcAddEagerlyCompiledIR(LLVMOrcJITStackRef JITStack, LLVMModuleRef Mod,
+LLVMOrcAddEagerlyCompiledIR(LLVMOrcJITStackRef JITStack,
+                            LLVMSharedModuleRef Mod,
                            LLVMOrcSymbolResolverFn SymbolResolver,
                            void *SymbolResolverCtx) {
   OrcCBindingsStack &J = *unwrap(JITStack);
-  Module *M(unwrap(Mod));
-  return J.addIRModuleEager(M, SymbolResolver, SymbolResolverCtx);
+  std::shared_ptr<Module> *M(unwrap(Mod));
+  return J.addIRModuleEager(*M, SymbolResolver, SymbolResolverCtx);
 }
 
 LLVMOrcModuleHandle
-LLVMOrcAddLazilyCompiledIR(LLVMOrcJITStackRef JITStack, LLVMModuleRef Mod,
+LLVMOrcAddLazilyCompiledIR(LLVMOrcJITStackRef JITStack,
+                           LLVMSharedModuleRef Mod,
                           LLVMOrcSymbolResolverFn SymbolResolver,
                           void *SymbolResolverCtx) {
   OrcCBindingsStack &J = *unwrap(JITStack);
-  Module *M(unwrap(Mod));
-  return J.addIRModuleLazy(M, SymbolResolver, SymbolResolverCtx);
+  std::shared_ptr<Module> *M(unwrap(Mod));
+  return J.addIRModuleLazy(*M, SymbolResolver, SymbolResolverCtx);
 }
 
 void LLVMOrcRemoveModule(LLVMOrcJITStackRef JITStack, LLVMOrcModuleHandle H) {
diff --git a/lib/ExecutionEngine/Orc/OrcCBindingsStack.h b/lib/ExecutionEngine/Orc/OrcCBindingsStack.h
index a79dd844bf4f7..931d0a9eb2ade 100644
--- a/lib/ExecutionEngine/Orc/OrcCBindingsStack.h
+++ b/lib/ExecutionEngine/Orc/OrcCBindingsStack.h
@@ -1,4 +1,4 @@
-//===--- OrcCBindingsStack.h - Orc JIT stack for C bindings ---*- C++ -*---===//
+//===- OrcCBindingsStack.h - Orc JIT stack for C bindings -----*- C++ -*---===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,39 +11,63 @@
 #define LLVM_LIB_EXECUTIONENGINE_ORC_ORCCBINDINGSSTACK_H
 
 #include "llvm-c/OrcBindings.h"
-#include "llvm/ADT/Triple.h"
+#include "llvm-c/TargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ExecutionEngine/JITSymbol.h"
 #include "llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h"
 #include "llvm/ExecutionEngine/Orc/CompileUtils.h"
 #include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
 #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
+#include "llvm/ExecutionEngine/Orc/LambdaResolver.h"
 #include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
-#include "llvm/IR/LLVMContext.h"
+#include "llvm/ExecutionEngine/RuntimeDyld.h"
+#include "llvm/ExecutionEngine/SectionMemoryManager.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CBindingWrapping.h"
 #include "llvm/Support/Error.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
 
 namespace llvm {
 
 class OrcCBindingsStack;
 
+DEFINE_SIMPLE_CONVERSION_FUNCTIONS(std::shared_ptr<Module>,
+                                   LLVMSharedModuleRef)
+DEFINE_SIMPLE_CONVERSION_FUNCTIONS(std::shared_ptr<MemoryBuffer>,
+                                   LLVMSharedObjectBufferRef)
 DEFINE_SIMPLE_CONVERSION_FUNCTIONS(OrcCBindingsStack, LLVMOrcJITStackRef)
 DEFINE_SIMPLE_CONVERSION_FUNCTIONS(TargetMachine, LLVMTargetMachineRef)
 
 class OrcCBindingsStack {
 public:
-  typedef orc::JITCompileCallbackManager CompileCallbackMgr;
-  typedef orc::RTDyldObjectLinkingLayer<> ObjLayerT;
-  typedef orc::IRCompileLayer<ObjLayerT> CompileLayerT;
-  typedef orc::CompileOnDemandLayer<CompileLayerT, CompileCallbackMgr>
-      CODLayerT;
-  typedef std::function<std::unique_ptr<CompileCallbackMgr>()>
-      CallbackManagerBuilder;
+  using CompileCallbackMgr = orc::JITCompileCallbackManager;
+  using ObjLayerT = orc::RTDyldObjectLinkingLayer;
+  using CompileLayerT = orc::IRCompileLayer<ObjLayerT, orc::SimpleCompiler>;
+  using CODLayerT =
+      orc::CompileOnDemandLayer<CompileLayerT, CompileCallbackMgr>;
 
-  typedef CODLayerT::IndirectStubsManagerBuilderT IndirectStubsManagerBuilder;
+  using CallbackManagerBuilder =
+      std::function<std::unique_ptr<CompileCallbackMgr>()>;
+
+  using IndirectStubsManagerBuilder = CODLayerT::IndirectStubsManagerBuilderT;
 
 private:
   class GenericHandle {
   public:
-    virtual ~GenericHandle() {}
+    virtual ~GenericHandle() = default;
+
    virtual JITSymbol findSymbolIn(const std::string &Name,
                                   bool ExportedSymbolsOnly) = 0;
    virtual void removeModule() = 0;
@@ -51,7 +75,7 @@ private:
 
   template <typename LayerT> class GenericHandleImpl : public GenericHandle {
   public:
-    GenericHandleImpl(LayerT &Layer, typename LayerT::ModuleSetHandleT Handle)
+    GenericHandleImpl(LayerT &Layer, typename LayerT::ModuleHandleT Handle)
        : Layer(Layer), Handle(std::move(Handle)) {}
 
    JITSymbol findSymbolIn(const std::string &Name,
@@ -59,31 +83,28 @@ private:
      return Layer.findSymbolIn(Handle, Name, ExportedSymbolsOnly);
    }
 
-    void removeModule() override { return Layer.removeModuleSet(Handle); }
+    void removeModule() override { return Layer.removeModule(Handle); }
 
   private:
    LayerT &Layer;
-    typename LayerT::ModuleSetHandleT Handle;
+    typename LayerT::ModuleHandleT Handle;
   };
 
   template <typename LayerT>
   std::unique_ptr<GenericHandleImpl<LayerT>>
-  createGenericHandle(LayerT &Layer, typename LayerT::ModuleSetHandleT Handle) {
+  createGenericHandle(LayerT &Layer, typename LayerT::ModuleHandleT Handle) {
    return llvm::make_unique<GenericHandleImpl<LayerT>>(Layer,
                                                        std::move(Handle));
   }
 
 public:
-  // We need a 'ModuleSetHandleT' to conform to the layer concept.
-  typedef unsigned ModuleSetHandleT;
-
-  typedef unsigned ModuleHandleT;
+  using ModuleHandleT = unsigned;
 
   OrcCBindingsStack(TargetMachine &TM,
                    std::unique_ptr<CompileCallbackMgr> CCMgr,
                    IndirectStubsManagerBuilder IndirectStubsMgrBuilder)
      : DL(TM.createDataLayout()), IndirectStubsMgr(IndirectStubsMgrBuilder()),
-        CCMgr(std::move(CCMgr)), ObjectLayer(),
+        CCMgr(std::move(CCMgr)),
        CompileLayer(ObjectLayer, orc::SimpleCompiler(TM)),
        CODLayer(CompileLayer,
                 [](Function &F) { return std::set<Function *>({&F}); },
@@ -153,7 +174,7 @@ public:
          if (ExternalResolver)
            return JITSymbol(
                ExternalResolver(Name.c_str(), ExternalResolverCtx),
-                llvm::JITSymbolFlags::Exported);
+                JITSymbolFlags::Exported);
 
          return JITSymbol(nullptr);
        },
@@ -163,11 +184,10 @@ public:
   }
 
   template <typename LayerT>
-  ModuleHandleT addIRModule(LayerT &Layer, Module *M,
+  ModuleHandleT addIRModule(LayerT &Layer, std::shared_ptr<Module> M,
                            std::unique_ptr<RuntimeDyld::MemoryManager> MemMgr,
                            LLVMOrcSymbolResolverFn ExternalResolver,
                            void *ExternalResolverCtx) {
-
    // Attach a data-layout if one isn't already present.
    if (M->getDataLayout().isDefault())
      M->setDataLayout(DL);
@@ -184,11 +204,8 @@ public:
    auto Resolver = createResolver(ExternalResolver, ExternalResolverCtx);
 
    // Add the module to the JIT.
-    std::vector<Module *> S;
-    S.push_back(std::move(M));
-
-    auto LH = Layer.addModuleSet(std::move(S), std::move(MemMgr),
-                                 std::move(Resolver));
+    auto LH = Layer.addModule(std::move(M), std::move(MemMgr),
+                              std::move(Resolver));
    ModuleHandleT H = createHandle(Layer, LH);
 
    // Run the static constructors, and save the static destructor runner for
@@ -201,7 +218,7 @@ public:
    return H;
   }
 
-  ModuleHandleT addIRModuleEager(Module *M,
+  ModuleHandleT addIRModuleEager(std::shared_ptr<Module> M,
                                 LLVMOrcSymbolResolverFn ExternalResolver,
                                 void *ExternalResolverCtx) {
    return addIRModule(CompileLayer, std::move(M),
@@ -209,7 +226,7 @@ public:
                       std::move(ExternalResolver), ExternalResolverCtx);
   }
 
-  ModuleHandleT addIRModuleLazy(Module *M,
+  ModuleHandleT addIRModuleLazy(std::shared_ptr<Module> M,
                                LLVMOrcSymbolResolverFn ExternalResolver,
                                void *ExternalResolverCtx) {
    return addIRModule(CODLayer, std::move(M),
@@ -238,8 +255,7 @@ public:
 
 private:
   template <typename LayerT>
-  unsigned createHandle(LayerT &Layer,
-                        typename LayerT::ModuleSetHandleT Handle) {
+  unsigned createHandle(LayerT &Layer, typename LayerT::ModuleHandleT Handle) {
    unsigned NewHandle;
    if (!FreeHandleIndexes.empty()) {
      NewHandle = FreeHandleIndexes.back();
diff --git a/lib/ExecutionEngine/Orc/OrcMCJITReplacement.cpp b/lib/ExecutionEngine/Orc/OrcMCJITReplacement.cpp
index b7a68e041c128..f89f21adff417 100644
--- a/lib/ExecutionEngine/Orc/OrcMCJITReplacement.cpp
+++ b/lib/ExecutionEngine/Orc/OrcMCJITReplacement.cpp
@@ -124,5 +124,10 @@ OrcMCJITReplacement::runFunction(Function *F,
   llvm_unreachable("Full-featured argument passing not supported yet!");
 }
 
+void OrcMCJITReplacement::runStaticConstructorsDestructors(bool isDtors) {
+  for (auto &M : LocalModules)
+    ExecutionEngine::runStaticConstructorsDestructors(*M, isDtors);
+}
+
 } // End namespace orc.
 } // End namespace llvm.
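Note on the Orc C-bindings change above: modules handed to the JIT are now reference-counted via LLVMSharedModuleRef instead of being passed as raw LLVMModuleRef pointers. The sketch below is illustrative only, not part of the patch; it assumes a valid target machine and module already exist, the resolver body is a placeholder, and exact signatures should be checked against llvm-c/OrcBindings.h for the release in use.

// Sketch: intended call sequence for the shared-module C API added above.
// Assumes TM (LLVMTargetMachineRef) and Mod (LLVMModuleRef) already exist.
static uint64_t resolveSymbol(const char *Name, void *Ctx) {
  return 0; // placeholder: a real client resolves Name in the host process
}

void jitOneModule(LLVMTargetMachineRef TM, LLVMModuleRef Mod) {
  LLVMOrcJITStackRef JIT = LLVMOrcCreateInstance(TM);
  // Wrap the module so ownership is shared between the caller and the JIT.
  LLVMSharedModuleRef Shared = LLVMOrcMakeSharedModule(Mod);
  LLVMOrcModuleHandle H =
      LLVMOrcAddEagerlyCompiledIR(JIT, Shared, resolveSymbol, nullptr);
  LLVMOrcDisposeSharedModuleRef(Shared); // the JIT keeps its own reference
  // ... look up and run JIT'd symbols here ...
  LLVMOrcRemoveModule(JIT, H);
}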
diff --git a/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h b/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h
index 7dd6b17d33cb4..b20690c7caafc 100644
--- a/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h
+++ b/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h
@@ -1,4 +1,4 @@
-//===---- OrcMCJITReplacement.h - Orc based MCJIT replacement ---*- C++ -*-===//
+//===- OrcMCJITReplacement.h - Orc based MCJIT replacement ------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -24,9 +24,12 @@
 #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
 #include "llvm/ExecutionEngine/Orc/LazyEmittingLayer.h"
 #include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
+#include "llvm/ExecutionEngine/RTDyldMemoryManager.h"
 #include "llvm/ExecutionEngine/RuntimeDyld.h"
+#include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
 #include "llvm/Object/Archive.h"
 #include "llvm/Object/Binary.h"
 #include "llvm/Object/ObjectFile.h"
@@ -45,6 +48,9 @@
 #include <vector>
 
 namespace llvm {
+
+class ObjectCache;
+
 namespace orc {
 
 class OrcMCJITReplacement : public ExecutionEngine {
@@ -151,7 +157,6 @@ class OrcMCJITReplacement : public ExecutionEngine {
   };
 
 private:
-
   static ExecutionEngine *
   createOrcMCJITReplacement(std::string *ErrorMsg,
                            std::shared_ptr<MCJITMemoryManager> MemMgr,
@@ -162,10 +167,6 @@ private:
   }
 
 public:
-  static void Register() {
-    OrcMCJITReplacementCtor = createOrcMCJITReplacement;
-  }
-
   OrcMCJITReplacement(
      std::shared_ptr<MCJITMemoryManager> MemMgr,
      std::shared_ptr<JITSymbolResolver> ClientResolver,
@@ -178,8 +179,11 @@ public:
        CompileLayer(ObjectLayer, SimpleCompiler(*this->TM)),
        LazyEmitLayer(CompileLayer) {}
 
-  void addModule(std::unique_ptr<Module> M) override {
+  static void Register() {
+    OrcMCJITReplacementCtor = createOrcMCJITReplacement;
+  }
 
+  void addModule(std::unique_ptr<Module> M) override {
    // If this module doesn't have a DataLayout attached then attach the
    // default.
    if (M->getDataLayout().isDefault()) {
@@ -187,29 +191,44 @@ public:
    } else {
      assert(M->getDataLayout() == getDataLayout() && "DataLayout Mismatch");
    }
-    Modules.push_back(std::move(M));
-    std::vector<Module *> Ms;
-    Ms.push_back(&*Modules.back());
-    LazyEmitLayer.addModuleSet(std::move(Ms), &MemMgr, &Resolver);
+    auto *MPtr = M.release();
+    ShouldDelete[MPtr] = true;
+    auto Deleter =
+        [this](Module *Mod) {
+          if (ShouldDelete[Mod])
+            delete Mod;
+        };
+    LocalModules.push_back(std::shared_ptr<Module>(MPtr, std::move(Deleter)));
+    LazyEmitLayer.addModule(LocalModules.back(), &MemMgr, &Resolver);
   }
 
   void addObjectFile(std::unique_ptr<object::ObjectFile> O) override {
-    std::vector<std::unique_ptr<object::ObjectFile>> Objs;
-    Objs.push_back(std::move(O));
-    ObjectLayer.addObjectSet(std::move(Objs), &MemMgr, &Resolver);
+    auto Obj =
+      std::make_shared<object::OwningBinary<object::ObjectFile>>(std::move(O),
+                                                                 nullptr);
+    ObjectLayer.addObject(std::move(Obj), &MemMgr, &Resolver);
   }
 
   void addObjectFile(object::OwningBinary<object::ObjectFile> O) override {
-    std::vector<std::unique_ptr<object::OwningBinary<object::ObjectFile>>> Objs;
-    Objs.push_back(
-      llvm::make_unique<object::OwningBinary<object::ObjectFile>>(
-        std::move(O)));
-    ObjectLayer.addObjectSet(std::move(Objs), &MemMgr, &Resolver);
+    auto Obj =
+      std::make_shared<object::OwningBinary<object::ObjectFile>>(std::move(O));
+    ObjectLayer.addObject(std::move(Obj), &MemMgr, &Resolver);
   }
 
   void addArchive(object::OwningBinary<object::Archive> A) override {
    Archives.push_back(std::move(A));
   }
+
+  bool removeModule(Module *M) override {
+    for (auto I = LocalModules.begin(), E = LocalModules.end(); I != E; ++I) {
+      if (I->get() == M) {
+        ShouldDelete[M] = false;
+        LocalModules.erase(I);
+        return true;
+      }
+    }
+    return false;
+  }
 
   uint64_t getSymbolAddress(StringRef Name) {
    return findSymbol(Name).getAddress();
@@ -256,13 +275,15 @@ public:
                    ArrayRef<GenericValue> ArgValues) override;
 
   void setObjectCache(ObjectCache *NewCache) override {
-    CompileLayer.setObjectCache(NewCache);
+    CompileLayer.getCompiler().setObjectCache(NewCache);
   }
 
   void setProcessAllSections(bool ProcessAllSections) override {
    ObjectLayer.setProcessAllSections(ProcessAllSections);
   }
 
+  void runStaticConstructorsDestructors(bool isDtors) override;
+
 private:
   JITSymbol findMangledSymbol(StringRef Name) {
    if (auto Sym = LazyEmitLayer.findSymbol(Name, false))
@@ -294,10 +315,12 @@ private:
      }
      std::unique_ptr<object::Binary> &ChildBin = ChildBinOrErr.get();
      if (ChildBin->isObject()) {
-        std::vector<std::unique_ptr<object::ObjectFile>> ObjSet;
-        ObjSet.push_back(std::unique_ptr<object::ObjectFile>(
-            static_cast<object::ObjectFile *>(ChildBin.release())));
-        ObjectLayer.addObjectSet(std::move(ObjSet), &MemMgr, &Resolver);
+        std::unique_ptr<object::ObjectFile> ChildObj(
+            static_cast<object::ObjectFile*>(ChildBinOrErr->release()));
+        auto Obj =
+            std::make_shared<object::OwningBinary<object::ObjectFile>>(
+                std::move(ChildObj), nullptr);
+        ObjectLayer.addObject(std::move(Obj), &MemMgr, &Resolver);
        if (auto Sym = ObjectLayer.findSymbol(Name, true))
          return Sym;
      }
@@ -308,34 +331,19 @@ private:
 
   class NotifyObjectLoadedT {
   public:
-    typedef std::vector<std::unique_ptr<RuntimeDyld::LoadedObjectInfo>>
-        LoadedObjInfoListT;
+    using LoadedObjInfoListT =
+        std::vector<std::unique_ptr<RuntimeDyld::LoadedObjectInfo>>;
 
    NotifyObjectLoadedT(OrcMCJITReplacement &M) : M(M) {}
 
-    template <typename ObjListT>
-    void operator()(RTDyldObjectLinkingLayerBase::ObjSetHandleT H,
-                    const ObjListT &Objects,
-                    const LoadedObjInfoListT &Infos) const {
+    void operator()(RTDyldObjectLinkingLayerBase::ObjHandleT H,
+                    const RTDyldObjectLinkingLayer::ObjectPtr &Obj,
+                    const LoadedObjectInfo &Info) const {
      M.UnfinalizedSections[H] = std::move(M.SectionsAllocatedSinceLastLoad);
      M.SectionsAllocatedSinceLastLoad = SectionAddrSet();
-      assert(Objects.size() == Infos.size() &&
-             "Incorrect number of Infos for Objects.");
-      for (unsigned I = 0; I < Objects.size(); ++I)
-        M.MemMgr.notifyObjectLoaded(&M, getObject(*Objects[I]));
+      M.MemMgr.notifyObjectLoaded(&M, *Obj->getBinary());
    }
 
   private:
-    static const object::ObjectFile& getObject(const object::ObjectFile &Obj) {
-      return Obj;
-    }
-
-    template <typename ObjT>
-    static const object::ObjectFile&
-    getObject(const object::OwningBinary<ObjT> &Obj) {
-      return *Obj.getBinary();
-    }
-
    OrcMCJITReplacement &M;
   };
 
@@ -343,7 +351,7 @@ private:
   public:
    NotifyFinalizedT(OrcMCJITReplacement &M) : M(M) {}
 
-    void operator()(RTDyldObjectLinkingLayerBase::ObjSetHandleT H) {
+    void operator()(RTDyldObjectLinkingLayerBase::ObjHandleT H) {
      M.UnfinalizedSections.erase(H);
    }
 
@@ -360,9 +368,9 @@ private:
    return MangledName;
   }
 
-  typedef RTDyldObjectLinkingLayer<NotifyObjectLoadedT> ObjectLayerT;
-  typedef IRCompileLayer<ObjectLayerT> CompileLayerT;
-  typedef LazyEmittingLayer<CompileLayerT> LazyEmitLayerT;
+  using ObjectLayerT = RTDyldObjectLinkingLayer;
+  using CompileLayerT = IRCompileLayer<ObjectLayerT, orc::SimpleCompiler>;
+  using LazyEmitLayerT = LazyEmittingLayer<CompileLayerT>;
 
   std::unique_ptr<TargetMachine> TM;
   MCJITReplacementMemMgr MemMgr;
@@ -380,21 +388,24 @@ private:
   // We need to store ObjLayerT::ObjSetHandles for each of the object sets
   // that have been emitted but not yet finalized so that we can forward the
   // mapSectionAddress calls appropriately.
-  typedef std::set<const void *> SectionAddrSet;
-  struct ObjSetHandleCompare {
-    bool operator()(ObjectLayerT::ObjSetHandleT H1,
-                    ObjectLayerT::ObjSetHandleT H2) const {
+  using SectionAddrSet = std::set<const void *>;
+  struct ObjHandleCompare {
+    bool operator()(ObjectLayerT::ObjHandleT H1,
+                    ObjectLayerT::ObjHandleT H2) const {
      return &*H1 < &*H2;
    }
   };
 
   SectionAddrSet SectionsAllocatedSinceLastLoad;
-  std::map<ObjectLayerT::ObjSetHandleT, SectionAddrSet, ObjSetHandleCompare>
+  std::map<ObjectLayerT::ObjHandleT, SectionAddrSet, ObjHandleCompare>
      UnfinalizedSections;
 
+  std::map<Module*, bool> ShouldDelete;
+  std::vector<std::shared_ptr<Module>> LocalModules;
   std::vector<object::OwningBinary<object::Archive>> Archives;
 };
 
 } // end namespace orc
+
 } // end namespace llvm
 
 #endif // LLVM_LIB_EXECUTIONENGINE_ORC_MCJITREPLACEMENT_H
diff --git a/lib/IR/AsmWriter.cpp b/lib/IR/AsmWriter.cpp
index 556e122ff82ff..c7f112887a306 100644
--- a/lib/IR/AsmWriter.cpp
+++ b/lib/IR/AsmWriter.cpp
@@ -1,5 +1,4 @@
-
-//===-- AsmWriter.cpp - Printing LLVM as an assembly file -----------------===//
+//===- AsmWriter.cpp - Printing LLVM as an assembly file ------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -15,63 +14,105 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/iterator_range.h"
 #include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/IR/Argument.h"
 #include "llvm/IR/AssemblyAnnotationWriter.h"
 #include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CFG.h"
+#include "llvm/IR/CallSite.h"
 #include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Comdat.h"
+#include "llvm/IR/Constant.h"
 #include "llvm/IR/Constants.h"
-#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalIFunc.h"
+#include "llvm/IR/GlobalIndirectSymbol.h"
+#include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/IRPrintingPasses.h"
 #include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/ModuleSlotTracker.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/IR/Statepoint.h"
+#include "llvm/IR/Type.h"
 #include "llvm/IR/TypeFinder.h"
+#include "llvm/IR/Use.h"
 #include "llvm/IR/UseListOrder.h"
-#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/FormattedStream.h"
-#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
+#include <cassert>
 #include <cctype>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <memory>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
 using namespace llvm;
 
 // Make virtual table appear in this compilation unit.
-AssemblyAnnotationWriter::~AssemblyAnnotationWriter() {}
+AssemblyAnnotationWriter::~AssemblyAnnotationWriter() = default;
 
 //===----------------------------------------------------------------------===//
 // Helper Functions
 //===----------------------------------------------------------------------===//
 
 namespace {
+
 struct OrderMap {
   DenseMap<const Value *, std::pair<unsigned, bool>> IDs;
 
   unsigned size() const { return IDs.size(); }
   std::pair<unsigned, bool> &operator[](const Value *V) { return IDs[V]; }
+
   std::pair<unsigned, bool> lookup(const Value *V) const {
    return IDs.lookup(V);
   }
+
   void index(const Value *V) {
    // Explicitly sequence get-size and insert-value operations to avoid UB.
    unsigned ID = IDs.size() + 1;
    IDs[V].first = ID;
   }
 };
-}
+
+} // end anonymous namespace
 
 static void orderValue(const Value *V, OrderMap &OM) {
   if (OM.lookup(V).first)
@@ -139,7 +180,7 @@ static void predictValueUseListOrderImpl(const Value *V, const Function *F,
                                         unsigned ID, const OrderMap &OM,
                                         UseListOrderStack &Stack) {
   // Predict use-list order for this one.
-  typedef std::pair<const Use *, unsigned> Entry;
+  using Entry = std::pair<const Use *, unsigned>;
   SmallVector<Entry, 64> List;
   for (const Use &U : V->uses())
    // Check if this user will be serialized.
@@ -421,13 +462,10 @@ static void PrintLLVMName(raw_ostream &OS, const Value *V) {
                isa<GlobalValue>(V) ? GlobalPrefix : LocalPrefix);
 }
 
-
 namespace {
+
 class TypePrinting {
-  TypePrinting(const TypePrinting &) = delete;
-  void operator=(const TypePrinting&) = delete;
 public:
-
   /// NamedTypes - The named types that are used by the current module.
   TypeFinder NamedTypes;
 
@@ -435,6 +473,8 @@ public:
   DenseMap<StructType*, unsigned> NumberedTypes;
 
   TypePrinting() = default;
+  TypePrinting(const TypePrinting &) = delete;
+  TypePrinting &operator=(const TypePrinting &) = delete;
 
   void incorporateTypes(const Module &M);
 
@@ -442,7 +482,8 @@ public:
 
   void printStructBody(StructType *Ty, raw_ostream &OS);
 };
-} // namespace
+
+} // end anonymous namespace
 
 void TypePrinting::incorporateTypes(const Module &M) {
   NamedTypes.run(M, false);
@@ -574,6 +615,7 @@ void TypePrinting::printStructBody(StructType *STy, raw_ostream &OS) {
 }
 
 namespace llvm {
+
 //===----------------------------------------------------------------------===//
 // SlotTracker Class: Enumerate slot numbers for unnamed values
 //===----------------------------------------------------------------------===//
@@ -582,32 +624,33 @@ namespace llvm {
 class SlotTracker {
 public:
   /// ValueMap - A mapping of Values to slot numbers.
-  typedef DenseMap<const Value*, unsigned> ValueMap;
+  using ValueMap = DenseMap<const Value *, unsigned>;
 
 private:
   /// TheModule - The module for which we are holding slot numbers.
   const Module* TheModule;
 
   /// TheFunction - The function for which we are holding slot numbers.
-  const Function* TheFunction;
-  bool FunctionProcessed;
+  const Function* TheFunction = nullptr;
+  bool FunctionProcessed = false;
   bool ShouldInitializeAllMetadata;
 
   /// mMap - The slot map for the module level data.
   ValueMap mMap;
-  unsigned mNext;
+  unsigned mNext = 0;
 
   /// fMap - The slot map for the function level data.
   ValueMap fMap;
-  unsigned fNext;
+  unsigned fNext = 0;
 
   /// mdnMap - Map for MDNodes.
   DenseMap<const MDNode*, unsigned> mdnMap;
-  unsigned mdnNext;
+  unsigned mdnNext = 0;
 
   /// asMap - The slot map for attribute sets.
   DenseMap<AttributeSet, unsigned> asMap;
-  unsigned asNext;
+  unsigned asNext = 0;
+
 public:
   /// Construct from a module.
   ///
@@ -616,6 +659,7 @@ public:
   /// within a function (even if no functions have been initialized).
   explicit SlotTracker(const Module *M,
                       bool ShouldInitializeAllMetadata = false);
+
   /// Construct from a function, starting out in incorp state.
   ///
   /// If \c ShouldInitializeAllMetadata, initializes all metadata in all
@@ -624,6 +668,9 @@ public:
   explicit SlotTracker(const Function *F,
                       bool ShouldInitializeAllMetadata = false);
 
+  SlotTracker(const SlotTracker &) = delete;
+  SlotTracker &operator=(const SlotTracker &) = delete;
+
   /// Return the slot number of the specified value in it's type
   /// plane.  If something is not in the SlotTracker, return -1.
   int getLocalSlot(const Value *V);
@@ -646,14 +693,16 @@ public:
   void purgeFunction();
 
   /// MDNode map iterators.
-  typedef DenseMap<const MDNode*, unsigned>::iterator mdn_iterator;
+  using mdn_iterator = DenseMap<const MDNode*, unsigned>::iterator;
+
   mdn_iterator mdn_begin() { return mdnMap.begin(); }
   mdn_iterator mdn_end() { return mdnMap.end(); }
   unsigned mdn_size() const { return mdnMap.size(); }
   bool mdn_empty() const { return mdnMap.empty(); }
 
   /// AttributeSet map iterators.
-  typedef DenseMap<AttributeSet, unsigned>::iterator as_iterator;
+  using as_iterator = DenseMap<AttributeSet, unsigned>::iterator;
+
   as_iterator as_begin()   { return asMap.begin(); }
   as_iterator as_end()     { return asMap.end(); }
   unsigned as_size() const { return asMap.size(); }
@@ -691,11 +740,9 @@ private:
 
   /// Add all of the metadata from an instruction.
   void processInstructionMetadata(const Instruction &I);
-
-  SlotTracker(const SlotTracker &) = delete;
-  void operator=(const SlotTracker &) = delete;
 };
-} // namespace llvm
+
+} // end namespace llvm
 
 ModuleSlotTracker::ModuleSlotTracker(SlotTracker &Machine, const Module *M,
                                     const Function *F)
@@ -706,7 +753,7 @@ ModuleSlotTracker::ModuleSlotTracker(const Module *M,
    : ShouldCreateStorage(M),
      ShouldInitializeAllMetadata(ShouldInitializeAllMetadata), M(M) {}
 
-ModuleSlotTracker::~ModuleSlotTracker() {}
+ModuleSlotTracker::~ModuleSlotTracker() = default;
 
 SlotTracker *ModuleSlotTracker::getMachine() {
   if (!ShouldCreateStorage)
@@ -773,17 +820,13 @@ static SlotTracker *createSlotTracker(const Value *V) {
 // Module level constructor. Causes the contents of the Module (sans functions)
 // to be added to the slot table.
 SlotTracker::SlotTracker(const Module *M, bool ShouldInitializeAllMetadata)
-    : TheModule(M), TheFunction(nullptr), FunctionProcessed(false),
-      ShouldInitializeAllMetadata(ShouldInitializeAllMetadata), mNext(0),
-      fNext(0), mdnNext(0), asNext(0) {}
+    : TheModule(M), ShouldInitializeAllMetadata(ShouldInitializeAllMetadata) {}
 
 // Function level constructor. Causes the contents of the Module and the one
 // function provided to be added to the slot table.
 SlotTracker::SlotTracker(const Function *F, bool ShouldInitializeAllMetadata)
    : TheModule(F ? F->getParent() : nullptr), TheFunction(F),
-      FunctionProcessed(false),
-      ShouldInitializeAllMetadata(ShouldInitializeAllMetadata), mNext(0),
-      fNext(0), mdnNext(0), asNext(0) {}
+      ShouldInitializeAllMetadata(ShouldInitializeAllMetadata) {}
 
 inline void SlotTracker::initialize() {
   if (TheModule) {
@@ -949,7 +992,6 @@ int SlotTracker::getMetadataSlot(const MDNode *N) {
   return MI == mdnMap.end() ? -1 : (int)MI->second;
 }
 
-
 /// getLocalSlot - Get the slot number for a value that is local to a function.
 int SlotTracker::getLocalSlot(const Value *V) {
   assert(!isa<Constant>(V) && "Can't get a constant or global slot with this!");
@@ -1248,7 +1290,6 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
    return;
   }
 
-
   if (const ConstantStruct *CS = dyn_cast<ConstantStruct>(CV)) {
    if (CS->getType()->isPacked())
      Out << '<';
@@ -1381,11 +1422,14 @@ static void writeMDTuple(raw_ostream &Out, const MDTuple *Node,
 }
 
 namespace {
+
 struct FieldSeparator {
-  bool Skip;
+  bool Skip = true;
   const char *Sep;
-  FieldSeparator(const char *Sep = ", ") : Skip(true), Sep(Sep) {}
+
+  FieldSeparator(const char *Sep = ", ") : Sep(Sep) {}
 };
+
 raw_ostream &operator<<(raw_ostream &OS, FieldSeparator &FS) {
   if (FS.Skip) {
    FS.Skip = false;
@@ -1393,19 +1437,20 @@ raw_ostream &operator<<(raw_ostream &OS, FieldSeparator &FS) {
   }
   return OS << FS.Sep;
 }
+
 struct MDFieldPrinter {
   raw_ostream &Out;
   FieldSeparator FS;
-  TypePrinting *TypePrinter;
-  SlotTracker *Machine;
-  const Module *Context;
+  TypePrinting *TypePrinter = nullptr;
+  SlotTracker *Machine = nullptr;
+  const Module *Context = nullptr;
 
-  explicit MDFieldPrinter(raw_ostream &Out)
-      : Out(Out), TypePrinter(nullptr), Machine(nullptr), Context(nullptr) {}
+  explicit MDFieldPrinter(raw_ostream &Out) : Out(Out) {}
   MDFieldPrinter(raw_ostream &Out, TypePrinting *TypePrinter,
                 SlotTracker *Machine, const Module *Context)
      : Out(Out), TypePrinter(TypePrinter), Machine(Machine), Context(Context) {
   }
+
   void printTag(const DINode *N);
   void printMacinfoType(const DIMacroNode *N);
   void printChecksumKind(const DIFile *N);
@@ -1422,7 +1467,8 @@ struct MDFieldPrinter {
                      bool ShouldSkipZero = true);
   void printEmissionKind(StringRef Name, DICompileUnit::DebugEmissionKind EK);
 };
-} // end namespace
+
+} // end anonymous namespace
 
 void MDFieldPrinter::printTag(const DINode *N) {
   Out << FS << "tag: ";
@@ -1518,7 +1564,6 @@ void MDFieldPrinter::printEmissionKind(StringRef Name,
   Out << FS << Name << ": " << DICompileUnit::EmissionKindString(EK);
 }
 
-
 template <class IntTy, class Stringifier>
 void MDFieldPrinter::printDwarfEnum(StringRef Name, IntTy Value,
                                    Stringifier toString, bool ShouldSkipZero) {
@@ -1923,7 +1968,6 @@ static void writeDIImportedEntity(raw_ostream &Out, const DIImportedEntity *N,
   Out << ")";
 }
 
-
 static void WriteMDNodeBodyInternal(raw_ostream &Out, const MDNode *Node,
                                    TypePrinting *TypePrinter,
                                    SlotTracker *Machine,
@@ -2062,6 +2106,7 @@ static void WriteAsOperandInternal(raw_ostream &Out, const Metadata *MD,
 }
 
 namespace {
+
 class AssemblyWriter {
   formatted_raw_ostream &Out;
   const Module *TheModule;
@@ -2125,7 +2170,8 @@ private:
   // intrinsic indicating base and derived pointer names.
   void printGCRelocateComment(const GCRelocateInst &Relocate);
 };
-} // namespace
+
+} // end anonymous namespace
 
 AssemblyWriter::AssemblyWriter(formatted_raw_ostream &o, SlotTracker &Mac,
                               const Module *M, AssemblyAnnotationWriter *AAW,
@@ -2594,7 +2640,6 @@ void AssemblyWriter::printTypeIdentities() {
 }
 
 /// printFunction - Print all aspects of a function.
-///
 void AssemblyWriter::printFunction(const Function *F) {
   // Print out the return type and name.
   Out << '\n';
@@ -2730,7 +2775,6 @@ void AssemblyWriter::printFunction(const Function *F) {
 
 /// printArgument - This member is called for every argument that is passed into
 /// the function.  Simply print it out
-///
 void AssemblyWriter::printArgument(const Argument *Arg, AttributeSet Attrs) {
   // Output type...
   TypePrinter.print(Arg->getType(), Out);
@@ -2747,7 +2791,6 @@ void AssemblyWriter::printArgument(const Argument *Arg, AttributeSet Attrs) {
 }
 
 /// printBasicBlock - This member is called for each basic block in a method.
-///
 void AssemblyWriter::printBasicBlock(const BasicBlock *BB) {
   if (BB->hasName()) {              // Print out the label if it exists...
    Out << "\n";
@@ -2813,7 +2856,6 @@ void AssemblyWriter::printGCRelocateComment(const GCRelocateInst &Relocate) {
 
 /// printInfoComment - Print a little comment after the instruction indicating
 /// which slot it occupies.
-///
 void AssemblyWriter::printInfoComment(const Value &V) {
   if (const auto *Relocate = dyn_cast<GCRelocateInst>(&V))
    printGCRelocateComment(*Relocate);
@@ -3046,7 +3088,6 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
      Out << " #" << Machine.getAttributeGroupSlot(PAL.getFnAttributes());
 
    writeOperandBundles(CI);
-
   } else if (const InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
    Operand = II->getCalledValue();
    FunctionType *FTy = II->getFunctionType();
@@ -3087,7 +3128,6 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
    writeOperand(II->getNormalDest(), true);
    Out << " unwind ";
    writeOperand(II->getUnwindDest(), true);
-
   } else if (const AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
    Out << ' ';
    if (AI->isUsedWithInAlloca())
@@ -3113,7 +3153,6 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
    if (AddrSpace != 0) {
      Out << ", addrspace(" << AddrSpace << ')';
    }
-
   } else if (isa<CastInst>(I)) {
    if (Operand) {
      Out << ' ';
diff --git a/lib/IR/AttributeImpl.h b/lib/IR/AttributeImpl.h
index 4ed7b021883de..9c7b61f679236 100644
--- a/lib/IR/AttributeImpl.h
+++ b/lib/IR/AttributeImpl.h
@@ -100,6 +100,7 @@ public:
 
 class EnumAttributeImpl : public AttributeImpl {
   virtual void anchor();
+
   Attribute::AttrKind Kind;
 
 protected:
@@ -133,6 +134,7 @@ public:
 
 class StringAttributeImpl : public AttributeImpl {
   virtual void anchor();
+
   std::string Kind;
   std::string Val;
 
@@ -243,7 +245,8 @@ public:
    return AvailableFunctionAttrs & ((uint64_t)1) << Kind;
   }
 
-  typedef const AttributeSet *iterator;
+  using iterator = const AttributeSet *;
+
   iterator begin() const { return getTrailingObjects<AttributeSet>(); }
   iterator end() const { return begin() + NumAttrSets; }
 
diff --git a/lib/IR/Attributes.cpp b/lib/IR/Attributes.cpp
index a518f7b5c81a8..8f2e641d64b92 100644
--- a/lib/IR/Attributes.cpp
+++ b/lib/IR/Attributes.cpp
@@ -1638,6 +1638,39 @@ static void adjustCallerSSPLevel(Function &Caller, const Function &Callee) {
    Caller.addFnAttr(Attribute::StackProtect);
 }
 
+/// \brief If the inlined function required stack probes, then ensure that
+/// the calling function has those too.
+static void adjustCallerStackProbes(Function &Caller, const Function &Callee) {
+  if (!Caller.hasFnAttribute("probe-stack") &&
+      Callee.hasFnAttribute("probe-stack")) {
+    Caller.addFnAttr(Callee.getFnAttribute("probe-stack"));
+  }
+}
+
+/// \brief If the inlined function defines the size of guard region
+/// on the stack, then ensure that the calling function defines a guard region
+/// that is no larger.
+static void
+adjustCallerStackProbeSize(Function &Caller, const Function &Callee) {
+  if (Callee.hasFnAttribute("stack-probe-size")) {
+    uint64_t CalleeStackProbeSize;
+    Callee.getFnAttribute("stack-probe-size")
+          .getValueAsString()
+          .getAsInteger(0, CalleeStackProbeSize);
+    if (Caller.hasFnAttribute("stack-probe-size")) {
+      uint64_t CallerStackProbeSize;
+      Caller.getFnAttribute("stack-probe-size")
+            .getValueAsString()
+            .getAsInteger(0, CallerStackProbeSize);
+      if (CallerStackProbeSize > CalleeStackProbeSize) {
+        Caller.addFnAttr(Callee.getFnAttribute("stack-probe-size"));
+      }
+    } else {
+      Caller.addFnAttr(Callee.getFnAttribute("stack-probe-size"));
+    }
+  }
+}
+
 #define GET_ATTR_COMPAT_FUNC
 #include "AttributesCompatFunc.inc"
 
diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp
index 06934b365a11b..6a4b8032ffd54 100644
--- a/lib/IR/AutoUpgrade.cpp
+++ b/lib/IR/AutoUpgrade.cpp
@@ -142,6 +142,11 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
      Name.startswith("avx512.mask.packssdw.") || // Added in 5.0
      Name.startswith("avx512.mask.packuswb.") || // Added in 5.0
      Name.startswith("avx512.mask.packusdw.") || // Added in 5.0
+      Name.startswith("avx512.mask.cmp.b") || // Added in 5.0
+      Name.startswith("avx512.mask.cmp.d") || // Added in 5.0
+      Name.startswith("avx512.mask.cmp.q") || // Added in 5.0
+      Name.startswith("avx512.mask.cmp.w") || // Added in 5.0
+      Name.startswith("avx512.mask.ucmp.") || // Added in 5.0
      Name == "avx512.mask.add.pd.128" || // Added in 4.0
      Name == "avx512.mask.add.pd.256" || // Added in 4.0
      Name == "avx512.mask.add.ps.128" || // Added in 4.0
@@ -783,12 +788,30 @@ static Value *upgradeIntMinMax(IRBuilder<> &Builder, CallInst &CI,
 }
 
 static Value *upgradeMaskedCompare(IRBuilder<> &Builder, CallInst &CI,
-                                   ICmpInst::Predicate Pred) {
+                                   unsigned CC, bool Signed) {
   Value *Op0 = CI.getArgOperand(0);
   unsigned NumElts = Op0->getType()->getVectorNumElements();
-  Value *Cmp = Builder.CreateICmp(Pred, Op0, CI.getArgOperand(1));
 
-  Value *Mask = CI.getArgOperand(2);
+  Value *Cmp;
+  if (CC == 3) {
+    Cmp = Constant::getNullValue(llvm::VectorType::get(Builder.getInt1Ty(), NumElts));
+  } else if (CC == 7) {
+    Cmp = Constant::getAllOnesValue(llvm::VectorType::get(Builder.getInt1Ty(), NumElts));
+  } else {
+    ICmpInst::Predicate Pred;
+    switch (CC) {
+    default: llvm_unreachable("Unknown condition code");
+    case 0: Pred = ICmpInst::ICMP_EQ; break;
+    case 1: Pred = Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT; break;
+    case 2: Pred = Signed ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE; break;
+    case 4: Pred = ICmpInst::ICMP_NE; break;
+    case 5: Pred = Signed ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE; break;
+    case 6: Pred = Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; break;
+    }
+    Cmp = Builder.CreateICmp(Pred, Op0, CI.getArgOperand(1));
+  }
+
+  Value *Mask = CI.getArgOperand(CI.getNumArgOperands() - 1);
   const auto *C = dyn_cast<Constant>(Mask);
   if (!C || !C->isAllOnesValue())
    Cmp = Builder.CreateAnd(Cmp, getX86MaskVec(Builder, Mask, NumElts));
@@ -1007,9 +1030,13 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
    } else if (IsX86 && Name.startswith("avx512.mask.pcmp")) {
      // "avx512.mask.pcmpeq." or "avx512.mask.pcmpgt."
      bool CmpEq = Name[16] == 'e';
-      Rep = upgradeMaskedCompare(Builder, *CI,
-                                 CmpEq ? ICmpInst::ICMP_EQ
-                                       : ICmpInst::ICMP_SGT);
+      Rep = upgradeMaskedCompare(Builder, *CI, CmpEq ? 0 : 6, true);
+    } else if (IsX86 && Name.startswith("avx512.mask.cmp")) {
+      unsigned Imm = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
+      Rep = upgradeMaskedCompare(Builder, *CI, Imm, true);
+    } else if (IsX86 && Name.startswith("avx512.mask.ucmp")) {
+      unsigned Imm = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
+      Rep = upgradeMaskedCompare(Builder, *CI, Imm, false);
    } else if (IsX86 && (Name == "sse41.pmaxsb" ||
                         Name == "sse2.pmaxs.w" ||
                         Name == "sse41.pmaxsd" ||
diff --git a/lib/IR/BasicBlock.cpp b/lib/IR/BasicBlock.cpp
index 1f8659d4e2cae..2b780adf6c69c 100644
--- a/lib/IR/BasicBlock.cpp
+++ b/lib/IR/BasicBlock.cpp
@@ -355,6 +355,19 @@ bool BasicBlock::canSplitPredecessors() const {
   return true;
 }
 
+bool BasicBlock::isLegalToHoistInto() const {
+  auto *Term = getTerminator();
+  // No terminator means the block is under construction.
+  if (!Term)
+    return true;
+
+  // If the block has no successors, there can be no instructions to hoist.
+  assert(Term->getNumSuccessors() > 0);
+
+  // Instructions should not be hoisted across exception handling boundaries.
+  return !Term->isExceptional();
+}
+
 /// This splits a basic block into two at the specified
 /// instruction. Note that all instructions BEFORE the specified iterator stay
 /// as part of the original basic block, an unconditional branch is added to
diff --git a/lib/IR/ConstantRange.cpp b/lib/IR/ConstantRange.cpp
index 21d1996ef8514..4bd17257016d7 100644
--- a/lib/IR/ConstantRange.cpp
+++ b/lib/IR/ConstantRange.cpp
@@ -1,4 +1,4 @@
-//===-- ConstantRange.cpp - ConstantRange implementation ------------------===//
+//===- ConstantRange.cpp - ConstantRange implementation -------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -21,12 +21,21 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/APInt.h"
 #include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instruction.h"
+#include "llvm/IR/Metadata.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
 using namespace llvm;
 
 ConstantRange::ConstantRange(uint32_t BitWidth, bool Full)
@@ -170,7 +179,7 @@ ConstantRange
 ConstantRange::makeGuaranteedNoWrapRegion(Instruction::BinaryOps BinOp,
                                          const ConstantRange &Other,
                                          unsigned NoWrapKind) {
-  typedef OverflowingBinaryOperator OBO;
+  using OBO = OverflowingBinaryOperator;
 
   // Computes the intersection of CR0 and CR1.  It is different from
   // intersectWith in that the ConstantRange returned will only contain elements
@@ -284,27 +293,14 @@ APInt ConstantRange::getUnsignedMin() const {
 }
 
 APInt ConstantRange::getSignedMax() const {
-  if (!isWrappedSet()) {
-    APInt UpperMinusOne = getUpper() - 1;
-    if (getLower().sle(UpperMinusOne))
-      return UpperMinusOne;
-    return APInt::getSignedMaxValue(getBitWidth());
-  }
-  if (getLower().isNegative() == getUpper().isNegative())
+  if (isFullSet() || Lower.sgt(Upper))
    return APInt::getSignedMaxValue(getBitWidth());
   return getUpper() - 1;
 }
 
 APInt ConstantRange::getSignedMin() const {
-  if (!isWrappedSet()) {
-    if (getLower().sle(getUpper() - 1))
-      return getLower();
+  if (isFullSet() || (Lower.sgt(Upper) && !getUpper().isMinSignedValue()))
    return APInt::getSignedMinValue(getBitWidth());
-  }
-  if ((getUpper() - 1).slt(getLower())) {
-    if (!getUpper().isMinSignedValue())
-      return APInt::getSignedMinValue(getBitWidth());
-  }
   return getLower();
 }
diff --git a/lib/IR/Instructions.cpp b/lib/IR/Instructions.cpp
index 023a0b178a145..a79b00be4ffe8 100644
--- a/lib/IR/Instructions.cpp
+++ b/lib/IR/Instructions.cpp
@@ -1995,8 +1995,8 @@ BinaryOperator::BinaryOperator(BinaryOps iType, Value *S1, Value *S2,
                InsertBefore) {
   Op<0>() = S1;
   Op<1>() = S2;
-  init(iType);
   setName(Name);
+  AssertOK();
 }
 
 BinaryOperator::BinaryOperator(BinaryOps iType, Value *S1, Value *S2,
@@ -2008,17 +2008,17 @@ BinaryOperator::BinaryOperator(BinaryOps iType, Value *S1, Value *S2,
                InsertAtEnd) {
   Op<0>() = S1;
   Op<1>() = S2;
-  init(iType);
   setName(Name);
+  AssertOK();
 }
 
-void BinaryOperator::init(BinaryOps iType) {
+void BinaryOperator::AssertOK() {
   Value *LHS = getOperand(0), *RHS = getOperand(1);
   (void)LHS; (void)RHS; // Silence warnings.
   assert(LHS->getType() == RHS->getType() &&
         "Binary operator operand types must match!");
 #ifndef NDEBUG
-  switch (iType) {
+  switch (getOpcode()) {
   case Add: case Sub:
   case Mul:
    assert(getType() == LHS->getType() &&
@@ -2038,8 +2038,7 @@ void BinaryOperator::AssertOK() {
   case SDiv:
    assert(getType() == LHS->getType() &&
          "Arithmetic operation should return same type as operands!");
-    assert((getType()->isIntegerTy() || (getType()->isVectorTy() &&
-            cast<VectorType>(getType())->getElementType()->isIntegerTy())) &&
+    assert(getType()->isIntOrIntVectorTy() &&
          "Incorrect operand type (not integer) for S/UDIV");
    break;
   case FDiv:
@@ -2052,8 +2051,7 @@ void BinaryOperator::AssertOK() {
   case SRem:
    assert(getType() == LHS->getType() &&
          "Arithmetic operation should return same type as operands!");
-    assert((getType()->isIntegerTy() || (getType()->isVectorTy() &&
-            cast<VectorType>(getType())->getElementType()->isIntegerTy())) &&
+    assert(getType()->isIntOrIntVectorTy() &&
          "Incorrect operand type (not integer) for S/UREM");
    break;
   case FRem:
@@ -2067,22 +2065,17 @@ void BinaryOperator::AssertOK() {
   case AShr:
    assert(getType() == LHS->getType() &&
           "Shift operation should return same type as operands!");
-    assert((getType()->isIntegerTy() ||
-            (getType()->isVectorTy() &&
-             cast<VectorType>(getType())->getElementType()->isIntegerTy())) &&
+    assert(getType()->isIntOrIntVectorTy() &&
           "Tried to create a shift operation on a non-integral type!");
    break;
   case And: case Or:
   case Xor:
    assert(getType() == LHS->getType() &&
           "Logical operation should return same type as operands!");
-    assert((getType()->isIntegerTy() ||
-            (getType()->isVectorTy() &&
-             cast<VectorType>(getType())->getElementType()->isIntegerTy())) &&
+    assert(getType()->isIntOrIntVectorTy() &&
           "Tried to create a logical operation on a non-integral type!");
    break;
-  default:
-    break;
+  default: llvm_unreachable("Invalid opcode provided");
   }
 #endif
 }
diff --git a/lib/IR/LLVMContextImpl.cpp b/lib/IR/LLVMContextImpl.cpp
index 4a30d28c39138..c19e1be44fdc7 100644
--- a/lib/IR/LLVMContextImpl.cpp
+++ b/lib/IR/LLVMContextImpl.cpp
@@ -1,4 +1,4 @@
-//===-- LLVMContextImpl.cpp - Implement LLVMContextImpl -------------------===//
+//===- LLVMContextImpl.cpp - Implement LLVMContextImpl --------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,18 +12,17 @@
 //===----------------------------------------------------------------------===//
 
 #include "LLVMContextImpl.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/OptBisect.h"
+#include "llvm/IR/Type.h"
 #include "llvm/Support/ManagedStatic.h"
-#include <algorithm>
+#include <cassert>
+#include <utility>
+
 using namespace llvm;
 
 LLVMContextImpl::LLVMContextImpl(LLVMContext &C)
-  : TheTrueVal(nullptr), TheFalseVal(nullptr),
-    VoidTy(C, Type::VoidTyID),
+  : VoidTy(C, Type::VoidTyID),
    LabelTy(C, Type::LabelTyID),
    HalfTy(C, Type::HalfTyID),
    FloatTy(C, Type::FloatTyID),
@@ -39,17 +38,7 @@ LLVMContextImpl::LLVMContextImpl(LLVMContext &C)
    Int16Ty(C, 16),
    Int32Ty(C, 32),
    Int64Ty(C, 64),
-    Int128Ty(C, 128) {
-  InlineAsmDiagHandler = nullptr;
-  InlineAsmDiagContext = nullptr;
-  DiagnosticHandler = nullptr;
-  DiagnosticContext = nullptr;
-  RespectDiagnosticFilters = false;
-  DiagnosticHotnessRequested = false;
-  YieldCallback = nullptr;
-  YieldOpaqueHandle = nullptr;
-  NamedStructTypesUniqueID = 0;
-}
+    Int128Ty(C, 128) {}
 
 LLVMContextImpl::~LLVMContextImpl() {
   // NOTE: We need to delete the contents of OwnedModules, but Module's dtor
@@ -156,7 +145,6 @@ void LLVMContextImpl::dropTriviallyDeadConstantArrays() {
        C->destroyConstant();
      }
    }
-
   } while (Changed);
 }
 
@@ -165,6 +153,7 @@ void Module::dropTriviallyDeadConstantArrays() {
 }
 
 namespace llvm {
+
 /// \brief Make MDOperand transparent for hashing.
 ///
 /// This overload of an implementation detail of the hashing library makes
@@ -179,7 +168,8 @@ namespace llvm {
 /// does not cause MDOperand to be transparent.  In particular, a bare pointer
 /// doesn't get hashed before it's combined, whereas \a MDOperand would.
 static const Metadata *get_hashable_data(const MDOperand &X) { return X.get(); }
-}
+
+} // end namespace llvm
 
 unsigned MDNodeOpsKey::calculateHash(MDNode *N, unsigned Offset) {
   unsigned Hash = hash_combine_range(N->op_begin() + Offset, N->op_end());
diff --git a/lib/IR/LLVMContextImpl.h b/lib/IR/LLVMContextImpl.h
index 4ba974409a4fc..4147f71ad9d2c 100644
--- a/lib/IR/LLVMContextImpl.h
+++ b/lib/IR/LLVMContextImpl.h
@@ -1,4 +1,4 @@
-//===-- LLVMContextImpl.h - The LLVMContextImpl opaque class ----*- C++ -*-===//
+//===- LLVMContextImpl.h - The LLVMContextImpl opaque class -----*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -21,11 +21,16 @@
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfo.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/FoldingSet.h"
 #include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSet.h"
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/IR/Constants.h"
@@ -33,21 +38,26 @@
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Metadata.h"
-#include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/TrackingMDRef.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/YAMLTraits.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
 #include <vector>
 
 namespace llvm {
 
-class ConstantInt;
 class ConstantFP;
-class DiagnosticInfoOptimizationRemark;
-class DiagnosticInfoOptimizationRemarkMissed;
-class DiagnosticInfoOptimizationRemarkAnalysis;
-class GCStrategy;
-class LLVMContext;
+class ConstantInt;
 class Type;
 class Value;
+class ValueHandleBase;
 
 struct DenseMapAPIntKeyInfo {
   static inline APInt getEmptyKey() {
@@ -55,14 +65,17 @@ struct DenseMapAPIntKeyInfo {
    V.U.VAL = 0;
    return V;
   }
+
   static inline APInt getTombstoneKey() {
    APInt V(nullptr, 0);
    V.U.VAL = 1;
    return V;
   }
+
   static unsigned getHashValue(const APInt &Key) {
    return static_cast<unsigned>(hash_value(Key));
   }
+
   static bool isEqual(const APInt &LHS, const APInt &RHS) {
    return LHS.getBitWidth() == RHS.getBitWidth() && LHS == RHS;
   }
@@ -71,9 +84,11 @@ struct DenseMapAPIntKeyInfo {
 struct DenseMapAPFloatKeyInfo {
   static inline APFloat getEmptyKey() { return APFloat(APFloat::Bogus(), 1); }
   static inline APFloat getTombstoneKey() { return APFloat(APFloat::Bogus(), 2); }
+
   static unsigned getHashValue(const APFloat &Key) {
    return static_cast<unsigned>(hash_value(Key));
   }
+
   static bool isEqual(const APFloat &LHS, const APFloat &RHS) {
    return LHS.bitwiseIsEqual(RHS);
   }
@@ -83,10 +98,13 @@ struct AnonStructTypeKeyInfo {
   struct KeyTy {
    ArrayRef<Type*> ETypes;
    bool isPacked;
+
    KeyTy(const ArrayRef<Type*>& E, bool P) : ETypes(E), isPacked(P) {}
+
    KeyTy(const StructType *ST)
        : ETypes(ST->elements()), isPacked(ST->isPacked()) {}
+
    bool operator==(const KeyTy& that) const {
      if (isPacked != that.isPacked)
        return false;
@@ -98,25 +116,31 @@ struct AnonStructTypeKeyInfo {
      return !this->operator==(that);
    }
   };
+
   static inline StructType* getEmptyKey() {
    return DenseMapInfo<StructType*>::getEmptyKey();
   }
+
   static inline StructType* getTombstoneKey() {
    return DenseMapInfo<StructType*>::getTombstoneKey();
   }
+
   static unsigned getHashValue(const KeyTy& Key) {
    return hash_combine(hash_combine_range(Key.ETypes.begin(),
                                           Key.ETypes.end()),
                       Key.isPacked);
   }
+
   static unsigned getHashValue(const StructType *ST) {
    return getHashValue(KeyTy(ST));
   }
+
   static bool isEqual(const KeyTy& LHS, const StructType *RHS) {
    if (RHS == getEmptyKey() || RHS == getTombstoneKey())
      return false;
    return LHS == KeyTy(RHS);
   }
+
   static bool isEqual(const StructType *LHS, const StructType *RHS) {
    return LHS == RHS;
   }
@@ -127,11 +151,13 @@ struct FunctionTypeKeyInfo {
   struct KeyTy {
    const Type *ReturnType;
    ArrayRef<Type*> Params;
    bool isVarArg;
+
    KeyTy(const Type* R, const ArrayRef<Type*>& P, bool V)
        : ReturnType(R), Params(P), isVarArg(V) {}
    KeyTy(const FunctionType *FT)
        : ReturnType(FT->getReturnType()), Params(FT->params()),
          isVarArg(FT->isVarArg()) {}
+
    bool operator==(const KeyTy& that) const {
      if (ReturnType != that.ReturnType)
        return false;
@@ -145,26 +171,32 @@ struct FunctionTypeKeyInfo {
      return !this->operator==(that);
    }
   };
+
   static inline FunctionType* getEmptyKey() {
    return DenseMapInfo<FunctionType*>::getEmptyKey();
   }
+
   static inline FunctionType* getTombstoneKey() {
    return DenseMapInfo<FunctionType*>::getTombstoneKey();
   }
+
   static unsigned getHashValue(const KeyTy& Key) {
    return hash_combine(Key.ReturnType,
                       hash_combine_range(Key.Params.begin(),
                                          Key.Params.end()),
                       Key.isVarArg);
   }
+
   static unsigned getHashValue(const FunctionType *FT) {
    return getHashValue(KeyTy(FT));
   }
+
   static bool isEqual(const KeyTy& LHS, const FunctionType *RHS) {
    if (RHS == getEmptyKey() || RHS == getTombstoneKey())
      return false;
    return LHS == KeyTy(RHS);
   }
+
   static bool isEqual(const FunctionType *LHS, const FunctionType *RHS) {
    return LHS == RHS;
   }
@@ -174,7 +206,6 @@ struct FunctionTypeKeyInfo {
 class MDNodeOpsKey {
   ArrayRef<Metadata *> RawOps;
   ArrayRef<MDOperand> Ops;
-
   unsigned Hash;
 
 protected:
@@ -212,14 +243,15 @@ public:
 };
 
 template <class NodeTy> struct MDNodeKeyImpl;
-template <class NodeTy> struct MDNodeInfo;
 
 /// Configuration point for MDNodeInfo::isEqual().
 template <class NodeTy> struct MDNodeSubsetEqualImpl {
-  typedef MDNodeKeyImpl<NodeTy> KeyTy;
+  using KeyTy = MDNodeKeyImpl<NodeTy>;
+
   static bool isSubsetEqual(const KeyTy &LHS, const NodeTy *RHS) {
    return false;
   }
+
   static bool isSubsetEqual(const NodeTy *LHS, const NodeTy *RHS) {
    return false;
   }
@@ -252,7 +284,6 @@ template <> struct MDNodeKeyImpl<DILocation> {
   MDNodeKeyImpl(unsigned Line, unsigned Column, Metadata *Scope,
                Metadata *InlinedAt)
      : Line(Line), Column(Column), Scope(Scope), InlinedAt(InlinedAt) {}
-
   MDNodeKeyImpl(const DILocation *L)
      : Line(L->getLine()), Column(L->getColumn()), Scope(L->getRawScope()),
        InlinedAt(L->getRawInlinedAt()) {}
@@ -261,6 +292,7 @@ template <> struct MDNodeKeyImpl<DILocation> {
    return Line == RHS->getLine() && Column == RHS->getColumn() &&
           Scope == RHS->getRawScope() && InlinedAt == RHS->getRawInlinedAt();
   }
+
   unsigned getHashValue() const {
    return hash_combine(Line, Column, Scope, InlinedAt);
   }
@@ -270,6 +302,7 @@ template <> struct MDNodeKeyImpl<DILocation> {
 template <> struct MDNodeKeyImpl<GenericDINode> : MDNodeOpsKey {
   unsigned Tag;
   MDString *Header;
+
   MDNodeKeyImpl(unsigned Tag, MDString *Header, ArrayRef<Metadata *> DwarfOps)
      : MDNodeOpsKey(DwarfOps), Tag(Tag), Header(Header) {}
   MDNodeKeyImpl(const GenericDINode *N)
@@ -299,6 +332,7 @@ template <> struct MDNodeKeyImpl<DISubrange> {
   bool isKeyOf(const DISubrange *RHS) const {
    return Count == RHS->getCount() && LowerBound == RHS->getLowerBound();
   }
+
   unsigned getHashValue() const { return hash_combine(Count, LowerBound); }
 };
 
@@ -313,6 +347,7 @@ template <> struct MDNodeKeyImpl<DIEnumerator> {
   bool isKeyOf(const DIEnumerator *RHS) const {
    return Value == RHS->getValue() && Name == RHS->getRawName();
   }
+
   unsigned getHashValue() const { return hash_combine(Value, Name); }
 };
 
@@ -337,6 +372,7 @@ template <> struct MDNodeKeyImpl<DIBasicType> {
           AlignInBits == RHS->getAlignInBits() &&
           Encoding == RHS->getEncoding();
   }
+
   unsigned getHashValue() const {
    return hash_combine(Tag, Name, SizeInBits, AlignInBits, Encoding);
   }
@@ -384,6 +420,7 @@ template <> struct MDNodeKeyImpl<DIDerivedType> {
           Flags == RHS->getFlags() &&
           ExtraData == RHS->getRawExtraData();
   }
+
   unsigned getHashValue() const {
    // If this is a member inside an ODR type, only hash the type and the name.
    // Otherwise the hash will be stronger than
@@ -402,10 +439,12 @@ template <> struct MDNodeKeyImpl<DIDerivedType> {
 };
 
 template <> struct MDNodeSubsetEqualImpl<DIDerivedType> {
-  typedef MDNodeKeyImpl<DIDerivedType> KeyTy;
+  using KeyTy = MDNodeKeyImpl<DIDerivedType>;
+
   static bool isSubsetEqual(const KeyTy &LHS, const DIDerivedType *RHS) {
    return isODRMember(LHS.Tag, LHS.Scope, LHS.Name, RHS);
   }
+
   static bool isSubsetEqual(const DIDerivedType *LHS, const DIDerivedType *RHS) {
    return isODRMember(LHS->getTag(), LHS->getRawScope(), LHS->getRawName(),
                      RHS);
@@ -480,6 +519,7 @@ template <> struct MDNodeKeyImpl<DICompositeType> {
           TemplateParams == RHS->getRawTemplateParams() &&
           Identifier == RHS->getRawIdentifier();
  }
+
   unsigned getHashValue() const {
    // Intentionally computes the hash on a subset of the operands for
    // performance reason.
The subset has to be significant enough to avoid @@ -504,6 +544,7 @@ template <> struct MDNodeKeyImpl<DISubroutineType> { return Flags == RHS->getFlags() && CC == RHS->getCC() && TypeArray == RHS->getRawTypeArray(); } + unsigned getHashValue() const { return hash_combine(Flags, CC, TypeArray); } }; @@ -527,6 +568,7 @@ template <> struct MDNodeKeyImpl<DIFile> { CSKind == RHS->getChecksumKind() && Checksum == RHS->getRawChecksum(); } + unsigned getHashValue() const { return hash_combine(Filename, Directory, CSKind, Checksum); } @@ -601,6 +643,7 @@ template <> struct MDNodeKeyImpl<DISubprogram> { Variables == RHS->getRawVariables() && ThrownTypes == RHS->getRawThrownTypes(); } + unsigned getHashValue() const { // If this is a declaration inside an ODR type, only hash the type and the // name. Otherwise the hash will be stronger than @@ -619,11 +662,13 @@ template <> struct MDNodeKeyImpl<DISubprogram> { }; template <> struct MDNodeSubsetEqualImpl<DISubprogram> { - typedef MDNodeKeyImpl<DISubprogram> KeyTy; + using KeyTy = MDNodeKeyImpl<DISubprogram>; + static bool isSubsetEqual(const KeyTy &LHS, const DISubprogram *RHS) { return isDeclarationOfODRMember(LHS.IsDefinition, LHS.Scope, LHS.LinkageName, LHS.TemplateParams, RHS); } + static bool isSubsetEqual(const DISubprogram *LHS, const DISubprogram *RHS) { return isDeclarationOfODRMember(LHS->isDefinition(), LHS->getRawScope(), LHS->getRawLinkageName(), @@ -672,6 +717,7 @@ template <> struct MDNodeKeyImpl<DILexicalBlock> { return Scope == RHS->getRawScope() && File == RHS->getRawFile() && Line == RHS->getLine() && Column == RHS->getColumn(); } + unsigned getHashValue() const { return hash_combine(Scope, File, Line, Column); } @@ -692,6 +738,7 @@ template <> struct MDNodeKeyImpl<DILexicalBlockFile> { return Scope == RHS->getRawScope() && File == RHS->getRawFile() && Discriminator == RHS->getDiscriminator(); } + unsigned getHashValue() const { return hash_combine(Scope, File, Discriminator); } @@ -712,6 +759,7 @@ template <> struct MDNodeKeyImpl<DINamespace> { return Scope == RHS->getRawScope() && Name == RHS->getRawName() && ExportSymbols == RHS->getExportSymbols(); } + unsigned getHashValue() const { return hash_combine(Scope, Name); } @@ -723,6 +771,7 @@ template <> struct MDNodeKeyImpl<DIModule> { MDString *ConfigurationMacros; MDString *IncludePath; MDString *ISysRoot; + MDNodeKeyImpl(Metadata *Scope, MDString *Name, MDString *ConfigurationMacros, MDString *IncludePath, MDString *ISysRoot) : Scope(Scope), Name(Name), ConfigurationMacros(ConfigurationMacros), @@ -738,6 +787,7 @@ template <> struct MDNodeKeyImpl<DIModule> { IncludePath == RHS->getRawIncludePath() && ISysRoot == RHS->getRawISysRoot(); } + unsigned getHashValue() const { return hash_combine(Scope, Name, ConfigurationMacros, IncludePath, ISysRoot); @@ -755,6 +805,7 @@ template <> struct MDNodeKeyImpl<DITemplateTypeParameter> { bool isKeyOf(const DITemplateTypeParameter *RHS) const { return Name == RHS->getRawName() && Type == RHS->getRawType(); } + unsigned getHashValue() const { return hash_combine(Name, Type); } }; @@ -774,6 +825,7 @@ template <> struct MDNodeKeyImpl<DITemplateValueParameter> { return Tag == RHS->getTag() && Name == RHS->getRawName() && Type == RHS->getRawType() && Value == RHS->getValue(); } + unsigned getHashValue() const { return hash_combine(Tag, Name, Type, Value); } }; @@ -816,6 +868,7 @@ template <> struct MDNodeKeyImpl<DIGlobalVariable> { RHS->getRawStaticDataMemberDeclaration() && AlignInBits == RHS->getAlignInBits(); } + unsigned getHashValue() const { 
// We do not use AlignInBits in hashing function here on purpose: // in most cases this param for local variable is zero (for function param @@ -856,6 +909,7 @@ template <> struct MDNodeKeyImpl<DILocalVariable> { Type == RHS->getRawType() && Arg == RHS->getArg() && Flags == RHS->getFlags() && AlignInBits == RHS->getAlignInBits(); } + unsigned getHashValue() const { // We do not use AlignInBits in hashing function here on purpose: // in most cases this param for local variable is zero (for function param @@ -877,6 +931,7 @@ template <> struct MDNodeKeyImpl<DIExpression> { bool isKeyOf(const DIExpression *RHS) const { return Elements == RHS->getElements(); } + unsigned getHashValue() const { return hash_combine_range(Elements.begin(), Elements.end()); } @@ -895,6 +950,7 @@ template <> struct MDNodeKeyImpl<DIGlobalVariableExpression> { return Variable == RHS->getRawVariable() && Expression == RHS->getRawExpression(); } + unsigned getHashValue() const { return hash_combine(Variable, Expression); } }; @@ -923,6 +979,7 @@ template <> struct MDNodeKeyImpl<DIObjCProperty> { SetterName == RHS->getRawSetterName() && Attributes == RHS->getAttributes() && Type == RHS->getRawType(); } + unsigned getHashValue() const { return hash_combine(Name, File, Line, GetterName, SetterName, Attributes, Type); @@ -948,6 +1005,7 @@ template <> struct MDNodeKeyImpl<DIImportedEntity> { Entity == RHS->getRawEntity() && Line == RHS->getLine() && Name == RHS->getRawName(); } + unsigned getHashValue() const { return hash_combine(Tag, Scope, Entity, Line, Name); } @@ -969,6 +1027,7 @@ template <> struct MDNodeKeyImpl<DIMacro> { return MIType == RHS->getMacinfoType() && Line == RHS->getLine() && Name == RHS->getRawName() && Value == RHS->getRawValue(); } + unsigned getHashValue() const { return hash_combine(MIType, Line, Name, Value); } @@ -991,6 +1050,7 @@ template <> struct MDNodeKeyImpl<DIMacroFile> { return MIType == RHS->getMacinfoType() && Line == RHS->getLine() && File == RHS->getRawFile() && Elements == RHS->getRawElements(); } + unsigned getHashValue() const { return hash_combine(MIType, Line, File, Elements); } @@ -998,23 +1058,29 @@ template <> struct MDNodeKeyImpl<DIMacroFile> { /// \brief DenseMapInfo for MDNode subclasses. template <class NodeTy> struct MDNodeInfo { - typedef MDNodeKeyImpl<NodeTy> KeyTy; - typedef MDNodeSubsetEqualImpl<NodeTy> SubsetEqualTy; + using KeyTy = MDNodeKeyImpl<NodeTy>; + using SubsetEqualTy = MDNodeSubsetEqualImpl<NodeTy>; + static inline NodeTy *getEmptyKey() { return DenseMapInfo<NodeTy *>::getEmptyKey(); } + static inline NodeTy *getTombstoneKey() { return DenseMapInfo<NodeTy *>::getTombstoneKey(); } + static unsigned getHashValue(const KeyTy &Key) { return Key.getHashValue(); } + static unsigned getHashValue(const NodeTy *N) { return KeyTy(N).getHashValue(); } + static bool isEqual(const KeyTy &LHS, const NodeTy *RHS) { if (RHS == getEmptyKey() || RHS == getTombstoneKey()) return false; return SubsetEqualTy::isSubsetEqual(LHS, RHS) || LHS.isKeyOf(RHS); } + static bool isEqual(const NodeTy *LHS, const NodeTy *RHS) { if (LHS == RHS) return true; @@ -1024,7 +1090,7 @@ template <class NodeTy> struct MDNodeInfo { } }; -#define HANDLE_MDNODE_LEAF(CLASS) typedef MDNodeInfo<CLASS> CLASS##Info; +#define HANDLE_MDNODE_LEAF(CLASS) using CLASS##Info = MDNodeInfo<CLASS>; #include "llvm/IR/Metadata.def" /// \brief Map-like storage for metadata attachments. @@ -1097,24 +1163,24 @@ public: /// will be automatically deleted if this context is deleted. 
SmallPtrSet<Module*, 4> OwnedModules; - LLVMContext::InlineAsmDiagHandlerTy InlineAsmDiagHandler; - void *InlineAsmDiagContext; + LLVMContext::InlineAsmDiagHandlerTy InlineAsmDiagHandler = nullptr; + void *InlineAsmDiagContext = nullptr; - LLVMContext::DiagnosticHandlerTy DiagnosticHandler; - void *DiagnosticContext; - bool RespectDiagnosticFilters; - bool DiagnosticHotnessRequested; + LLVMContext::DiagnosticHandlerTy DiagnosticHandler = nullptr; + void *DiagnosticContext = nullptr; + bool RespectDiagnosticFilters = false; + bool DiagnosticHotnessRequested = false; std::unique_ptr<yaml::Output> DiagnosticsOutputFile; - LLVMContext::YieldCallbackTy YieldCallback; - void *YieldOpaqueHandle; + LLVMContext::YieldCallbackTy YieldCallback = nullptr; + void *YieldOpaqueHandle = nullptr; - typedef DenseMap<APInt, std::unique_ptr<ConstantInt>, DenseMapAPIntKeyInfo> - IntMapTy; + using IntMapTy = + DenseMap<APInt, std::unique_ptr<ConstantInt>, DenseMapAPIntKeyInfo>; IntMapTy IntConstants; - typedef DenseMap<APFloat, std::unique_ptr<ConstantFP>, DenseMapAPFloatKeyInfo> - FPMapTy; + using FPMapTy = + DenseMap<APFloat, std::unique_ptr<ConstantFP>, DenseMapAPFloatKeyInfo>; FPMapTy FPConstants; FoldingSet<AttributeImpl> AttrsSet; @@ -1142,13 +1208,13 @@ public: DenseMap<Type *, std::unique_ptr<ConstantAggregateZero>> CAZConstants; - typedef ConstantUniqueMap<ConstantArray> ArrayConstantsTy; + using ArrayConstantsTy = ConstantUniqueMap<ConstantArray>; ArrayConstantsTy ArrayConstants; - typedef ConstantUniqueMap<ConstantStruct> StructConstantsTy; + using StructConstantsTy = ConstantUniqueMap<ConstantStruct>; StructConstantsTy StructConstants; - typedef ConstantUniqueMap<ConstantVector> VectorConstantsTy; + using VectorConstantsTy = ConstantUniqueMap<ConstantVector>; VectorConstantsTy VectorConstants; DenseMap<PointerType *, std::unique_ptr<ConstantPointerNull>> CPNConstants; @@ -1163,8 +1229,8 @@ public: ConstantUniqueMap<InlineAsm> InlineAsms; - ConstantInt *TheTrueVal; - ConstantInt *TheFalseVal; + ConstantInt *TheTrueVal = nullptr; + ConstantInt *TheFalseVal = nullptr; std::unique_ptr<ConstantTokenNone> TheNoneToken; @@ -1172,7 +1238,6 @@ public: Type VoidTy, LabelTy, HalfTy, FloatTy, DoubleTy, MetadataTy, TokenTy; Type X86_FP80Ty, FP128Ty, PPC_FP128Ty, X86_MMXTy; IntegerType Int1Ty, Int8Ty, Int16Ty, Int32Ty, Int64Ty, Int128Ty; - /// TypeAllocator - All dynamically allocated types are allocated from this. /// They live forever until the context is torn down. @@ -1180,23 +1245,22 @@ public: DenseMap<unsigned, IntegerType*> IntegerTypes; - typedef DenseSet<FunctionType *, FunctionTypeKeyInfo> FunctionTypeSet; + using FunctionTypeSet = DenseSet<FunctionType *, FunctionTypeKeyInfo>; FunctionTypeSet FunctionTypes; - typedef DenseSet<StructType *, AnonStructTypeKeyInfo> StructTypeSet; + using StructTypeSet = DenseSet<StructType *, AnonStructTypeKeyInfo>; StructTypeSet AnonStructTypes; StringMap<StructType*> NamedStructTypes; - unsigned NamedStructTypesUniqueID; + unsigned NamedStructTypesUniqueID = 0; DenseMap<std::pair<Type *, uint64_t>, ArrayType*> ArrayTypes; DenseMap<std::pair<Type *, unsigned>, VectorType*> VectorTypes; DenseMap<Type*, PointerType*> PointerTypes; // Pointers in AddrSpace = 0 DenseMap<std::pair<Type*, unsigned>, PointerType*> ASPointerTypes; - /// ValueHandles - This map keeps track of all of the value handles that are /// watching a Value*. The Value::HasValueHandle bit is used to know /// whether or not a value has an entry in this map. 
- typedef DenseMap<Value*, ValueHandleBase*> ValueHandlesTy; + using ValueHandlesTy = DenseMap<Value *, ValueHandleBase *>; ValueHandlesTy ValueHandles; /// CustomMDKindNames - Map to hold the metadata string to ID mapping. @@ -1254,6 +1318,6 @@ public: OptBisect &getOptBisect(); }; -} +} // end namespace llvm -#endif +#endif // LLVM_LIB_IR_LLVMCONTEXTIMPL_H diff --git a/lib/IR/Metadata.cpp b/lib/IR/Metadata.cpp index 92e5798dcf214..ac02ff76c8436 100644 --- a/lib/IR/Metadata.cpp +++ b/lib/IR/Metadata.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#include "llvm/IR/Metadata.h" #include "LLVMContextImpl.h" #include "MetadataImpl.h" #include "SymbolTableListTraitsImpl.h" @@ -27,6 +26,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" #include "llvm/IR/Argument.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" @@ -39,6 +39,7 @@ #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/TrackingMDRef.h" #include "llvm/IR/Type.h" @@ -53,6 +54,7 @@ #include <cstdint> #include <iterator> #include <tuple> +#include <type_traits> #include <utility> #include <vector> @@ -233,7 +235,7 @@ void ReplaceableMetadataImpl::replaceAllUsesWith(Metadata *MD) { return; // Copy out uses since UseMap will get touched below. - typedef std::pair<void *, std::pair<OwnerTy, uint64_t>> UseTy; + using UseTy = std::pair<void *, std::pair<OwnerTy, uint64_t>>; SmallVector<UseTy, 8> Uses(UseMap.begin(), UseMap.end()); std::sort(Uses.begin(), Uses.end(), [](const UseTy &L, const UseTy &R) { return L.second.second < R.second.second; @@ -286,7 +288,7 @@ void ReplaceableMetadataImpl::resolveAllUses(bool ResolveUsers) { } // Copy out uses since UseMap could get touched below. 
- typedef std::pair<void *, std::pair<OwnerTy, uint64_t>> UseTy; + using UseTy = std::pair<void *, std::pair<OwnerTy, uint64_t>>; SmallVector<UseTy, 8> Uses(UseMap.begin(), UseMap.end()); std::sort(Uses.begin(), Uses.end(), [](const UseTy &L, const UseTy &R) { return L.second.second < R.second.second; @@ -758,8 +760,8 @@ static T *uniquifyImpl(T *N, DenseSet<T *, InfoT> &Store) { } template <class NodeTy> struct MDNode::HasCachedHash { - typedef char Yes[1]; - typedef char No[2]; + using Yes = char[1]; + using No = char[2]; template <class U, U Val> struct SFINAE {}; template <class U> @@ -1484,7 +1486,7 @@ void GlobalObject::addTypeMetadata(unsigned Offset, Metadata *TypeID) { addMetadata( LLVMContext::MD_type, *MDTuple::get(getContext(), - {ConstantAsMetadata::get(llvm::ConstantInt::get( + {ConstantAsMetadata::get(ConstantInt::get( Type::getInt64Ty(getContext()), Offset)), TypeID})); } diff --git a/lib/IR/Statepoint.cpp b/lib/IR/Statepoint.cpp index 8c3f0f208cc67..18efee2177c34 100644 --- a/lib/IR/Statepoint.cpp +++ b/lib/IR/Statepoint.cpp @@ -44,10 +44,22 @@ bool llvm::isGCRelocate(ImmutableCallSite CS) { return CS.getInstruction() && isa<GCRelocateInst>(CS.getInstruction()); } +bool llvm::isGCRelocate(const Value *V) { + if (auto CS = ImmutableCallSite(V)) + return isGCRelocate(CS); + return false; +} + bool llvm::isGCResult(ImmutableCallSite CS) { return CS.getInstruction() && isa<GCResultInst>(CS.getInstruction()); } +bool llvm::isGCResult(const Value *V) { + if (auto CS = ImmutableCallSite(V)) + return isGCResult(CS); + return false; +} + bool llvm::isStatepointDirectiveAttr(Attribute Attr) { return Attr.hasAttribute("statepoint-id") || Attr.hasAttribute("statepoint-num-patch-bytes"); diff --git a/lib/LTO/ThinLTOCodeGenerator.cpp b/lib/LTO/ThinLTOCodeGenerator.cpp index e4094d44867b2..1efd481b246c8 100644 --- a/lib/LTO/ThinLTOCodeGenerator.cpp +++ b/lib/LTO/ThinLTOCodeGenerator.cpp @@ -32,7 +32,6 @@ #include "llvm/IR/Verifier.h" #include "llvm/IRReader/IRReader.h" #include "llvm/LTO/LTO.h" -#include "llvm/Linker/Linker.h" #include "llvm/MC/SubtargetFeature.h" #include "llvm/Object/IRObjectFile.h" #include "llvm/Support/CachePruning.h" diff --git a/lib/MC/CMakeLists.txt b/lib/MC/CMakeLists.txt index a86fd383003da..562f136a3ce2b 100644 --- a/lib/MC/CMakeLists.txt +++ b/lib/MC/CMakeLists.txt @@ -45,13 +45,13 @@ add_llvm_library(LLVMMC MCWasmObjectTargetWriter.cpp MCWasmStreamer.cpp MCWin64EH.cpp + MCWinCOFFStreamer.cpp MCWinEH.cpp MachObjectWriter.cpp StringTableBuilder.cpp SubtargetFeature.cpp WasmObjectWriter.cpp WinCOFFObjectWriter.cpp - WinCOFFStreamer.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/MC diff --git a/lib/MC/ELFObjectWriter.cpp b/lib/MC/ELFObjectWriter.cpp index 4d139132df462..30f357826805a 100644 --- a/lib/MC/ELFObjectWriter.cpp +++ b/lib/MC/ELFObjectWriter.cpp @@ -633,9 +633,6 @@ void ELFObjectWriter::recordRelocation(MCAssembler &Asm, MCContext &Ctx = Asm.getContext(); if (const MCSymbolRefExpr *RefB = Target.getSymB()) { - assert(RefB->getKind() == MCSymbolRefExpr::VK_None && - "Should not have constructed this"); - // Let A, B and C being the components of Target and R be the location of // the fixup. If the fixup is not pcrel, we want to compute (A - B + C). // If it is pcrel, we want to compute (A - B + C - R). 
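
The (A - B + C) computation described in the comment above is easiest to check with concrete numbers. A minimal sketch of the arithmetic, with plain integers standing in for the MC layout queries (the names and values here are illustrative, not the LLVM API):

#include <cassert>
#include <cstdint>

// Fixup value for a target of the form (A - B + C): A and B are resolved
// symbol addresses, C is the constant addend, and R is the address of the
// fixup itself, subtracted only for PC-relative fixups.
static uint64_t fixupValue(uint64_t A, uint64_t B, uint64_t C, uint64_t R,
                           bool IsPCRel) {
  uint64_t Value = A - B + C;
  if (IsPCRel)
    Value -= R;
  return Value;
}

int main() {
  // A = 0x1000, B = 0x800, C = 8, fixup located at R = 0x700.
  assert(fixupValue(0x1000, 0x800, 8, 0x700, /*IsPCRel=*/false) == 0x808);
  assert(fixupValue(0x1000, 0x800, 8, 0x700, /*IsPCRel=*/true) == 0x108);
}
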
diff --git a/lib/MC/MCAssembler.cpp b/lib/MC/MCAssembler.cpp index 53cdaac3aa54b..92c5da0e9fef9 100644 --- a/lib/MC/MCAssembler.cpp +++ b/lib/MC/MCAssembler.cpp @@ -193,14 +193,23 @@ bool MCAssembler::evaluateFixup(const MCAsmLayout &Layout, // FIXME: This code has some duplication with recordRelocation. We should // probably merge the two into a single callback that tries to evaluate a // fixup and records a relocation if one is needed. + + // On error claim to have completely evaluated the fixup, to prevent any + // further processing from being done. const MCExpr *Expr = Fixup.getValue(); + MCContext &Ctx = getContext(); + Value = 0; if (!Expr->evaluateAsRelocatable(Target, &Layout, &Fixup)) { - getContext().reportError(Fixup.getLoc(), "expected relocatable expression"); - // Claim to have completely evaluated the fixup, to prevent any further - // processing from being done. - Value = 0; + Ctx.reportError(Fixup.getLoc(), "expected relocatable expression"); return true; } + if (const MCSymbolRefExpr *RefB = Target.getSymB()) { + if (RefB->getKind() != MCSymbolRefExpr::VK_None) { + Ctx.reportError(Fixup.getLoc(), + "unsupported subtraction of qualified symbol"); + return true; + } + } bool IsPCRel = Backend.getFixupKindInfo( Fixup.getKind()).Flags & MCFixupKindInfo::FKF_IsPCRel; @@ -254,8 +263,7 @@ bool MCAssembler::evaluateFixup(const MCAsmLayout &Layout, // Let the backend adjust the fixup value if necessary, including whether // we need a relocation. - Backend.processFixupValue(*this, Layout, Fixup, DF, Target, Value, - IsResolved); + Backend.processFixupValue(*this, Fixup, Target, IsResolved); return IsResolved; } @@ -639,9 +647,9 @@ void MCAssembler::writeSectionData(const MCSection *Sec, Layout.getSectionAddressSize(Sec)); } -std::pair<uint64_t, bool> MCAssembler::handleFixup(const MCAsmLayout &Layout, - MCFragment &F, - const MCFixup &Fixup) { +std::tuple<MCValue, uint64_t, bool> +MCAssembler::handleFixup(const MCAsmLayout &Layout, MCFragment &F, + const MCFixup &Fixup) { // Evaluate the fixup. 
MCValue Target; uint64_t FixedValue; @@ -654,7 +662,7 @@ std::pair<uint64_t, bool> MCAssembler::handleFixup(const MCAsmLayout &Layout, getWriter().recordRelocation(*this, Layout, &F, Fixup, Target, IsPCRel, FixedValue); } - return std::make_pair(FixedValue, IsPCRel); + return std::make_tuple(Target, FixedValue, IsPCRel); } void MCAssembler::layout(MCAsmLayout &Layout) { @@ -731,9 +739,11 @@ void MCAssembler::layout(MCAsmLayout &Layout) { for (const MCFixup &Fixup : Fixups) { uint64_t FixedValue; bool IsPCRel; - std::tie(FixedValue, IsPCRel) = handleFixup(Layout, Frag, Fixup); - getBackend().applyFixup(Fixup, Contents.data(), Contents.size(), - FixedValue, IsPCRel, getContext()); + MCValue Target; + std::tie(Target, FixedValue, IsPCRel) = + handleFixup(Layout, Frag, Fixup); + getBackend().applyFixup(*this, Fixup, Target, Contents, FixedValue, + IsPCRel); } } } diff --git a/lib/MC/MCFragment.cpp b/lib/MC/MCFragment.cpp index f3d0eb55eecd4..6e0249377a899 100644 --- a/lib/MC/MCFragment.cpp +++ b/lib/MC/MCFragment.cpp @@ -307,7 +307,7 @@ raw_ostream &operator<<(raw_ostream &OS, const MCFixup &AF) { } // end namespace llvm #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -LLVM_DUMP_METHOD void MCFragment::dump() { +LLVM_DUMP_METHOD void MCFragment::dump() const { raw_ostream &OS = errs(); OS << "<"; @@ -328,9 +328,9 @@ LLVM_DUMP_METHOD void MCFragment::dump() { case MCFragment::FT_Dummy: OS << "MCDummyFragment"; break; } - OS << "<MCFragment " << (void*) this << " LayoutOrder:" << LayoutOrder + OS << "<MCFragment " << (const void*) this << " LayoutOrder:" << LayoutOrder << " Offset:" << Offset - << " HasInstructions:" << hasInstructions() + << " HasInstructions:" << hasInstructions() << " BundlePadding:" << static_cast<unsigned>(getBundlePadding()) << ">"; switch (getKind()) { @@ -382,7 +382,8 @@ LLVM_DUMP_METHOD void MCFragment::dump() { } case MCFragment::FT_Fill: { const MCFillFragment *FF = cast<MCFillFragment>(this); - OS << " Value:" << FF->getValue() << " Size:" << FF->getSize(); + OS << " Value:" << static_cast<unsigned>(FF->getValue()) + << " Size:" << FF->getSize(); break; } case MCFragment::FT_Relaxable: { @@ -395,7 +396,8 @@ LLVM_DUMP_METHOD void MCFragment::dump() { case MCFragment::FT_Org: { const MCOrgFragment *OF = cast<MCOrgFragment>(this); OS << "\n "; - OS << " Offset:" << OF->getOffset() << " Value:" << OF->getValue(); + OS << " Offset:" << OF->getOffset() + << " Value:" << static_cast<unsigned>(OF->getValue()); break; } case MCFragment::FT_Dwarf: { @@ -445,19 +447,19 @@ LLVM_DUMP_METHOD void MCFragment::dump() { OS << ">"; } -LLVM_DUMP_METHOD void MCAssembler::dump() { +LLVM_DUMP_METHOD void MCAssembler::dump() const{ raw_ostream &OS = errs(); OS << "<MCAssembler\n"; OS << " Sections:[\n "; - for (iterator it = begin(), ie = end(); it != ie; ++it) { + for (const_iterator it = begin(), ie = end(); it != ie; ++it) { if (it != begin()) OS << ",\n "; it->dump(); } OS << "],\n"; OS << " Symbols:["; - for (symbol_iterator it = symbol_begin(), ie = symbol_end(); it != ie; ++it) { + for (const_symbol_iterator it = symbol_begin(), ie = symbol_end(); it != ie; ++it) { if (it != symbol_begin()) OS << ",\n "; OS << "("; it->dump(); diff --git a/lib/MC/MCSection.cpp b/lib/MC/MCSection.cpp index b961cb3968e86..d141dd6627c46 100644 --- a/lib/MC/MCSection.cpp +++ b/lib/MC/MCSection.cpp @@ -86,7 +86,7 @@ MCSection::getSubsectionInsertionPoint(unsigned Subsection) { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -LLVM_DUMP_METHOD void MCSection::dump() { +LLVM_DUMP_METHOD void 
MCSection::dump() const { raw_ostream &OS = errs(); OS << "<MCSection"; diff --git a/lib/MC/MCWasmStreamer.cpp b/lib/MC/MCWasmStreamer.cpp index 59b62b8d37c30..02fa070f0c57d 100644 --- a/lib/MC/MCWasmStreamer.cpp +++ b/lib/MC/MCWasmStreamer.cpp @@ -98,18 +98,30 @@ bool MCWasmStreamer::EmitSymbolAttribute(MCSymbol *S, MCSymbolAttr Attribute) { case MCSA_WeakDefAutoPrivate: case MCSA_Invalid: case MCSA_IndirectSymbol: + case MCSA_Hidden: return false; + + case MCSA_Weak: + case MCSA_WeakReference: + Symbol->setWeak(true); + Symbol->setExternal(true); + break; + case MCSA_Global: Symbol->setExternal(true); break; + case MCSA_ELF_TypeFunction: Symbol->setIsFunction(true); break; + case MCSA_ELF_TypeObject: Symbol->setIsFunction(false); break; + default: // unrecognized directive + llvm_unreachable("unexpected MCSymbolAttr"); return false; } diff --git a/lib/MC/WinCOFFStreamer.cpp b/lib/MC/MCWinCOFFStreamer.cpp index b4d0d7a87f1d3..bf341bb1f4511 100644 --- a/lib/MC/WinCOFFStreamer.cpp +++ b/lib/MC/MCWinCOFFStreamer.cpp @@ -1,4 +1,4 @@ -//===- llvm/MC/WinCOFFStreamer.cpp ----------------------------------------===// +//===- llvm/MC/MCWinCOFFStreamer.cpp --------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -190,7 +190,8 @@ void MCWinCOFFStreamer::EmitCOFFSafeSEH(MCSymbol const *Symbol) { << COFF::SCT_COMPLEX_TYPE_SHIFT); } -void MCWinCOFFStreamer::EmitCOFFSectionIndex(MCSymbol const *Symbol) { +void MCWinCOFFStreamer::EmitCOFFSectionIndex(const MCSymbol *Symbol) { + visitUsedSymbol(*Symbol); MCDataFragment *DF = getOrCreateDataFragment(); const MCSymbolRefExpr *SRE = MCSymbolRefExpr::create(Symbol, getContext()); MCFixup Fixup = MCFixup::create(DF->getContents().size(), SRE, FK_SecRel_2); @@ -198,8 +199,9 @@ void MCWinCOFFStreamer::EmitCOFFSectionIndex(MCSymbol const *Symbol) { DF->getContents().resize(DF->getContents().size() + 2, 0); } -void MCWinCOFFStreamer::EmitCOFFSecRel32(MCSymbol const *Symbol, +void MCWinCOFFStreamer::EmitCOFFSecRel32(const MCSymbol *Symbol, uint64_t Offset) { + visitUsedSymbol(*Symbol); MCDataFragment *DF = getOrCreateDataFragment(); // Create Symbol A for the relocation relative reference. const MCExpr *MCE = MCSymbolRefExpr::create(Symbol, getContext()); diff --git a/lib/MC/WasmObjectWriter.cpp b/lib/MC/WasmObjectWriter.cpp index db304c027f991..45534ba182123 100644 --- a/lib/MC/WasmObjectWriter.cpp +++ b/lib/MC/WasmObjectWriter.cpp @@ -129,15 +129,15 @@ struct WasmGlobal { // Information about a single relocation. struct WasmRelocationEntry { - uint64_t Offset; // Where is the relocation. - const MCSymbolWasm *Symbol; // The symbol to relocate with. - int64_t Addend; // A value to add to the symbol. - unsigned Type; // The type of the relocation. - MCSectionWasm *FixupSection;// The section the relocation is targeting. + uint64_t Offset; // Where is the relocation. + const MCSymbolWasm *Symbol; // The symbol to relocate with. + int64_t Addend; // A value to add to the symbol. + unsigned Type; // The type of the relocation. + const MCSectionWasm *FixupSection;// The section the relocation is targeting. 
WasmRelocationEntry(uint64_t Offset, const MCSymbolWasm *Symbol, int64_t Addend, unsigned Type, - MCSectionWasm *FixupSection) + const MCSectionWasm *FixupSection) : Offset(Offset), Symbol(Symbol), Addend(Addend), Type(Type), FixupSection(FixupSection) {} @@ -156,9 +156,19 @@ struct WasmRelocationEntry { Out << "Off=" << Offset << ", Sym=" << Symbol << ", Addend=" << Addend << ", Type=" << Type << ", FixupSection=" << FixupSection; } - void dump() const { print(errs()); } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + LLVM_DUMP_METHOD void dump() const { print(dbgs()); } +#endif }; +#if !defined(NDEBUG) +raw_ostream &operator<<(raw_ostream &OS, const WasmRelocationEntry &Rel) { + Rel.print(OS); + return OS; +} +#endif + class WasmObjectWriter : public MCObjectWriter { /// Helper struct for containing some precomputed information on symbols. struct WasmSymbolData { @@ -229,6 +239,11 @@ private: void writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) override; + void writeString(const StringRef Str) { + encodeULEB128(Str.size(), getStream()); + writeBytes(Str); + } + void writeValueType(wasm::ValType Ty) { encodeSLEB128(int32_t(Ty), getStream()); } @@ -250,7 +265,8 @@ private: uint32_t NumFuncImports); void writeCodeRelocSection(); void writeDataRelocSection(uint64_t DataSectionHeaderSize); - void writeLinkingMetaDataSection(bool HasStackPointer, + void writeLinkingMetaDataSection(ArrayRef<StringRef> WeakSymbols, + bool HasStackPointer, uint32_t StackPointerGlobal); void applyRelocations(ArrayRef<WasmRelocationEntry> Relocations, @@ -282,6 +298,7 @@ void WasmObjectWriter::startSection(SectionBookkeeping &Section, assert((Name != nullptr) == (SectionId == wasm::WASM_SEC_CUSTOM) && "Only custom sections can have names"); + DEBUG(dbgs() << "startSection " << SectionId << ": " << Name << "\n"); encodeULEB128(SectionId, getStream()); Section.SizeOffset = getStream().tell(); @@ -295,8 +312,8 @@ void WasmObjectWriter::startSection(SectionBookkeeping &Section, // Custom sections in wasm also have a string identifier. 
if (SectionId == wasm::WASM_SEC_CUSTOM) { - encodeULEB128(strlen(Name), getStream()); - writeBytes(Name); + assert(Name); + writeString(StringRef(Name)); } } @@ -307,6 +324,7 @@ void WasmObjectWriter::endSection(SectionBookkeeping &Section) { if (uint32_t(Size) != Size) report_fatal_error("section size does not fit in a uint32_t"); + DEBUG(dbgs() << "endSection size=" << Size << "\n"); unsigned Padding = PaddingFor5ByteULEB128(Size); // Write the final section size to the payload_len field, which follows @@ -332,7 +350,7 @@ void WasmObjectWriter::recordRelocation(MCAssembler &Asm, const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target, bool &IsPCRel, uint64_t &FixedValue) { - MCSectionWasm &FixupSection = cast<MCSectionWasm>(*Fragment->getParent()); + const auto &FixupSection = cast<MCSectionWasm>(*Fragment->getParent()); uint64_t C = Target.getConstant(); uint64_t FixupOffset = Layout.getFragmentOffset(Fragment) + Fixup.getOffset(); MCContext &Ctx = Asm.getContext(); @@ -406,9 +424,12 @@ void WasmObjectWriter::recordRelocation(MCAssembler &Asm, } assert(!IsPCRel); + assert(SymA); + unsigned Type = getRelocType(Target, Fixup); WasmRelocationEntry Rec(FixupOffset, SymA, C, Type, &FixupSection); + DEBUG(dbgs() << "WasmReloc: " << Rec << "\n"); if (FixupSection.hasInstructions()) CodeRelocations.push_back(Rec); @@ -453,11 +474,10 @@ static uint32_t ProvisionalValue(const WasmRelocationEntry &RelEntry) { const MCSymbolWasm *Sym = RelEntry.Symbol; // For undefined symbols, use a hopefully invalid value. - if (!Sym->isDefined(false)) + if (!Sym->isDefined(/*SetUsed=*/false)) return UINT32_MAX; - MCSectionWasm &Section = - cast<MCSectionWasm>(RelEntry.Symbol->getSection(false)); + const auto &Section = cast<MCSectionWasm>(RelEntry.Symbol->getSection(false)); uint64_t Address = Section.getSectionOffset() + RelEntry.Addend; // Ignore overflow. LLVM allows address arithmetic to silently wrap. 
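
The writeString() helper used above emits wasm's standard string framing: a ULEB128 byte length followed by the raw bytes, with no terminator. The patch itself calls llvm::encodeULEB128 from llvm/Support/LEB128.h; the following is a self-contained sketch of the same encoding, with helper names that are ours:

#include <cstdint>
#include <string>
#include <vector>

// ULEB128: 7 payload bits per byte, most-significant bit set on every byte
// except the last.
static void writeULEB128(uint64_t Value, std::vector<uint8_t> &Out) {
  do {
    uint8_t Byte = Value & 0x7f;
    Value >>= 7;
    if (Value != 0)
      Byte |= 0x80; // continuation bit
    Out.push_back(Byte);
  } while (Value != 0);
}

// Length-prefixed string, as writeString() emits for custom-section names.
static void writeWasmString(const std::string &Str, std::vector<uint8_t> &Out) {
  writeULEB128(Str.size(), Out);
  Out.insert(Out.end(), Str.begin(), Str.end());
}
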
@@ -471,16 +491,23 @@ uint32_t WasmObjectWriter::getRelocationIndexValue( switch (RelEntry.Type) { case wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB: case wasm::R_WEBASSEMBLY_TABLE_INDEX_I32: - assert(IndirectSymbolIndices.count(RelEntry.Symbol)); + if (!IndirectSymbolIndices.count(RelEntry.Symbol)) + report_fatal_error("symbol not found in table index space:" + + RelEntry.Symbol->getName()); return IndirectSymbolIndices[RelEntry.Symbol]; case wasm::R_WEBASSEMBLY_FUNCTION_INDEX_LEB: + case wasm::R_WEBASSEMBLY_GLOBAL_INDEX_LEB: case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_LEB: case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_SLEB: case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_I32: - assert(SymbolIndices.count(RelEntry.Symbol)); + if (!SymbolIndices.count(RelEntry.Symbol)) + report_fatal_error("symbol not found in function/global index space:" + + RelEntry.Symbol->getName()); return SymbolIndices[RelEntry.Symbol]; case wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB: - assert(TypeIndices.count(RelEntry.Symbol)); + if (!TypeIndices.count(RelEntry.Symbol)) + report_fatal_error("symbol not found in type index space:" + + RelEntry.Symbol->getName()); return TypeIndices[RelEntry.Symbol]; default: llvm_unreachable("invalid relocation type"); @@ -497,10 +524,12 @@ void WasmObjectWriter::applyRelocations( RelEntry.FixupSection->getSectionOffset() + RelEntry.Offset; + DEBUG(dbgs() << "applyRelocation: " << RelEntry << "\n"); switch (RelEntry.Type) { case wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB: case wasm::R_WEBASSEMBLY_FUNCTION_INDEX_LEB: - case wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB: { + case wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB: + case wasm::R_WEBASSEMBLY_GLOBAL_INDEX_LEB: { uint32_t Index = getRelocationIndexValue(RelEntry); WritePatchableSLEB(Stream, Index, Offset); break; @@ -526,7 +555,7 @@ void WasmObjectWriter::applyRelocations( break; } default: - llvm_unreachable("unsupported relocation type"); + llvm_unreachable("invalid relocation type"); } } } @@ -573,6 +602,7 @@ void WasmObjectWriter::writeTypeSection( endSection(Section); } + void WasmObjectWriter::writeImportSection( const SmallVector<WasmImport, 4> &Imports) { if (Imports.empty()) @@ -583,13 +613,8 @@ void WasmObjectWriter::writeImportSection( encodeULEB128(Imports.size(), getStream()); for (const WasmImport &Import : Imports) { - StringRef ModuleName = Import.ModuleName; - encodeULEB128(ModuleName.size(), getStream()); - writeBytes(ModuleName); - - StringRef FieldName = Import.FieldName; - encodeULEB128(FieldName.size(), getStream()); - writeBytes(FieldName); + writeString(Import.ModuleName); + writeString(Import.FieldName); encodeULEB128(Import.Kind, getStream()); @@ -697,11 +722,8 @@ void WasmObjectWriter::writeExportSection( encodeULEB128(Exports.size(), getStream()); for (const WasmExport &Export : Exports) { - encodeULEB128(Export.FieldName.size(), getStream()); - writeBytes(Export.FieldName); - + writeString(Export.FieldName); encodeSLEB128(Export.Kind, getStream()); - encodeULEB128(Export.Index, getStream()); + encodeULEB128(Export.Index, getStream()); } @@ -743,17 +765,7 @@ void WasmObjectWriter::writeCodeSection( encodeULEB128(Functions.size(), getStream()); for (const WasmFunction &Func : Functions) { - MCSectionWasm &FuncSection = - static_cast<MCSectionWasm &>(Func.Sym->getSection()); - - if (Func.Sym->isVariable()) - report_fatal_error("weak symbols not supported yet"); - - if (Func.Sym->getOffset() != 0) - report_fatal_error("function sections must contain one function each"); - - if (!Func.Sym->getSize()) - report_fatal_error("function symbols must have a size set with .size"); + auto &FuncSection = static_cast<MCSectionWasm
&>(Func.Sym->getSection()); int64_t Size = 0; if (!Func.Sym->getSize()->evaluateAsAbsolute(Size, Layout)) @@ -761,8 +773,7 @@ void WasmObjectWriter::writeCodeSection( encodeULEB128(Size, getStream()); - FuncSection.setSectionOffset(getStream().tell() - - Section.ContentsOffset); + FuncSection.setSectionOffset(getStream().tell() - Section.ContentsOffset); Asm.writeSectionData(&FuncSection, Layout); } @@ -815,15 +826,13 @@ void WasmObjectWriter::writeNameSection( for (const WasmImport &Import : Imports) { if (Import.Kind == wasm::WASM_EXTERNAL_FUNCTION) { encodeULEB128(Index, getStream()); - encodeULEB128(Import.FieldName.size(), getStream()); - writeBytes(Import.FieldName); + writeString(Import.FieldName); ++Index; } } for (const WasmFunction &Func : Functions) { encodeULEB128(Index, getStream()); - encodeULEB128(Func.Sym->getName().size(), getStream()); - writeBytes(Func.Sym->getName()); + writeString(Func.Sym->getName()); ++Index; } @@ -868,22 +877,37 @@ void WasmObjectWriter::writeDataRelocSection(uint64_t DataSectionHeaderSize) { } void WasmObjectWriter::writeLinkingMetaDataSection( - bool HasStackPointer, uint32_t StackPointerGlobal) { - if (!HasStackPointer) + ArrayRef<StringRef> WeakSymbols, bool HasStackPointer, + uint32_t StackPointerGlobal) { + if (!HasStackPointer && WeakSymbols.empty()) return; + SectionBookkeeping Section; startSection(Section, wasm::WASM_SEC_CUSTOM, "linking"); + SectionBookkeeping SubSection; - encodeULEB128(1, getStream()); // count + if (HasStackPointer) { + startSection(SubSection, wasm::WASM_STACK_POINTER); + encodeULEB128(StackPointerGlobal, getStream()); // id + endSection(SubSection); + } - encodeULEB128(wasm::WASM_STACK_POINTER, getStream()); // type - encodeULEB128(StackPointerGlobal, getStream()); // id + if (WeakSymbols.size() != 0) { + startSection(SubSection, wasm::WASM_SYMBOL_INFO); + encodeULEB128(WeakSymbols.size(), getStream()); + for (const StringRef Export: WeakSymbols) { + writeString(Export); + encodeULEB128(wasm::WASM_SYMBOL_FLAG_WEAK, getStream()); + } + endSection(SubSection); + } endSection(Section); } void WasmObjectWriter::writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) { + DEBUG(dbgs() << "WasmObjectWriter::writeObject\n"); MCContext &Ctx = Asm.getContext(); wasm::ValType PtrType = is64Bit() ? wasm::ValType::I64 : wasm::ValType::I32; @@ -894,6 +918,7 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm, SmallVector<WasmGlobal, 4> Globals; SmallVector<WasmImport, 4> Imports; SmallVector<WasmExport, 4> Exports; + SmallVector<StringRef, 4> WeakSymbols; SmallPtrSet<const MCSymbolWasm *, 4> IsAddressTaken; unsigned NumFuncImports = 0; unsigned NumGlobalImports = 0; @@ -902,7 +927,7 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm, bool HasStackPointer = false; // Populate the IsAddressTaken set. 
- for (WasmRelocationEntry RelEntry : CodeRelocations) { + for (const WasmRelocationEntry &RelEntry : CodeRelocations) { switch (RelEntry.Type) { case wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB: case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_SLEB: @@ -912,7 +937,7 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm, break; } } - for (WasmRelocationEntry RelEntry : DataRelocations) { + for (const WasmRelocationEntry &RelEntry : DataRelocations) { switch (RelEntry.Type) { case wasm::R_WEBASSEMBLY_TABLE_INDEX_I32: case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_I32: @@ -975,7 +1000,7 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm, const MCFragment &Frag = *GlobalVars->begin(); if (Frag.hasInstructions() || Frag.getKind() != MCFragment::FT_Data) report_fatal_error("only data supported in .global_variables"); - const MCDataFragment &DataFrag = cast<MCDataFragment>(Frag); + const auto &DataFrag = cast<MCDataFragment>(Frag); if (!DataFrag.getFixups().empty()) report_fatal_error("fixups not supported in .global_variables"); const SmallVectorImpl<char> &Contents = DataFrag.getContents(); @@ -1031,7 +1056,7 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm, const MCFragment &Frag = *StackPtr->begin(); if (Frag.hasInstructions() || Frag.getKind() != MCFragment::FT_Data) report_fatal_error("only data supported in .stack_pointer"); - const MCDataFragment &DataFrag = cast<MCDataFragment>(Frag); + const auto &DataFrag = cast<MCDataFragment>(Frag); if (!DataFrag.getFixups().empty()) report_fatal_error("fixups not supported in .stack_pointer"); const SmallVectorImpl<char> &Contents = DataFrag.getContents(); @@ -1041,14 +1066,30 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm, StackPointerGlobal = NumGlobalImports + *(const int32_t *)Contents.data(); } - // Handle defined symbols. + // Handle regular defined and undefined symbols. for (const MCSymbol &S : Asm.symbols()) { // Ignore unnamed temporary symbols, which aren't ever exported, imported, // or used in relocations. if (S.isTemporary() && S.getName().empty()) continue; + + // Variable references (weak references) are handled in a second pass + if (S.isVariable()) + continue; + const auto &WS = static_cast<const MCSymbolWasm &>(S); + DEBUG(dbgs() << "MCSymbol: '" << S << "'" + << " isDefined=" << S.isDefined() << " isExternal=" + << S.isExternal() << " isTemporary=" << S.isTemporary() + << " isFunction=" << WS.isFunction() + << " isWeak=" << WS.isWeak() + << " isVariable=" << WS.isVariable() << "\n"); + + if (WS.isWeak()) + WeakSymbols.push_back(WS.getName()); + unsigned Index; + if (WS.isFunction()) { // Prepare the function's type, if we haven't seen it yet. WasmFunctionType F; @@ -1062,6 +1103,14 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm, int32_t Type = Pair.first->second; if (WS.isDefined(/*SetUsed=*/false)) { + if (WS.getOffset() != 0) + report_fatal_error( + "function sections must contain one function each"); + + if (WS.getSize() == 0) + report_fatal_error( + "function symbols must have a size set with .size"); + // A definition. Take the next available index. Index = NumFuncImports + Functions.size(); @@ -1072,6 +1121,9 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm, SymbolIndices[&WS] = Index; Functions.push_back(Func); } else { + // Should be no such thing as weak undefined symbol + assert(!WS.isVariable()); + // An import; the index was assigned above. 
Index = SymbolIndices.find(&WS)->second; } @@ -1085,86 +1137,108 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm, if (WS.isTemporary() && !WS.getSize()) continue; - if (WS.isDefined(false)) { - if (WS.getOffset() != 0) - report_fatal_error("data sections must contain one variable each: " + - WS.getName()); - if (!WS.getSize()) - report_fatal_error("data symbols must have a size set with .size: " + - WS.getName()); - - int64_t Size = 0; - if (!WS.getSize()->evaluateAsAbsolute(Size, Layout)) - report_fatal_error(".size expression must be evaluatable"); - - MCSectionWasm &DataSection = - static_cast<MCSectionWasm &>(WS.getSection()); - - if (uint64_t(Size) != Layout.getSectionFileSize(&DataSection)) - report_fatal_error("data sections must contain at most one variable"); - - DataBytes.resize(alignTo(DataBytes.size(), DataSection.getAlignment())); - - DataSection.setSectionOffset(DataBytes.size()); - - for (MCSection::iterator I = DataSection.begin(), E = DataSection.end(); - I != E; ++I) { - const MCFragment &Frag = *I; - if (Frag.hasInstructions()) - report_fatal_error("only data supported in data sections"); - - if (const MCAlignFragment *Align = dyn_cast<MCAlignFragment>(&Frag)) { - if (Align->getValueSize() != 1) - report_fatal_error("only byte values supported for alignment"); - // If nops are requested, use zeros, as this is the data section. - uint8_t Value = Align->hasEmitNops() ? 0 : Align->getValue(); - uint64_t Size = std::min<uint64_t>(alignTo(DataBytes.size(), - Align->getAlignment()), - DataBytes.size() + - Align->getMaxBytesToEmit()); - DataBytes.resize(Size, Value); - } else if (const MCFillFragment *Fill = - dyn_cast<MCFillFragment>(&Frag)) { - DataBytes.insert(DataBytes.end(), Size, Fill->getValue()); - } else { - const MCDataFragment &DataFrag = cast<MCDataFragment>(Frag); - const SmallVectorImpl<char> &Contents = DataFrag.getContents(); - - DataBytes.insert(DataBytes.end(), Contents.begin(), Contents.end()); - } - } - - // For each global, prepare a corresponding wasm global holding its - // address. For externals these will also be named exports. - Index = NumGlobalImports + Globals.size(); + if (!WS.isDefined(/*SetUsed=*/false)) + continue; - WasmGlobal Global; - Global.Type = PtrType; - Global.IsMutable = false; - Global.HasImport = false; - Global.InitialValue = DataSection.getSectionOffset(); - Global.ImportIndex = 0; - SymbolIndices[&WS] = Index; - Globals.push_back(Global); + if (WS.getOffset() != 0) + report_fatal_error("data sections must contain one variable each: " + + WS.getName()); + if (!WS.getSize()) + report_fatal_error("data symbols must have a size set with .size: " + + WS.getName()); + + int64_t Size = 0; + if (!WS.getSize()->evaluateAsAbsolute(Size, Layout)) + report_fatal_error(".size expression must be evaluatable"); + + auto &DataSection = static_cast<MCSectionWasm &>(WS.getSection()); + + if (uint64_t(Size) != Layout.getSectionFileSize(&DataSection)) + report_fatal_error("data sections must contain at most one variable"); + + DataBytes.resize(alignTo(DataBytes.size(), DataSection.getAlignment())); + + DataSection.setSectionOffset(DataBytes.size()); + + for (const MCFragment &Frag : DataSection) { + if (Frag.hasInstructions()) + report_fatal_error("only data supported in data sections"); + + if (auto *Align = dyn_cast<MCAlignFragment>(&Frag)) { + if (Align->getValueSize() != 1) + report_fatal_error("only byte values supported for alignment"); + // If nops are requested, use zeros, as this is the data section. 
+ uint8_t Value = Align->hasEmitNops() ? 0 : Align->getValue(); + uint64_t Size = std::min<uint64_t>(alignTo(DataBytes.size(), + Align->getAlignment()), + DataBytes.size() + + Align->getMaxBytesToEmit()); + DataBytes.resize(Size, Value); + } else if (auto *Fill = dyn_cast<MCFillFragment>(&Frag)) { + DataBytes.insert(DataBytes.end(), Size, Fill->getValue()); + } else { + const auto &DataFrag = cast<MCDataFragment>(Frag); + const SmallVectorImpl<char> &Contents = DataFrag.getContents(); + + DataBytes.insert(DataBytes.end(), Contents.begin(), Contents.end()); + } } + + // For each global, prepare a corresponding wasm global holding its + // address. For externals these will also be named exports. + Index = NumGlobalImports + Globals.size(); + + WasmGlobal Global; + Global.Type = PtrType; + Global.IsMutable = false; + Global.HasImport = false; + Global.InitialValue = DataSection.getSectionOffset(); + Global.ImportIndex = 0; + SymbolIndices[&WS] = Index; + Globals.push_back(Global); } // If the symbol is visible outside this translation unit, export it. - if (WS.isExternal()) { - assert(WS.isDefined(false)); + if (WS.isExternal() && WS.isDefined(/*SetUsed=*/false)) { WasmExport Export; Export.FieldName = WS.getName(); Export.Index = Index; - if (WS.isFunction()) Export.Kind = wasm::WASM_EXTERNAL_FUNCTION; else Export.Kind = wasm::WASM_EXTERNAL_GLOBAL; - Exports.push_back(Export); } } + // Handle weak aliases + for (const MCSymbol &S : Asm.symbols()) { + if (!S.isVariable()) + continue; + assert(S.isExternal()); + assert(S.isDefined(/*SetUsed=*/false)); + + const auto &WS = static_cast<const MCSymbolWasm &>(S); + + // Find the target symbol of this weak alias + const MCExpr *Expr = WS.getVariableValue(); + auto *Inner = dyn_cast<MCSymbolRefExpr>(Expr); + const auto *ResolvedSym = cast<MCSymbolWasm>(&Inner->getSymbol()); + uint32_t Index = SymbolIndices.find(ResolvedSym)->second; + DEBUG(dbgs() << "Weak alias: '" << WS << "' -> '" << ResolvedSym << "' = " << Index << "\n"); + SymbolIndices[&WS] = Index; + + WasmExport Export; + Export.FieldName = WS.getName(); + Export.Index = Index; + if (WS.isFunction()) + Export.Kind = wasm::WASM_EXTERNAL_FUNCTION; + else + Export.Kind = wasm::WASM_EXTERNAL_GLOBAL; + WeakSymbols.push_back(Export.FieldName); + Exports.push_back(Export); + } + // Add types for indirect function calls. for (const WasmRelocationEntry &Fixup : CodeRelocations) { if (Fixup.Type != wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB) @@ -1198,7 +1272,7 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm, writeNameSection(Functions, Imports, NumFuncImports); writeCodeRelocSection(); writeDataRelocSection(DataSectionHeaderSize); - writeLinkingMetaDataSection(HasStackPointer, StackPointerGlobal); + writeLinkingMetaDataSection(WeakSymbols, HasStackPointer, StackPointerGlobal); // TODO: Translate the .comment section to the output. // TODO: Translate debug sections to the output. 
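
The "linking" metadata emitted above now nests subsections inside one custom section, each framed like a wasm section: a type code, a ULEB128 payload length (the real writer reserves a padded 5-byte length via startSection/endSection so it can patch it afterwards), then the payload. A rough sketch of the WASM_SYMBOL_INFO subsection built for the collected weak symbols; the numeric codes below are placeholders for the constants in llvm/BinaryFormat/Wasm.h:

#include <cstdint>
#include <string>
#include <vector>

// Placeholder codes; the real constants are wasm::WASM_SYMBOL_INFO and
// wasm::WASM_SYMBOL_FLAG_WEAK in llvm/BinaryFormat/Wasm.h.
constexpr uint8_t kSymbolInfoCode = 2;
constexpr uint32_t kSymbolFlagWeak = 1;

static void uleb(uint64_t V, std::vector<uint8_t> &Out) {
  do {
    uint8_t B = V & 0x7f;
    V >>= 7;
    if (V != 0)
      B |= 0x80;
    Out.push_back(B);
  } while (V != 0);
}

// One WASM_SYMBOL_INFO subsection: a count, then (name, flags) records,
// mirroring the loop over WeakSymbols in the patch above.
std::vector<uint8_t> emitSymbolInfo(const std::vector<std::string> &Weak) {
  std::vector<uint8_t> Payload;
  uleb(Weak.size(), Payload);
  for (const std::string &Name : Weak) {
    uleb(Name.size(), Payload);
    Payload.insert(Payload.end(), Name.begin(), Name.end());
    uleb(kSymbolFlagWeak, Payload);
  }
  std::vector<uint8_t> Out;
  Out.push_back(kSymbolInfoCode); // subsection type
  uleb(Payload.size(), Out);      // payload length (unpadded in this sketch)
  Out.insert(Out.end(), Payload.begin(), Payload.end());
  return Out;
}
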
diff --git a/lib/MC/WinCOFFObjectWriter.cpp b/lib/MC/WinCOFFObjectWriter.cpp index 53dee3e8b9f36..fc5234950391b 100644 --- a/lib/MC/WinCOFFObjectWriter.cpp +++ b/lib/MC/WinCOFFObjectWriter.cpp @@ -735,7 +735,6 @@ void WinCOFFObjectWriter::recordRelocation( COFFSection *Sec = SectionMap[MCSec]; const MCSymbolRefExpr *SymB = Target.getSymB(); - bool CrossSection = false; if (SymB) { const MCSymbol *B = &SymB->getSymbol(); @@ -747,28 +746,9 @@ void WinCOFFObjectWriter::recordRelocation( return; } - if (!A.getFragment()) { - Asm.getContext().reportError( - Fixup.getLoc(), - Twine("symbol '") + A.getName() + - "' can not be undefined in a subtraction expression"); - return; - } - - CrossSection = &A.getSection() != &B->getSection(); - // Offset of the symbol in the section int64_t OffsetOfB = Layout.getSymbolOffset(*B); - // In the case where we have SymbA and SymB, we just need to store the delta - // between the two symbols. Update FixedValue to account for the delta, and - // skip recording the relocation. - if (!CrossSection) { - int64_t OffsetOfA = Layout.getSymbolOffset(A); - FixedValue = (OffsetOfA - OffsetOfB) + Target.getConstant(); - return; - } - // Offset of the relocation in the section int64_t OffsetOfRelocation = Layout.getFragmentOffset(Fragment) + Fixup.getOffset(); @@ -784,7 +764,7 @@ void WinCOFFObjectWriter::recordRelocation( Reloc.Data.VirtualAddress = Layout.getFragmentOffset(Fragment); // Turn relocations for temporary symbols into section relocations. - if (A.isTemporary() || CrossSection) { + if (A.isTemporary()) { MCSection *TargetSection = &A.getSection(); assert( SectionMap.find(TargetSection) != SectionMap.end() && @@ -802,7 +782,7 @@ void WinCOFFObjectWriter::recordRelocation( Reloc.Data.VirtualAddress += Fixup.getOffset(); Reloc.Data.Type = TargetObjectWriter->getRelocType( - Target, Fixup, CrossSection, Asm.getBackend()); + Asm.getContext(), Target, Fixup, SymB, Asm.getBackend()); // FIXME: Can anyone explain what this does other than adjust for the size // of the offset? diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp index 579c8dde366a0..9a760d86e7e22 100644 --- a/lib/Object/COFFObjectFile.cpp +++ b/lib/Object/COFFObjectFile.cpp @@ -650,6 +650,23 @@ std::error_code COFFObjectFile::initDebugDirectoryPtr() { return std::error_code(); } +std::error_code COFFObjectFile::initLoadConfigPtr() { + // Get the RVA of the load config directory. Do nothing if it does not exist. + const data_directory *DataEntry; + if (getDataDirectory(COFF::LOAD_CONFIG_TABLE, DataEntry)) + return std::error_code(); + + // Do nothing if the RVA is NULL.
+ if (DataEntry->RelativeVirtualAddress == 0) + return std::error_code(); + uintptr_t IntPtr = 0; + if (std::error_code EC = getRvaPtr(DataEntry->RelativeVirtualAddress, IntPtr)) + return EC; + + LoadConfig = (const void *)IntPtr; + return std::error_code(); +} + COFFObjectFile::COFFObjectFile(MemoryBufferRef Object, std::error_code &EC) : ObjectFile(Binary::ID_COFF, Object), COFFHeader(nullptr), COFFBigObjHeader(nullptr), PE32Header(nullptr), PE32PlusHeader(nullptr), @@ -784,6 +801,9 @@ COFFObjectFile::COFFObjectFile(MemoryBufferRef Object, std::error_code &EC) if ((EC = initDebugDirectoryPtr())) return; + if ((EC = initLoadConfigPtr())) + return; + EC = std::error_code(); } diff --git a/lib/Object/IRSymtab.cpp b/lib/Object/IRSymtab.cpp index a6cd5dda12d3e..7bca032a7be1d 100644 --- a/lib/Object/IRSymtab.cpp +++ b/lib/Object/IRSymtab.cpp @@ -46,15 +46,15 @@ namespace { /// Stores the temporary state that is required to build an IR symbol table. struct Builder { SmallVector<char, 0> &Symtab; - SmallVector<char, 0> &Strtab; + StringTableBuilder &StrtabBuilder; + StringSaver Saver; - Builder(SmallVector<char, 0> &Symtab, SmallVector<char, 0> &Strtab) - : Symtab(Symtab), Strtab(Strtab) {} - - StringTableBuilder StrtabBuilder{StringTableBuilder::RAW}; - - BumpPtrAllocator Alloc; - StringSaver Saver{Alloc}; + // This ctor initializes a StringSaver using the passed in BumpPtrAllocator. + // The StringTableBuilder does not create a copy of any strings added to it, + // so this provides somewhere to store any strings that we create. + Builder(SmallVector<char, 0> &Symtab, StringTableBuilder &StrtabBuilder, + BumpPtrAllocator &Alloc) + : Symtab(Symtab), StrtabBuilder(StrtabBuilder), Saver(Alloc) {} DenseMap<const Comdat *, unsigned> ComdatMap; Mangler Mang; @@ -240,7 +240,7 @@ Error Builder::build(ArrayRef<Module *> IRMods) { return Err; COFFLinkerOptsOS.flush(); - setStr(Hdr.COFFLinkerOpts, COFFLinkerOpts); + setStr(Hdr.COFFLinkerOpts, Saver.save(COFFLinkerOpts)); // We are about to fill in the header's range fields, so reserve space for it // and copy it in afterwards. 
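
The Builder rework above threads a caller-owned StringTableBuilder and BumpPtrAllocator through because, as the new comment notes, StringTableBuilder does not copy the strings added to it: anything built on the fly has to be parked in storage that outlives the builder, which is what StringSaver provides. A minimal sketch of that idiom using the real support classes (variable names are ours):

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/StringSaver.h"
#include <string>

void pinTransientString() {
  llvm::BumpPtrAllocator Alloc;
  llvm::StringSaver Saver(Alloc);
  // A temporary that would dangle if handed to StringTableBuilder directly.
  std::string Transient = std::string("linker-opts-") + "example";
  // save() copies the bytes into Alloc and returns a StringRef that stays
  // valid as long as the allocator lives.
  llvm::StringRef Stable = Saver.save(Transient);
  (void)Stable; // safe to pass to StringTableBuilder::add() now
}
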
@@ -251,19 +251,15 @@ Error Builder::build(ArrayRef<Module *> IRMods) { writeRange(Hdr.Uncommons, Uncommons); *reinterpret_cast<storage::Header *>(Symtab.data()) = Hdr; - - raw_svector_ostream OS(Strtab); - StrtabBuilder.finalizeInOrder(); - StrtabBuilder.write(OS); - return Error::success(); } } // end anonymous namespace Error irsymtab::build(ArrayRef<Module *> Mods, SmallVector<char, 0> &Symtab, - SmallVector<char, 0> &Strtab) { - return Builder(Symtab, Strtab).build(Mods); + StringTableBuilder &StrtabBuilder, + BumpPtrAllocator &Alloc) { + return Builder(Symtab, StrtabBuilder, Alloc).build(Mods); } // Upgrade a vector of bitcode modules created by an old version of LLVM by @@ -285,9 +281,15 @@ static Expected<FileContents> upgrade(ArrayRef<BitcodeModule> BMs) { OwnedMods.push_back(std::move(*MOrErr)); } - if (Error E = build(Mods, FC.Symtab, FC.Strtab)) + StringTableBuilder StrtabBuilder(StringTableBuilder::RAW); + BumpPtrAllocator Alloc; + if (Error E = build(Mods, FC.Symtab, StrtabBuilder, Alloc)) return std::move(E); + StrtabBuilder.finalizeInOrder(); + FC.Strtab.resize(StrtabBuilder.getSize()); + StrtabBuilder.write((uint8_t *)FC.Strtab.data()); + FC.TheReader = {{FC.Symtab.data(), FC.Symtab.size()}, {FC.Strtab.data(), FC.Strtab.size()}}; return std::move(FC); diff --git a/lib/Object/MachOObjectFile.cpp b/lib/Object/MachOObjectFile.cpp index 7804bbe06f83e..2e4da9f15aa13 100644 --- a/lib/Object/MachOObjectFile.cpp +++ b/lib/Object/MachOObjectFile.cpp @@ -1951,13 +1951,29 @@ MachOObjectFile::section_rel_end(DataRefImpl Sec) const { return relocation_iterator(RelocationRef(Ret, this)); } +relocation_iterator MachOObjectFile::extrel_begin() const { + DataRefImpl Ret; + Ret.d.a = 0; // Would normally be a section index. + Ret.d.b = 0; // Index into the external relocations + return relocation_iterator(RelocationRef(Ret, this)); +} + +relocation_iterator MachOObjectFile::extrel_end() const { + MachO::dysymtab_command DysymtabLoadCmd = getDysymtabLoadCommand(); + DataRefImpl Ret; + Ret.d.a = 0; // Would normally be a section index. 
+ Ret.d.b = DysymtabLoadCmd.nextrel; // Index into the external relocations + return relocation_iterator(RelocationRef(Ret, this)); +} + void MachOObjectFile::moveRelocationNext(DataRefImpl &Rel) const { ++Rel.d.b; } uint64_t MachOObjectFile::getRelocationOffset(DataRefImpl Rel) const { - assert(getHeader().filetype == MachO::MH_OBJECT && - "Only implemented for MH_OBJECT"); + assert((getHeader().filetype == MachO::MH_OBJECT || + getHeader().filetype == MachO::MH_KEXT_BUNDLE) && + "Only implemented for MH_OBJECT && MH_KEXT_BUNDLE"); MachO::any_relocation_info RE = getRelocation(Rel); return getAnyRelocationAddress(RE); } @@ -4086,15 +4102,20 @@ MachOObjectFile::getThreadCommand(const LoadCommandInfo &L) const { MachO::any_relocation_info MachOObjectFile::getRelocation(DataRefImpl Rel) const { - DataRefImpl Sec; - Sec.d.a = Rel.d.a; uint32_t Offset; - if (is64Bit()) { - MachO::section_64 Sect = getSection64(Sec); - Offset = Sect.reloff; + if (getHeader().filetype == MachO::MH_OBJECT) { + DataRefImpl Sec; + Sec.d.a = Rel.d.a; + if (is64Bit()) { + MachO::section_64 Sect = getSection64(Sec); + Offset = Sect.reloff; + } else { + MachO::section Sect = getSection(Sec); + Offset = Sect.reloff; + } } else { - MachO::section Sect = getSection(Sec); - Offset = Sect.reloff; + MachO::dysymtab_command DysymtabLoadCmd = getDysymtabLoadCommand(); + Offset = DysymtabLoadCmd.extreloff; // Offset to the external relocations } auto P = reinterpret_cast<const MachO::any_relocation_info *>( diff --git a/lib/Object/WasmObjectFile.cpp b/lib/Object/WasmObjectFile.cpp index 2304098c1dc9f..d15860674aeb9 100644 --- a/lib/Object/WasmObjectFile.cpp +++ b/lib/Object/WasmObjectFile.cpp @@ -28,6 +28,8 @@ #include <cstring> #include <system_error> +#define DEBUG_TYPE "wasm-object" + using namespace llvm; using namespace object; @@ -256,6 +258,7 @@ Error WasmObjectFile::parseNameSection(const uint8_t *Ptr, const uint8_t *End) { while (Ptr < End) { uint8_t Type = readVarint7(Ptr); uint32_t Size = readVaruint32(Ptr); + const uint8_t *SubSectionEnd = Ptr + Size; switch (Type) { case wasm::WASM_NAMES_FUNCTION: { uint32_t Count = readVaruint32(Ptr); @@ -275,6 +278,9 @@ Error WasmObjectFile::parseNameSection(const uint8_t *Ptr, const uint8_t *End) { Ptr += Size; break; } + if (Ptr != SubSectionEnd) + return make_error<GenericBinaryError>("Name sub-section ended prematurely", + object_error::parse_failed); } if (Ptr != End) @@ -283,6 +289,50 @@ Error WasmObjectFile::parseNameSection(const uint8_t *Ptr, const uint8_t *End) { return Error::success(); } +Error WasmObjectFile::parseLinkingSection(const uint8_t *Ptr, + const uint8_t *End) { + while (Ptr < End) { + uint8_t Type = readVarint7(Ptr); + uint32_t Size = readVaruint32(Ptr); + const uint8_t *SubSectionEnd = Ptr + Size; + switch (Type) { + case wasm::WASM_SYMBOL_INFO: { + uint32_t Count = readVaruint32(Ptr); + while (Count--) { + StringRef Symbol = readString(Ptr); + DEBUG(dbgs() << "reading syminfo: " << Symbol << "\n"); + uint32_t Flags = readVaruint32(Ptr); + auto iter = SymbolMap.find(Symbol); + if (iter == SymbolMap.end()) { + return make_error<GenericBinaryError>( + "Invalid symbol name in linking section", + object_error::parse_failed); + } + uint32_t SymIndex = iter->second; + assert(SymIndex < Symbols.size()); + Symbols[SymIndex].Flags = Flags; + DEBUG(dbgs() << "Set symbol flags index:" + << SymIndex << " name:" + << Symbols[SymIndex].Name << " expected:" + << Symbol << " flags: " << Flags << "\n"); + } + break; + } + case wasm::WASM_STACK_POINTER: + default: + Ptr
+= Size; + break; + } + if (Ptr != SubSectionEnd) + return make_error<GenericBinaryError>( + "Linking sub-section ended prematurely", object_error::parse_failed); + } + if (Ptr != End) + return make_error<GenericBinaryError>("Linking section ended prematurely", + object_error::parse_failed); + return Error::success(); +} + WasmSection* WasmObjectFile::findCustomSectionByName(StringRef Name) { for (WasmSection& Section : Sections) { if (Section.Type == wasm::WASM_SEC_CUSTOM && Section.Name == Name) @@ -325,6 +375,7 @@ Error WasmObjectFile::parseRelocSection(StringRef Name, const uint8_t *Ptr, case wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB: case wasm::R_WEBASSEMBLY_TABLE_INDEX_I32: case wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB: + case wasm::R_WEBASSEMBLY_GLOBAL_INDEX_LEB: break; case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_LEB: case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_SLEB: @@ -332,7 +383,8 @@ Error WasmObjectFile::parseRelocSection(StringRef Name, const uint8_t *Ptr, Reloc.Addend = readVarint32(Ptr); break; default: - return make_error<GenericBinaryError>("Bad relocation type", + return make_error<GenericBinaryError>("Bad relocation type: " + + Twine(Reloc.Type), object_error::parse_failed); } Section->Relocations.push_back(Reloc); @@ -349,6 +401,9 @@ Error WasmObjectFile::parseCustomSection(WasmSection &Sec, if (Sec.Name == "name") { if (Error Err = parseNameSection(Ptr, End)) return Err; + } else if (Sec.Name == "linking") { + if (Error Err = parseLinkingSection(Ptr, End)) + return Err; } else if (Sec.Name.startswith("reloc.")) { if (Error Err = parseRelocSection(Sec.Name, Ptr, End)) return Err; @@ -400,14 +455,20 @@ Error WasmObjectFile::parseImportSection(const uint8_t *Ptr, const uint8_t *End) switch (Im.Kind) { case wasm::WASM_EXTERNAL_FUNCTION: Im.SigIndex = readVaruint32(Ptr); + SymbolMap.try_emplace(Im.Field, Symbols.size()); Symbols.emplace_back(Im.Field, WasmSymbol::SymbolType::FUNCTION_IMPORT, Sections.size(), i); + DEBUG(dbgs() << "Adding import: " << Symbols.back() + << " sym index:" << Symbols.size() << "\n"); break; case wasm::WASM_EXTERNAL_GLOBAL: Im.Global.Type = readVarint7(Ptr); Im.Global.Mutable = readVaruint1(Ptr); + SymbolMap.try_emplace(Im.Field, Symbols.size()); Symbols.emplace_back(Im.Field, WasmSymbol::SymbolType::GLOBAL_IMPORT, Sections.size(), i); + DEBUG(dbgs() << "Adding import: " << Symbols.back() + << " sym index:" << Symbols.size() << "\n"); break; case wasm::WASM_EXTERNAL_MEMORY: Im.Memory = readLimits(Ptr); @@ -496,15 +557,20 @@ Error WasmObjectFile::parseExportSection(const uint8_t *Ptr, const uint8_t *End) Ex.Name = readString(Ptr); Ex.Kind = readUint8(Ptr); Ex.Index = readVaruint32(Ptr); - Exports.push_back(Ex); switch (Ex.Kind) { case wasm::WASM_EXTERNAL_FUNCTION: + SymbolMap.try_emplace(Ex.Name, Symbols.size()); Symbols.emplace_back(Ex.Name, WasmSymbol::SymbolType::FUNCTION_EXPORT, Sections.size(), i); + DEBUG(dbgs() << "Adding export: " << Symbols.back() + << " sym index:" << Symbols.size() << "\n"); break; case wasm::WASM_EXTERNAL_GLOBAL: + SymbolMap.try_emplace(Ex.Name, Symbols.size()); Symbols.emplace_back(Ex.Name, WasmSymbol::SymbolType::GLOBAL_EXPORT, Sections.size(), i); + DEBUG(dbgs() << "Adding export: " << Symbols.back() + << " sym index:" << Symbols.size() << "\n"); break; case wasm::WASM_EXTERNAL_MEMORY: case wasm::WASM_EXTERNAL_TABLE: @@ -513,6 +579,7 @@ Error WasmObjectFile::parseExportSection(const uint8_t *Ptr, const uint8_t *End) return make_error<GenericBinaryError>( "Unexpected export kind", object_error::parse_failed); } + Exports.push_back(Ex); } if 
(Ptr != End) return make_error<GenericBinaryError>("Export section ended prematurely", @@ -620,6 +687,10 @@ uint32_t WasmObjectFile::getSymbolFlags(DataRefImpl Symb) const { uint32_t Result = SymbolRef::SF_None; const WasmSymbol &Sym = getWasmSymbol(Symb); + DEBUG(dbgs() << "getSymbolFlags: ptr=" << &Sym << " " << Sym << "\n"); + if (Sym.Flags & wasm::WASM_SYMBOL_FLAG_WEAK) + Result |= SymbolRef::SF_Weak; + switch (Sym.Type) { case WasmSymbol::SymbolType::FUNCTION_IMPORT: Result |= SymbolRef::SF_Undefined | SymbolRef::SF_Executable; @@ -629,6 +700,7 @@ uint32_t WasmObjectFile::getSymbolFlags(DataRefImpl Symb) const { break; case WasmSymbol::SymbolType::DEBUG_FUNCTION_NAME: Result |= SymbolRef::SF_Executable; + Result |= SymbolRef::SF_FormatSpecific; break; case WasmSymbol::SymbolType::GLOBAL_IMPORT: Result |= SymbolRef::SF_Undefined; @@ -662,8 +734,7 @@ const WasmSymbol &WasmObjectFile::getWasmSymbol(const SymbolRef &Symb) const { } Expected<StringRef> WasmObjectFile::getSymbolName(DataRefImpl Symb) const { - const WasmSymbol &Sym = getWasmSymbol(Symb); - return Sym.Name; + return getWasmSymbol(Symb).Name; } Expected<uint64_t> WasmObjectFile::getSymbolAddress(DataRefImpl Symb) const { @@ -671,8 +742,18 @@ Expected<uint64_t> WasmObjectFile::getSymbolAddress(DataRefImpl Symb) const { } uint64_t WasmObjectFile::getSymbolValueImpl(DataRefImpl Symb) const { - const WasmSymbol &Sym = getWasmSymbol(Symb); - return Sym.ElementIndex; + const WasmSymbol& Sym = getWasmSymbol(Symb); + switch (Sym.Type) { + case WasmSymbol::SymbolType::FUNCTION_IMPORT: + case WasmSymbol::SymbolType::GLOBAL_IMPORT: + return 0; + case WasmSymbol::SymbolType::FUNCTION_EXPORT: + case WasmSymbol::SymbolType::GLOBAL_EXPORT: + return Exports[Sym.ElementIndex].Index; + case WasmSymbol::SymbolType::DEBUG_FUNCTION_NAME: + return Sym.ElementIndex; + } + llvm_unreachable("invalid symbol type"); } uint32_t WasmObjectFile::getSymbolAlignment(DataRefImpl Symb) const { diff --git a/lib/Object/WindowsResource.cpp b/lib/Object/WindowsResource.cpp index 3f6080d48f9d1..ff9b9ca35eb5b 100644 --- a/lib/Object/WindowsResource.cpp +++ b/lib/Object/WindowsResource.cpp @@ -12,7 +12,6 @@ //===----------------------------------------------------------------------===// #include "llvm/Object/WindowsResource.h" -#include "llvm/BinaryFormat/COFF.h" #include "llvm/Object/COFF.h" #include "llvm/Support/FileOutputBuffer.h" #include "llvm/Support/MathExtras.h" @@ -21,6 +20,9 @@ #include <sstream> #include <system_error> +using namespace llvm; +using namespace object; + namespace llvm { namespace object { @@ -34,23 +36,19 @@ const uint32_t MIN_HEADER_SIZE = 7 * sizeof(uint32_t) + 2 * sizeof(uint16_t); // 8-byte because it makes everyone happy. 
const uint32_t SECTION_ALIGNMENT = sizeof(uint64_t); -static const size_t ResourceMagicSize = 16; - -static const size_t NullEntrySize = 16; - uint32_t WindowsResourceParser::TreeNode::StringCount = 0; uint32_t WindowsResourceParser::TreeNode::DataCount = 0; WindowsResource::WindowsResource(MemoryBufferRef Source) : Binary(Binary::ID_WinRes, Source) { - size_t LeadingSize = ResourceMagicSize + NullEntrySize; + size_t LeadingSize = WIN_RES_MAGIC_SIZE + WIN_RES_NULL_ENTRY_SIZE; BBS = BinaryByteStream(Data.getBuffer().drop_front(LeadingSize), support::little); } Expected<std::unique_ptr<WindowsResource>> WindowsResource::createWindowsResource(MemoryBufferRef Source) { - if (Source.getBufferSize() < ResourceMagicSize + NullEntrySize) + if (Source.getBufferSize() < WIN_RES_MAGIC_SIZE + WIN_RES_NULL_ENTRY_SIZE) return make_error<GenericBinaryError>( "File too small to be a resource file", object_error::invalid_file_type); @@ -103,12 +101,10 @@ static Error readStringOrId(BinaryStreamReader &Reader, uint16_t &ID, } Error ResourceEntryRef::loadNext() { - uint32_t DataSize; - RETURN_IF_ERROR(Reader.readInteger(DataSize)); - uint32_t HeaderSize; - RETURN_IF_ERROR(Reader.readInteger(HeaderSize)); + const WinResHeaderPrefix *Prefix; + RETURN_IF_ERROR(Reader.readObject(Prefix)); - if (HeaderSize < MIN_HEADER_SIZE) + if (Prefix->HeaderSize < MIN_HEADER_SIZE) return make_error<GenericBinaryError>("Header size is too small.", object_error::parse_failed); @@ -116,13 +112,13 @@ Error ResourceEntryRef::loadNext() { RETURN_IF_ERROR(readStringOrId(Reader, NameID, Name, IsStringName)); - RETURN_IF_ERROR(Reader.padToAlignment(sizeof(uint32_t))); + RETURN_IF_ERROR(Reader.padToAlignment(WIN_RES_HEADER_ALIGNMENT)); RETURN_IF_ERROR(Reader.readObject(Suffix)); - RETURN_IF_ERROR(Reader.readArray(Data, DataSize)); + RETURN_IF_ERROR(Reader.readArray(Data, Prefix->DataSize)); - RETURN_IF_ERROR(Reader.padToAlignment(sizeof(uint32_t))); + RETURN_IF_ERROR(Reader.padToAlignment(WIN_RES_DATA_ALIGNMENT)); return Error::success(); } @@ -246,14 +242,14 @@ WindowsResourceParser::TreeNode::addChild(ArrayRef<UTF16> NameRef, std::string NameString; ArrayRef<UTF16> CorrectedName; std::vector<UTF16> EndianCorrectedName; - if (llvm::sys::IsBigEndianHost) { + if (sys::IsBigEndianHost) { EndianCorrectedName.resize(NameRef.size() + 1); std::copy(NameRef.begin(), NameRef.end(), EndianCorrectedName.begin() + 1); EndianCorrectedName[0] = UNI_UTF16_BYTE_ORDER_MARK_SWAPPED; CorrectedName = makeArrayRef(EndianCorrectedName); } else CorrectedName = NameRef; - llvm::convertUTF16ToUTF8String(CorrectedName, NameString); + convertUTF16ToUTF8String(CorrectedName, NameString); auto Child = StringChildren.find(NameString); if (Child == StringChildren.end()) { @@ -282,17 +278,17 @@ void WindowsResourceParser::TreeNode::print(ScopedPrinter &Writer, // the directory strings or the relocations of the .rsrc section. uint32_t WindowsResourceParser::TreeNode::getTreeSize() const { uint32_t Size = (IDChildren.size() + StringChildren.size()) * - sizeof(llvm::object::coff_resource_dir_entry); + sizeof(coff_resource_dir_entry); // Reached a node pointing to a data entry. if (IsDataNode) { - Size += sizeof(llvm::object::coff_resource_data_entry); + Size += sizeof(coff_resource_data_entry); return Size; } // If the node does not point to data, it must have a directory table pointing // to other nodes. 
- Size += sizeof(llvm::object::coff_resource_dir_table); + Size += sizeof(coff_resource_dir_table); for (auto const &Child : StringChildren) { Size += Child.second->getTreeSize(); @@ -305,9 +301,9 @@ uint32_t WindowsResourceParser::TreeNode::getTreeSize() const { class WindowsResourceCOFFWriter { public: - WindowsResourceCOFFWriter(StringRef OutputFile, Machine MachineType, + WindowsResourceCOFFWriter(COFF::MachineTypes MachineType, const WindowsResourceParser &Parser, Error &E); - Error write(); + std::unique_ptr<MemoryBuffer> write(); private: void performFileLayout(); @@ -323,10 +319,10 @@ private: void writeDirectoryTree(); void writeDirectoryStringTable(); void writeFirstSectionRelocations(); - std::unique_ptr<FileOutputBuffer> Buffer; - uint8_t *BufferStart; + std::unique_ptr<MemoryBuffer> OutputBuffer; + char *BufferStart; uint64_t CurrentOffset = 0; - Machine MachineType; + COFF::MachineTypes MachineType; const WindowsResourceParser::TreeNode &Resources; const ArrayRef<std::vector<uint8_t>> Data; uint64_t FileSize; @@ -343,28 +339,21 @@ private: }; WindowsResourceCOFFWriter::WindowsResourceCOFFWriter( - StringRef OutputFile, Machine MachineType, - const WindowsResourceParser &Parser, Error &E) + COFF::MachineTypes MachineType, const WindowsResourceParser &Parser, + Error &E) : MachineType(MachineType), Resources(Parser.getTree()), Data(Parser.getData()), StringTable(Parser.getStringTable()) { performFileLayout(); - ErrorOr<std::unique_ptr<FileOutputBuffer>> BufferOrErr = - FileOutputBuffer::create(OutputFile, FileSize); - if (!BufferOrErr) { - E = errorCodeToError(BufferOrErr.getError()); - return; - } - - Buffer = std::move(*BufferOrErr); + OutputBuffer = MemoryBuffer::getNewMemBuffer(FileSize); } void WindowsResourceCOFFWriter::performFileLayout() { // Add size of COFF header. - FileSize = llvm::COFF::Header16Size; + FileSize = COFF::Header16Size; // one .rsrc section header for directory tree, another for resource data. - FileSize += 2 * llvm::COFF::SectionSize; + FileSize += 2 * COFF::SectionSize; performSectionOneLayout(); performSectionTwoLayout(); @@ -372,9 +361,9 @@ void WindowsResourceCOFFWriter::performFileLayout() { // We have reached the address of the symbol table. SymbolTableOffset = FileSize; - FileSize += llvm::COFF::Symbol16Size; // size of the @feat.00 symbol. - FileSize += 4 * llvm::COFF::Symbol16Size; // symbol + aux for each section. - FileSize += Data.size() * llvm::COFF::Symbol16Size; // 1 symbol per resource. + FileSize += COFF::Symbol16Size; // size of the @feat.00 symbol. + FileSize += 4 * COFF::Symbol16Size; // symbol + aux for each section. + FileSize += Data.size() * COFF::Symbol16Size; // 1 symbol per resource. FileSize += 4; // four null bytes for the string table. } @@ -395,8 +384,8 @@ void WindowsResourceCOFFWriter::performSectionOneLayout() { // account for the relocations of section one. SectionOneRelocations = FileSize + SectionOneSize; FileSize += SectionOneSize; - FileSize += Data.size() * - llvm::COFF::RelocationSize; // one relocation for each resource. + FileSize += + Data.size() * COFF::RelocationSize; // one relocation for each resource. 
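// Worked example (hypothetical numbers, assuming the standard 10-byte COFF
// relocation record): with three resources the line above adds 3 * 10 = 30
// bytes, and the alignTo() below then rounds the running total up to the
// next multiple of SECTION_ALIGNMENT (8 bytes).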
FileSize = alignTo(FileSize, SECTION_ALIGNMENT); } @@ -407,7 +396,7 @@ void WindowsResourceCOFFWriter::performSectionTwoLayout() { SectionTwoSize = 0; for (auto const &Entry : Data) { DataOffsets.push_back(SectionTwoSize); - SectionTwoSize += llvm::alignTo(Entry.size(), sizeof(uint64_t)); + SectionTwoSize += alignTo(Entry.size(), sizeof(uint64_t)); } FileSize += SectionTwoSize; FileSize = alignTo(FileSize, SECTION_ALIGNMENT); @@ -420,8 +409,8 @@ static std::time_t getTime() { return Now; } -Error WindowsResourceCOFFWriter::write() { - BufferStart = Buffer->getBufferStart(); +std::unique_ptr<MemoryBuffer> WindowsResourceCOFFWriter::write() { + BufferStart = const_cast<char *>(OutputBuffer->getBufferStart()); writeCOFFHeader(); writeFirstSectionHeader(); @@ -431,29 +420,24 @@ Error WindowsResourceCOFFWriter::write() { writeSymbolTable(); writeStringTable(); - if (auto EC = Buffer->commit()) { - return errorCodeToError(EC); - } - - return Error::success(); + return std::move(OutputBuffer); } void WindowsResourceCOFFWriter::writeCOFFHeader() { // Write the COFF header. - auto *Header = - reinterpret_cast<llvm::object::coff_file_header *>(BufferStart); + auto *Header = reinterpret_cast<coff_file_header *>(BufferStart); switch (MachineType) { - case Machine::ARM: - Header->Machine = llvm::COFF::IMAGE_FILE_MACHINE_ARMNT; + case COFF::IMAGE_FILE_MACHINE_ARMNT: + Header->Machine = COFF::IMAGE_FILE_MACHINE_ARMNT; break; - case Machine::X64: - Header->Machine = llvm::COFF::IMAGE_FILE_MACHINE_AMD64; + case COFF::IMAGE_FILE_MACHINE_AMD64: + Header->Machine = COFF::IMAGE_FILE_MACHINE_AMD64; break; - case Machine::X86: - Header->Machine = llvm::COFF::IMAGE_FILE_MACHINE_I386; + case COFF::IMAGE_FILE_MACHINE_I386: + Header->Machine = COFF::IMAGE_FILE_MACHINE_I386; break; default: - Header->Machine = llvm::COFF::IMAGE_FILE_MACHINE_UNKNOWN; + Header->Machine = COFF::IMAGE_FILE_MACHINE_UNKNOWN; } Header->NumberOfSections = 2; Header->TimeDateStamp = getTime(); @@ -461,15 +445,15 @@ void WindowsResourceCOFFWriter::writeCOFFHeader() { // One symbol for every resource plus 2 for each section and @feat.00 Header->NumberOfSymbols = Data.size() + 5; Header->SizeOfOptionalHeader = 0; - Header->Characteristics = llvm::COFF::IMAGE_FILE_32BIT_MACHINE; + Header->Characteristics = COFF::IMAGE_FILE_32BIT_MACHINE; } void WindowsResourceCOFFWriter::writeFirstSectionHeader() { // Write the first section header. 
- CurrentOffset += sizeof(llvm::object::coff_file_header); - auto *SectionOneHeader = reinterpret_cast<llvm::object::coff_section *>( - BufferStart + CurrentOffset); - strncpy(SectionOneHeader->Name, ".rsrc$01", (size_t)llvm::COFF::NameSize); + CurrentOffset += sizeof(coff_file_header); + auto *SectionOneHeader = + reinterpret_cast<coff_section *>(BufferStart + CurrentOffset); + strncpy(SectionOneHeader->Name, ".rsrc$01", (size_t)COFF::NameSize); SectionOneHeader->VirtualSize = 0; SectionOneHeader->VirtualAddress = 0; SectionOneHeader->SizeOfRawData = SectionOneSize; @@ -478,19 +462,16 @@ void WindowsResourceCOFFWriter::writeFirstSectionHeader() { SectionOneHeader->PointerToLinenumbers = 0; SectionOneHeader->NumberOfRelocations = Data.size(); SectionOneHeader->NumberOfLinenumbers = 0; - SectionOneHeader->Characteristics = llvm::COFF::IMAGE_SCN_ALIGN_1BYTES; - SectionOneHeader->Characteristics += - llvm::COFF::IMAGE_SCN_CNT_INITIALIZED_DATA; - SectionOneHeader->Characteristics += llvm::COFF::IMAGE_SCN_MEM_DISCARDABLE; - SectionOneHeader->Characteristics += llvm::COFF::IMAGE_SCN_MEM_READ; + SectionOneHeader->Characteristics += COFF::IMAGE_SCN_CNT_INITIALIZED_DATA; + SectionOneHeader->Characteristics += COFF::IMAGE_SCN_MEM_READ; } void WindowsResourceCOFFWriter::writeSecondSectionHeader() { // Write the second section header. - CurrentOffset += sizeof(llvm::object::coff_section); - auto *SectionTwoHeader = reinterpret_cast<llvm::object::coff_section *>( - BufferStart + CurrentOffset); - strncpy(SectionTwoHeader->Name, ".rsrc$02", (size_t)llvm::COFF::NameSize); + CurrentOffset += sizeof(coff_section); + auto *SectionTwoHeader = + reinterpret_cast<coff_section *>(BufferStart + CurrentOffset); + strncpy(SectionTwoHeader->Name, ".rsrc$02", (size_t)COFF::NameSize); SectionTwoHeader->VirtualSize = 0; SectionTwoHeader->VirtualAddress = 0; SectionTwoHeader->SizeOfRawData = SectionTwoSize; @@ -499,14 +480,13 @@ void WindowsResourceCOFFWriter::writeSecondSectionHeader() { SectionTwoHeader->PointerToLinenumbers = 0; SectionTwoHeader->NumberOfRelocations = 0; SectionTwoHeader->NumberOfLinenumbers = 0; - SectionTwoHeader->Characteristics = - llvm::COFF::IMAGE_SCN_CNT_INITIALIZED_DATA; - SectionTwoHeader->Characteristics += llvm::COFF::IMAGE_SCN_MEM_READ; + SectionTwoHeader->Characteristics = COFF::IMAGE_SCN_CNT_INITIALIZED_DATA; + SectionTwoHeader->Characteristics += COFF::IMAGE_SCN_MEM_READ; } void WindowsResourceCOFFWriter::writeFirstSection() { // Write section one. - CurrentOffset += sizeof(llvm::object::coff_section); + CurrentOffset += sizeof(coff_section); writeDirectoryTree(); writeDirectoryStringTable(); @@ -529,70 +509,65 @@ void WindowsResourceCOFFWriter::writeSecondSection() { void WindowsResourceCOFFWriter::writeSymbolTable() { // Now write the symbol table. // First, the feat symbol. 
- auto *Symbol = reinterpret_cast<llvm::object::coff_symbol16 *>(BufferStart + - CurrentOffset); - strncpy(Symbol->Name.ShortName, "@feat.00", (size_t)llvm::COFF::NameSize); + auto *Symbol = reinterpret_cast<coff_symbol16 *>(BufferStart + CurrentOffset); + strncpy(Symbol->Name.ShortName, "@feat.00", (size_t)COFF::NameSize); Symbol->Value = 0x11; Symbol->SectionNumber = 0xffff; - Symbol->Type = llvm::COFF::IMAGE_SYM_DTYPE_NULL; - Symbol->StorageClass = llvm::COFF::IMAGE_SYM_CLASS_STATIC; + Symbol->Type = COFF::IMAGE_SYM_DTYPE_NULL; + Symbol->StorageClass = COFF::IMAGE_SYM_CLASS_STATIC; Symbol->NumberOfAuxSymbols = 0; - CurrentOffset += sizeof(llvm::object::coff_symbol16); + CurrentOffset += sizeof(coff_symbol16); // Now write the .rsrc1 symbol + aux. - Symbol = reinterpret_cast<llvm::object::coff_symbol16 *>(BufferStart + - CurrentOffset); - strncpy(Symbol->Name.ShortName, ".rsrc$01", (size_t)llvm::COFF::NameSize); + Symbol = reinterpret_cast<coff_symbol16 *>(BufferStart + CurrentOffset); + strncpy(Symbol->Name.ShortName, ".rsrc$01", (size_t)COFF::NameSize); Symbol->Value = 0; Symbol->SectionNumber = 1; - Symbol->Type = llvm::COFF::IMAGE_SYM_DTYPE_NULL; - Symbol->StorageClass = llvm::COFF::IMAGE_SYM_CLASS_STATIC; + Symbol->Type = COFF::IMAGE_SYM_DTYPE_NULL; + Symbol->StorageClass = COFF::IMAGE_SYM_CLASS_STATIC; Symbol->NumberOfAuxSymbols = 1; - CurrentOffset += sizeof(llvm::object::coff_symbol16); - auto *Aux = reinterpret_cast<llvm::object::coff_aux_section_definition *>( - BufferStart + CurrentOffset); + CurrentOffset += sizeof(coff_symbol16); + auto *Aux = reinterpret_cast<coff_aux_section_definition *>(BufferStart + + CurrentOffset); Aux->Length = SectionOneSize; Aux->NumberOfRelocations = Data.size(); Aux->NumberOfLinenumbers = 0; Aux->CheckSum = 0; Aux->NumberLowPart = 0; Aux->Selection = 0; - CurrentOffset += sizeof(llvm::object::coff_aux_section_definition); + CurrentOffset += sizeof(coff_aux_section_definition); // Now write the .rsrc2 symbol + aux. - Symbol = reinterpret_cast<llvm::object::coff_symbol16 *>(BufferStart + - CurrentOffset); - strncpy(Symbol->Name.ShortName, ".rsrc$02", (size_t)llvm::COFF::NameSize); + Symbol = reinterpret_cast<coff_symbol16 *>(BufferStart + CurrentOffset); + strncpy(Symbol->Name.ShortName, ".rsrc$02", (size_t)COFF::NameSize); Symbol->Value = 0; Symbol->SectionNumber = 2; - Symbol->Type = llvm::COFF::IMAGE_SYM_DTYPE_NULL; - Symbol->StorageClass = llvm::COFF::IMAGE_SYM_CLASS_STATIC; + Symbol->Type = COFF::IMAGE_SYM_DTYPE_NULL; + Symbol->StorageClass = COFF::IMAGE_SYM_CLASS_STATIC; Symbol->NumberOfAuxSymbols = 1; - CurrentOffset += sizeof(llvm::object::coff_symbol16); - Aux = reinterpret_cast<llvm::object::coff_aux_section_definition *>( - BufferStart + CurrentOffset); + CurrentOffset += sizeof(coff_symbol16); + Aux = reinterpret_cast<coff_aux_section_definition *>(BufferStart + + CurrentOffset); Aux->Length = SectionTwoSize; Aux->NumberOfRelocations = 0; Aux->NumberOfLinenumbers = 0; Aux->CheckSum = 0; Aux->NumberLowPart = 0; Aux->Selection = 0; - CurrentOffset += sizeof(llvm::object::coff_aux_section_definition); + CurrentOffset += sizeof(coff_aux_section_definition); // Now write a symbol for each relocation. 
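// Each resource data entry gets a synthetic symbol named after its offset in
// section two: a resource at offset 0x20 becomes "$R000020". The section-one
// relocations written later refer to these symbols by table index.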
for (unsigned i = 0; i < Data.size(); i++) { char RelocationName[9]; sprintf(RelocationName, "$R%06X", DataOffsets[i]); - Symbol = reinterpret_cast<llvm::object::coff_symbol16 *>(BufferStart + - CurrentOffset); - strncpy(Symbol->Name.ShortName, RelocationName, - (size_t)llvm::COFF::NameSize); + Symbol = reinterpret_cast<coff_symbol16 *>(BufferStart + CurrentOffset); + strncpy(Symbol->Name.ShortName, RelocationName, (size_t)COFF::NameSize); Symbol->Value = DataOffsets[i]; Symbol->SectionNumber = 1; - Symbol->Type = llvm::COFF::IMAGE_SYM_DTYPE_NULL; - Symbol->StorageClass = llvm::COFF::IMAGE_SYM_CLASS_STATIC; + Symbol->Type = COFF::IMAGE_SYM_DTYPE_NULL; + Symbol->StorageClass = COFF::IMAGE_SYM_CLASS_STATIC; Symbol->NumberOfAuxSymbols = 0; - CurrentOffset += sizeof(llvm::object::coff_symbol16); + CurrentOffset += sizeof(coff_symbol16); } } @@ -607,18 +582,18 @@ void WindowsResourceCOFFWriter::writeDirectoryTree() { // COFF objects. std::queue<const WindowsResourceParser::TreeNode *> Queue; Queue.push(&Resources); - uint32_t NextLevelOffset = sizeof(llvm::object::coff_resource_dir_table) + - (Resources.getStringChildren().size() + - Resources.getIDChildren().size()) * - sizeof(llvm::object::coff_resource_dir_entry); + uint32_t NextLevelOffset = + sizeof(coff_resource_dir_table) + (Resources.getStringChildren().size() + + Resources.getIDChildren().size()) * + sizeof(coff_resource_dir_entry); std::vector<const WindowsResourceParser::TreeNode *> DataEntriesTreeOrder; uint32_t CurrentRelativeOffset = 0; while (!Queue.empty()) { auto CurrentNode = Queue.front(); Queue.pop(); - auto *Table = reinterpret_cast<llvm::object::coff_resource_dir_table *>( - BufferStart + CurrentOffset); + auto *Table = reinterpret_cast<coff_resource_dir_table *>(BufferStart + + CurrentOffset); Table->Characteristics = CurrentNode->getCharacteristics(); Table->TimeDateStamp = 0; Table->MajorVersion = CurrentNode->getMajorVersion(); @@ -627,63 +602,63 @@ void WindowsResourceCOFFWriter::writeDirectoryTree() { auto &StringChildren = CurrentNode->getStringChildren(); Table->NumberOfNameEntries = StringChildren.size(); Table->NumberOfIDEntries = IDChildren.size(); - CurrentOffset += sizeof(llvm::object::coff_resource_dir_table); - CurrentRelativeOffset += sizeof(llvm::object::coff_resource_dir_table); + CurrentOffset += sizeof(coff_resource_dir_table); + CurrentRelativeOffset += sizeof(coff_resource_dir_table); // Write the directory entries immediately following each directory table. 
for (auto const &Child : StringChildren) { - auto *Entry = reinterpret_cast<llvm::object::coff_resource_dir_entry *>( - BufferStart + CurrentOffset); + auto *Entry = reinterpret_cast<coff_resource_dir_entry *>(BufferStart + + CurrentOffset); Entry->Identifier.NameOffset = StringTableOffsets[Child.second->getStringIndex()]; if (Child.second->checkIsDataNode()) { Entry->Offset.DataEntryOffset = NextLevelOffset; - NextLevelOffset += sizeof(llvm::object::coff_resource_data_entry); + NextLevelOffset += sizeof(coff_resource_data_entry); DataEntriesTreeOrder.push_back(Child.second.get()); } else { Entry->Offset.SubdirOffset = NextLevelOffset + (1 << 31); - NextLevelOffset += sizeof(llvm::object::coff_resource_dir_table) + + NextLevelOffset += sizeof(coff_resource_dir_table) + (Child.second->getStringChildren().size() + Child.second->getIDChildren().size()) * - sizeof(llvm::object::coff_resource_dir_entry); + sizeof(coff_resource_dir_entry); Queue.push(Child.second.get()); } - CurrentOffset += sizeof(llvm::object::coff_resource_dir_entry); - CurrentRelativeOffset += sizeof(llvm::object::coff_resource_dir_entry); + CurrentOffset += sizeof(coff_resource_dir_entry); + CurrentRelativeOffset += sizeof(coff_resource_dir_entry); } for (auto const &Child : IDChildren) { - auto *Entry = reinterpret_cast<llvm::object::coff_resource_dir_entry *>( - BufferStart + CurrentOffset); + auto *Entry = reinterpret_cast<coff_resource_dir_entry *>(BufferStart + + CurrentOffset); Entry->Identifier.ID = Child.first; if (Child.second->checkIsDataNode()) { Entry->Offset.DataEntryOffset = NextLevelOffset; - NextLevelOffset += sizeof(llvm::object::coff_resource_data_entry); + NextLevelOffset += sizeof(coff_resource_data_entry); DataEntriesTreeOrder.push_back(Child.second.get()); } else { Entry->Offset.SubdirOffset = NextLevelOffset + (1 << 31); - NextLevelOffset += sizeof(llvm::object::coff_resource_dir_table) + + NextLevelOffset += sizeof(coff_resource_dir_table) + (Child.second->getStringChildren().size() + Child.second->getIDChildren().size()) * - sizeof(llvm::object::coff_resource_dir_entry); + sizeof(coff_resource_dir_entry); Queue.push(Child.second.get()); } - CurrentOffset += sizeof(llvm::object::coff_resource_dir_entry); - CurrentRelativeOffset += sizeof(llvm::object::coff_resource_dir_entry); + CurrentOffset += sizeof(coff_resource_dir_entry); + CurrentRelativeOffset += sizeof(coff_resource_dir_entry); } } RelocationAddresses.resize(Data.size()); // Now write all the resource data entries. for (auto DataNodes : DataEntriesTreeOrder) { - auto *Entry = reinterpret_cast<llvm::object::coff_resource_data_entry *>( - BufferStart + CurrentOffset); + auto *Entry = reinterpret_cast<coff_resource_data_entry *>(BufferStart + + CurrentOffset); RelocationAddresses[DataNodes->getDataIndex()] = CurrentRelativeOffset; Entry->DataRVA = 0; // Set to zero because it is a relocation. Entry->DataSize = Data[DataNodes->getDataIndex()].size(); Entry->Codepage = 0; Entry->Reserved = 0; - CurrentOffset += sizeof(llvm::object::coff_resource_data_entry); - CurrentRelativeOffset += sizeof(llvm::object::coff_resource_data_entry); + CurrentOffset += sizeof(coff_resource_data_entry); + CurrentRelativeOffset += sizeof(coff_resource_data_entry); } } @@ -710,33 +685,34 @@ void WindowsResourceCOFFWriter::writeFirstSectionRelocations() { // .rsrc section. 
uint32_t NextSymbolIndex = 5; for (unsigned i = 0; i < Data.size(); i++) { - auto *Reloc = reinterpret_cast<llvm::object::coff_relocation *>( - BufferStart + CurrentOffset); + auto *Reloc = + reinterpret_cast<coff_relocation *>(BufferStart + CurrentOffset); Reloc->VirtualAddress = RelocationAddresses[i]; Reloc->SymbolTableIndex = NextSymbolIndex++; switch (MachineType) { - case Machine::ARM: - Reloc->Type = llvm::COFF::IMAGE_REL_ARM_ADDR32NB; + case COFF::IMAGE_FILE_MACHINE_ARMNT: + Reloc->Type = COFF::IMAGE_REL_ARM_ADDR32NB; break; - case Machine::X64: - Reloc->Type = llvm::COFF::IMAGE_REL_AMD64_ADDR32NB; + case COFF::IMAGE_FILE_MACHINE_AMD64: + Reloc->Type = COFF::IMAGE_REL_AMD64_ADDR32NB; break; - case Machine::X86: - Reloc->Type = llvm::COFF::IMAGE_REL_I386_DIR32NB; + case COFF::IMAGE_FILE_MACHINE_I386: + Reloc->Type = COFF::IMAGE_REL_I386_DIR32NB; break; default: Reloc->Type = 0; } - CurrentOffset += sizeof(llvm::object::coff_relocation); + CurrentOffset += sizeof(coff_relocation); } } -Error writeWindowsResourceCOFF(StringRef OutputFile, Machine MachineType, - const WindowsResourceParser &Parser) { +Expected<std::unique_ptr<MemoryBuffer>> +writeWindowsResourceCOFF(COFF::MachineTypes MachineType, + const WindowsResourceParser &Parser) { Error E = Error::success(); - WindowsResourceCOFFWriter Writer(OutputFile, MachineType, Parser, E); + WindowsResourceCOFFWriter Writer(MachineType, Parser, E); if (E) - return E; + return std::move(E); return Writer.write(); } diff --git a/lib/ObjectYAML/CodeViewYAMLSymbols.cpp b/lib/ObjectYAML/CodeViewYAMLSymbols.cpp index ba3a2abe20978..83f3d55b8e556 100644 --- a/lib/ObjectYAML/CodeViewYAMLSymbols.cpp +++ b/lib/ObjectYAML/CodeViewYAMLSymbols.cpp @@ -40,6 +40,7 @@ LLVM_YAML_DECLARE_ENUM_TRAITS(FrameCookieKind) LLVM_YAML_DECLARE_BITSET_TRAITS(CompileSym2Flags) LLVM_YAML_DECLARE_BITSET_TRAITS(CompileSym3Flags) LLVM_YAML_DECLARE_BITSET_TRAITS(ExportFlags) +LLVM_YAML_DECLARE_BITSET_TRAITS(PublicSymFlags) LLVM_YAML_DECLARE_BITSET_TRAITS(LocalSymFlags) LLVM_YAML_DECLARE_BITSET_TRAITS(ProcSymFlags) LLVM_YAML_DECLARE_BITSET_TRAITS(FrameProcedureOptions) @@ -93,6 +94,14 @@ void ScalarBitSetTraits<ExportFlags>::bitset(IO &io, ExportFlags &Flags) { } } +void ScalarBitSetTraits<PublicSymFlags>::bitset(IO &io, PublicSymFlags &Flags) { + auto FlagNames = getProcSymFlagNames(); + for (const auto &E : FlagNames) { + io.bitSetCase(Flags, E.Name.str().c_str(), + static_cast<PublicSymFlags>(E.Value)); + } +} + void ScalarBitSetTraits<LocalSymFlags>::bitset(IO &io, LocalSymFlags &Flags) { auto FlagNames = getLocalFlagNames(); for (const auto &E : FlagNames) { @@ -277,16 +286,15 @@ template <> void SymbolRecordImpl<ExportSym>::map(IO &IO) { } template <> void SymbolRecordImpl<ProcSym>::map(IO &IO) { - // TODO: Print the linkage name - - IO.mapRequired("PtrParent", Symbol.Parent); - IO.mapRequired("PtrEnd", Symbol.End); - IO.mapRequired("PtrNext", Symbol.Next); + IO.mapOptional("PtrParent", Symbol.Parent, 0U); + IO.mapOptional("PtrEnd", Symbol.End, 0U); + IO.mapOptional("PtrNext", Symbol.Next, 0U); IO.mapRequired("CodeSize", Symbol.CodeSize); IO.mapRequired("DbgStart", Symbol.DbgStart); IO.mapRequired("DbgEnd", Symbol.DbgEnd); IO.mapRequired("FunctionType", Symbol.FunctionType); - IO.mapRequired("Segment", Symbol.Segment); + IO.mapOptional("Offset", Symbol.CodeOffset, 0U); + IO.mapOptional("Segment", Symbol.Segment, uint16_t(0)); IO.mapRequired("Flags", Symbol.Flags); IO.mapRequired("DisplayName", Symbol.Name); } @@ -298,9 +306,9 @@ template <> void 
SymbolRecordImpl<RegisterSym>::map(IO &IO) { } template <> void SymbolRecordImpl<PublicSym32>::map(IO &IO) { - IO.mapRequired("Type", Symbol.Index); - IO.mapRequired("Seg", Symbol.Segment); - IO.mapRequired("Off", Symbol.Offset); + IO.mapRequired("Flags", Symbol.Flags); + IO.mapOptional("Offset", Symbol.Offset, 0U); + IO.mapOptional("Segment", Symbol.Segment, uint16_t(0)); IO.mapRequired("Name", Symbol.Name); } @@ -316,8 +324,8 @@ template <> void SymbolRecordImpl<EnvBlockSym>::map(IO &IO) { } template <> void SymbolRecordImpl<InlineSiteSym>::map(IO &IO) { - IO.mapRequired("PtrParent", Symbol.Parent); - IO.mapRequired("PtrEnd", Symbol.End); + IO.mapOptional("PtrParent", Symbol.Parent, 0U); + IO.mapOptional("PtrEnd", Symbol.End, 0U); IO.mapRequired("Inlinee", Symbol.Inlinee); // TODO: The binary annotations } @@ -359,17 +367,17 @@ template <> void SymbolRecordImpl<DefRangeRegisterRelSym>::map(IO &IO) { } template <> void SymbolRecordImpl<BlockSym>::map(IO &IO) { - // TODO: Print the linkage name - IO.mapRequired("PtrParent", Symbol.Parent); - IO.mapRequired("PtrEnd", Symbol.End); + IO.mapOptional("PtrParent", Symbol.Parent, 0U); + IO.mapOptional("PtrEnd", Symbol.End, 0U); IO.mapRequired("CodeSize", Symbol.CodeSize); - IO.mapRequired("Segment", Symbol.Segment); + IO.mapOptional("Offset", Symbol.CodeOffset, 0U); + IO.mapOptional("Segment", Symbol.Segment, uint16_t(0)); IO.mapRequired("BlockName", Symbol.Name); } template <> void SymbolRecordImpl<LabelSym>::map(IO &IO) { - // TODO: Print the linkage name - IO.mapRequired("Segment", Symbol.Segment); + IO.mapOptional("Offset", Symbol.CodeOffset, 0U); + IO.mapOptional("Segment", Symbol.Segment, uint16_t(0)); IO.mapRequired("Flags", Symbol.Flags); IO.mapRequired("DisplayName", Symbol.Name); @@ -419,8 +427,8 @@ template <> void SymbolRecordImpl<FrameProcSym>::map(IO &IO) { } template <> void SymbolRecordImpl<CallSiteInfoSym>::map(IO &IO) { - // TODO: Map Linkage Name - IO.mapRequired("Segment", Symbol.Segment); + IO.mapOptional("Offset", Symbol.CodeOffset, 0U); + IO.mapOptional("Segment", Symbol.Segment, uint16_t(0)); IO.mapRequired("Type", Symbol.Type); } @@ -432,14 +440,13 @@ template <> void SymbolRecordImpl<FileStaticSym>::map(IO &IO) { } template <> void SymbolRecordImpl<HeapAllocationSiteSym>::map(IO &IO) { - // TODO: Map Linkage Name - IO.mapRequired("Segment", Symbol.Segment); + IO.mapOptional("Offset", Symbol.CodeOffset, 0U); + IO.mapOptional("Segment", Symbol.Segment, uint16_t(0)); IO.mapRequired("CallInstructionSize", Symbol.CallInstructionSize); IO.mapRequired("Type", Symbol.Type); } template <> void SymbolRecordImpl<FrameCookieSym>::map(IO &IO) { - // TODO: Map Linkage Name IO.mapRequired("Register", Symbol.Register); IO.mapRequired("CookieKind", Symbol.CookieKind); IO.mapRequired("Flags", Symbol.Flags); @@ -478,14 +485,16 @@ template <> void SymbolRecordImpl<ConstantSym>::map(IO &IO) { } template <> void SymbolRecordImpl<DataSym>::map(IO &IO) { - // TODO: Map linkage name IO.mapRequired("Type", Symbol.Type); + IO.mapOptional("Offset", Symbol.DataOffset, 0U); + IO.mapOptional("Segment", Symbol.Segment, uint16_t(0)); IO.mapRequired("DisplayName", Symbol.Name); } template <> void SymbolRecordImpl<ThreadLocalDataSym>::map(IO &IO) { - // TODO: Map linkage name IO.mapRequired("Type", Symbol.Type); + IO.mapOptional("Offset", Symbol.DataOffset, 0U); + IO.mapOptional("Segment", Symbol.Segment, uint16_t(0)); IO.mapRequired("DisplayName", Symbol.Name); } } diff --git a/lib/ObjectYAML/CodeViewYAMLTypes.cpp 
b/lib/ObjectYAML/CodeViewYAMLTypes.cpp index a03b9cd50faa2..2d1cb4b1b27b9 100644 --- a/lib/ObjectYAML/CodeViewYAMLTypes.cpp +++ b/lib/ObjectYAML/CodeViewYAMLTypes.cpp @@ -138,7 +138,7 @@ StringRef ScalarTraits<TypeIndex>::input(StringRef Scalar, void *Ctx, void ScalarTraits<APSInt>::output(const APSInt &S, void *, llvm::raw_ostream &OS) { - S.print(OS, true); + S.print(OS, S.isSigned()); } StringRef ScalarTraits<APSInt>::input(StringRef Scalar, void *Ctx, APSInt &S) { diff --git a/lib/ObjectYAML/WasmYAML.cpp b/lib/ObjectYAML/WasmYAML.cpp index 353d027f4e111..65703c6cf683a 100644 --- a/lib/ObjectYAML/WasmYAML.cpp +++ b/lib/ObjectYAML/WasmYAML.cpp @@ -47,14 +47,22 @@ static void commonSectionMapping(IO &IO, WasmYAML::Section &Section) { IO.mapOptional("Relocations", Section.Relocations); } +static void sectionMapping(IO &IO, WasmYAML::NameSection &Section) { + commonSectionMapping(IO, Section); + IO.mapRequired("Name", Section.Name); + IO.mapOptional("FunctionNames", Section.FunctionNames); +} + +static void sectionMapping(IO &IO, WasmYAML::LinkingSection &Section) { + commonSectionMapping(IO, Section); + IO.mapRequired("Name", Section.Name); + IO.mapRequired("SymbolInfo", Section.SymbolInfos); +} + static void sectionMapping(IO &IO, WasmYAML::CustomSection &Section) { commonSectionMapping(IO, Section); IO.mapRequired("Name", Section.Name); - if (Section.Name == "name") { - IO.mapOptional("FunctionNames", Section.FunctionNames); - } else { - IO.mapRequired("Payload", Section.Payload); - } + IO.mapRequired("Payload", Section.Payload); } static void sectionMapping(IO &IO, WasmYAML::TypeSection &Section) { @@ -121,11 +129,29 @@ void MappingTraits<std::unique_ptr<WasmYAML::Section>>::mapping( IO.mapRequired("Type", SectionType); switch (SectionType) { - case wasm::WASM_SEC_CUSTOM: - if (!IO.outputting()) - Section.reset(new WasmYAML::CustomSection()); - sectionMapping(IO, *cast<WasmYAML::CustomSection>(Section.get())); + case wasm::WASM_SEC_CUSTOM: { + StringRef SectionName; + if (IO.outputting()) { + auto CustomSection = cast<WasmYAML::CustomSection>(Section.get()); + SectionName = CustomSection->Name; + } else { + IO.mapRequired("Name", SectionName); + } + if (SectionName == "linking") { + if (!IO.outputting()) + Section.reset(new WasmYAML::LinkingSection()); + sectionMapping(IO, *cast<WasmYAML::LinkingSection>(Section.get())); + } else if (SectionName == "name") { + if (!IO.outputting()) + Section.reset(new WasmYAML::NameSection()); + sectionMapping(IO, *cast<WasmYAML::NameSection>(Section.get())); + } else { + if (!IO.outputting()) + Section.reset(new WasmYAML::CustomSection(SectionName)); + sectionMapping(IO, *cast<WasmYAML::CustomSection>(Section.get())); + } break; + } case wasm::WASM_SEC_TYPE: if (!IO.outputting()) Section.reset(new WasmYAML::TypeSection()); @@ -321,6 +347,12 @@ void MappingTraits<WasmYAML::DataSegment>::mapping( IO.mapRequired("Content", Segment.Content); } +void MappingTraits<WasmYAML::SymbolInfo>::mapping(IO &IO, + WasmYAML::SymbolInfo &Info) { + IO.mapRequired("Name", Info.Name); + IO.mapRequired("Flags", Info.Flags); +} + void ScalarEnumerationTraits<WasmYAML::ValueType>::enumeration( IO &IO, WasmYAML::ValueType &Type) { #define ECase(X) IO.enumCase(Type, #X, wasm::WASM_TYPE_##X); diff --git a/lib/Option/OptTable.cpp b/lib/Option/OptTable.cpp index 52a81ff0e1594..acb9e8d015bce 100644 --- a/lib/Option/OptTable.cpp +++ b/lib/Option/OptTable.cpp @@ -194,6 +194,37 @@ static unsigned matchOption(const OptTable::Info *I, StringRef Str, return 0; } +// Returns true if one 
of the Prefixes + In.Name matches Option +static bool optionMatches(const OptTable::Info &In, StringRef Option) { + if (In.Values && In.Prefixes) + for (size_t I = 0; In.Prefixes[I]; I++) + if (Option == std::string(In.Prefixes[I]) + In.Name) + return true; + return false; +} + +// This function is for flag value completion. +// E.g., when "-stdlib=" and "l" are passed to this function, it will return +// appropriate values for stdlib that start with "l". +std::vector<std::string> +OptTable::suggestValueCompletions(StringRef Option, StringRef Arg) const { + // Search all options and return possible values. + for (const Info &In : OptionInfos.slice(FirstSearchableIndex)) { + if (!optionMatches(In, Option)) + continue; + + SmallVector<StringRef, 8> Candidates; + StringRef(In.Values).split(Candidates, ",", -1, false); + + std::vector<std::string> Result; + for (StringRef Val : Candidates) + if (Val.startswith(Arg)) + Result.push_back(Val); + return Result; + } + return {}; +} + std::vector<std::string> OptTable::findByPrefix(StringRef Cur) const { std::vector<std::string> Ret; for (const Info &In : OptionInfos.slice(FirstSearchableIndex)) { @@ -336,6 +367,9 @@ static std::string getOptionHelpName(const OptTable &Opts, OptSpecifier Id) { case Option::FlagClass: break; + case Option::ValuesClass: + break; + case Option::SeparateClass: case Option::JoinedOrSeparateClass: case Option::RemainingArgsClass: case Option::RemainingArgsJoinedClass: Name += ' '; diff --git a/lib/Option/Option.cpp b/lib/Option/Option.cpp index 4832e659f026d..bf9f040bde525 100644 --- a/lib/Option/Option.cpp +++ b/lib/Option/Option.cpp @@ -47,6 +47,7 @@ void Option::print(raw_ostream &O) const { P(UnknownClass); P(FlagClass); P(JoinedClass); + P(ValuesClass); P(SeparateClass); P(CommaJoinedClass); P(MultiArgClass); diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp index afd66f55720a5..78d5ea955e644 100644 --- a/lib/Passes/PassBuilder.cpp +++ b/lib/Passes/PassBuilder.cpp @@ -464,10 +464,15 @@ static void addPGOInstrPasses(ModulePassManager &MPM, bool DebugLogging, if (RunProfileGen) { MPM.addPass(PGOInstrumentationGen()); + FunctionPassManager FPM; + FPM.addPass(createFunctionToLoopPassAdaptor(LoopRotatePass())); + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + // Add the profile lowering pass. InstrProfOptions Options; if (!ProfileGenFile.empty()) Options.InstrProfileOutput = ProfileGenFile; + Options.DoCounterPromotion = true; MPM.addPass(InstrProfiling(Options)); } @@ -923,9 +928,6 @@ ModulePassManager PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, MainFPM.add(AlignmentFromAssumptionsPass()); #endif - // FIXME: Conditionally run LoadCombine here, after it's ported - // (in case we still have this pass, given its questionable usefulness). - // FIXME: add peephole extensions to the PM here. 
MainFPM.addPass(InstCombinePass()); MainFPM.addPass(JumpThreadingPass()); diff --git a/lib/ProfileData/Coverage/CoverageMapping.cpp b/lib/ProfileData/Coverage/CoverageMapping.cpp index 015b3c6c2021d..4534e086b39e2 100644 --- a/lib/ProfileData/Coverage/CoverageMapping.cpp +++ b/lib/ProfileData/Coverage/CoverageMapping.cpp @@ -1,4 +1,4 @@ -//===- CoverageMapping.cpp - Code coverage mapping support ------*- C++ -*-===// +//===- CoverageMapping.cpp - Code coverage mapping support ----------------===// // // The LLVM Compiler Infrastructure // @@ -200,6 +200,9 @@ Error CoverageMapping::loadFunctionRecord( const CoverageMappingRecord &Record, IndexedInstrProfReader &ProfileReader) { StringRef OrigFuncName = Record.FunctionName; + if (OrigFuncName.empty()) + return make_error<CoverageMapError>(coveragemap_error::malformed); + if (Record.Filenames.empty()) OrigFuncName = getFuncNameWithoutPrefix(OrigFuncName); else @@ -300,8 +303,8 @@ namespace { /// An instantiation set is a collection of functions that have the same source /// code, i.e., template function specializations. class FunctionInstantiationSetCollector { - typedef DenseMap<std::pair<unsigned, unsigned>, - std::vector<const FunctionRecord *>> MapT; + using MapT = DenseMap<std::pair<unsigned, unsigned>, + std::vector<const FunctionRecord *>>; MapT InstantiatedFunctions; public: @@ -315,7 +318,6 @@ public: } MapT::iterator begin() { return InstantiatedFunctions.begin(); } - MapT::iterator end() { return InstantiatedFunctions.end(); } }; diff --git a/lib/ProfileData/Coverage/CoverageMappingReader.cpp b/lib/ProfileData/Coverage/CoverageMappingReader.cpp index a34f359cd5427..fff0a03ccbe01 100644 --- a/lib/ProfileData/Coverage/CoverageMappingReader.cpp +++ b/lib/ProfileData/Coverage/CoverageMappingReader.cpp @@ -1,4 +1,4 @@ -//===- CoverageMappingReader.cpp - Code coverage mapping reader -*- C++ -*-===// +//===- CoverageMappingReader.cpp - Code coverage mapping reader -----------===// // // The LLVM Compiler Infrastructure // @@ -62,7 +62,7 @@ void CoverageMappingIterator::increment() { } Error RawCoverageReader::readULEB128(uint64_t &Result) { - if (Data.size() < 1) + if (Data.empty()) return make_error<CoverageMapError>(coveragemap_error::truncated); unsigned N = 0; Result = decodeULEB128(reinterpret_cast<const uint8_t *>(Data.data()), &N); @@ -392,9 +392,9 @@ struct CovMapFuncRecordReader { // A class for reading coverage mapping function records for a module. template <CovMapVersion Version, class IntPtrT, support::endianness Endian> class VersionedCovMapFuncRecordReader : public CovMapFuncRecordReader { - typedef typename CovMapTraits< - Version, IntPtrT>::CovMapFuncRecordType FuncRecordType; - typedef typename CovMapTraits<Version, IntPtrT>::NameRefType NameRefType; + using FuncRecordType = + typename CovMapTraits<Version, IntPtrT>::CovMapFuncRecordType; + using NameRefType = typename CovMapTraits<Version, IntPtrT>::NameRefType; // Maps function's name references to the indexes of their records // in \c Records. 
@@ -419,6 +419,8 @@ class VersionedCovMapFuncRecordReader : public CovMapFuncRecordReader { StringRef FuncName; if (Error Err = CFR->template getFuncName<Endian>(ProfileNames, FuncName)) return Err; + if (FuncName.empty()) + return make_error<InstrProfError>(instrprof_error::malformed); Records.emplace_back(Version, FuncName, FuncHash, Mapping, FilenamesBegin, Filenames.size() - FilenamesBegin); return Error::success(); @@ -574,7 +576,7 @@ static Error loadTestingFormat(StringRef Data, InstrProfSymtab &ProfileNames, Endian = support::endianness::little; Data = Data.substr(StringRef(TestingFormatMagic).size()); - if (Data.size() < 1) + if (Data.empty()) return make_error<CoverageMapError>(coveragemap_error::truncated); unsigned N = 0; auto ProfileNamesSize = @@ -582,7 +584,7 @@ static Error loadTestingFormat(StringRef Data, InstrProfSymtab &ProfileNames, if (N > Data.size()) return make_error<CoverageMapError>(coveragemap_error::malformed); Data = Data.substr(N); - if (Data.size() < 1) + if (Data.empty()) return make_error<CoverageMapError>(coveragemap_error::truncated); N = 0; uint64_t Address = @@ -596,7 +598,7 @@ static Error loadTestingFormat(StringRef Data, InstrProfSymtab &ProfileNames, return E; CoverageMapping = Data.substr(ProfileNamesSize); // Skip the padding bytes because coverage map data has an alignment of 8. - if (CoverageMapping.size() < 1) + if (CoverageMapping.empty()) return make_error<CoverageMapError>(coveragemap_error::truncated); size_t Pad = alignmentAdjustment(CoverageMapping.data(), 8); if (CoverageMapping.size() < Pad) diff --git a/lib/ProfileData/InstrProf.cpp b/lib/ProfileData/InstrProf.cpp index c9b82c303e338..005061c4f0680 100644 --- a/lib/ProfileData/InstrProf.cpp +++ b/lib/ProfileData/InstrProf.cpp @@ -330,14 +330,15 @@ GlobalVariable *createPGOFuncNameVar(Function &F, StringRef PGOFuncName) { return createPGOFuncNameVar(*F.getParent(), F.getLinkage(), PGOFuncName); } -void InstrProfSymtab::create(Module &M, bool InLTO) { +Error InstrProfSymtab::create(Module &M, bool InLTO) { for (Function &F : M) { // Function may not have a name: like using asm("") to overwrite the name. // Ignore in this case. if (!F.hasName()) continue; const std::string &PGOFuncName = getPGOFuncName(F, InLTO); - addFuncName(PGOFuncName); + if (Error E = addFuncName(PGOFuncName)) + return E; MD5FuncMap.emplace_back(Function::getGUID(PGOFuncName), &F); // In ThinLTO, local function may have been promoted to global and have // suffix added to the function name. 
We need to add the stripped function @@ -346,13 +347,15 @@ void InstrProfSymtab::create(Module &M, bool InLTO) { auto pos = PGOFuncName.find('.'); if (pos != std::string::npos) { const std::string &OtherFuncName = PGOFuncName.substr(0, pos); - addFuncName(OtherFuncName); + if (Error E = addFuncName(OtherFuncName)) + return E; MD5FuncMap.emplace_back(Function::getGUID(OtherFuncName), &F); } } } finalizeSymtab(); + return Error::success(); } Error collectPGOFuncNameStrings(ArrayRef<std::string> NameStrs, @@ -447,7 +450,8 @@ Error readPGOFuncNameStrings(StringRef NameStrings, InstrProfSymtab &Symtab) { SmallVector<StringRef, 0> Names; NameStrings.split(Names, getInstrProfNameSeparator()); for (StringRef &Name : Names) - Symtab.addFuncName(Name); + if (Error E = Symtab.addFuncName(Name)) + return E; while (P < EndP && *P == 0) P++; diff --git a/lib/ProfileData/InstrProfReader.cpp b/lib/ProfileData/InstrProfReader.cpp index d9f599f400da5..1ed1fb8b6f0b5 100644 --- a/lib/ProfileData/InstrProfReader.cpp +++ b/lib/ProfileData/InstrProfReader.cpp @@ -200,7 +200,8 @@ TextInstrProfReader::readValueProfileData(InstrProfRecord &Record) { std::pair<StringRef, StringRef> VD = Line->rsplit(':'); uint64_t TakenCount, Value; if (ValueKind == IPVK_IndirectCallTarget) { - Symtab->addFuncName(VD.first); + if (Error E = Symtab->addFuncName(VD.first)) + return E; Value = IndexedInstrProf::ComputeHash(VD.first); } else { READ_NUM(VD.first, Value); @@ -232,7 +233,8 @@ Error TextInstrProfReader::readNextRecord(InstrProfRecord &Record) { // Read the function name. Record.Name = *Line++; - Symtab->addFuncName(Record.Name); + if (Error E = Symtab->addFuncName(Record.Name)) + return E; // Read the function hash. if (Line.is_at_end()) @@ -482,8 +484,8 @@ InstrProfLookupTrait::ComputeHash(StringRef K) { return IndexedInstrProf::ComputeHash(HashType, K); } -typedef InstrProfLookupTrait::data_type data_type; -typedef InstrProfLookupTrait::offset_type offset_type; +using data_type = InstrProfLookupTrait::data_type; +using offset_type = InstrProfLookupTrait::offset_type; bool InstrProfLookupTrait::readValueProfilingData( const unsigned char *&D, const unsigned char *const End) { @@ -620,7 +622,7 @@ IndexedInstrProfReader::readSummary(IndexedInstrProf::ProfVersion Version, for (unsigned I = 0; I < SummarySize / sizeof(uint64_t); I++) Dst[I] = endian::byte_swap<uint64_t, little>(Src[I]); - llvm::SummaryEntryVector DetailedSummary; + SummaryEntryVector DetailedSummary; for (unsigned I = 0; I < SummaryData->NumCutoffEntries; I++) { const IndexedInstrProf::Summary::Entry &Ent = SummaryData->getEntry(I); DetailedSummary.emplace_back((uint32_t)Ent.Cutoff, Ent.MinBlockCount, @@ -694,7 +696,9 @@ InstrProfSymtab &IndexedInstrProfReader::getSymtab() { return *Symtab.get(); std::unique_ptr<InstrProfSymtab> NewSymtab = make_unique<InstrProfSymtab>(); - Index->populateSymtab(*NewSymtab.get()); + if (Error E = Index->populateSymtab(*NewSymtab.get())) { + consumeError(error(InstrProfError::take(std::move(E)))); + } Symtab = std::move(NewSymtab); return *Symtab.get(); diff --git a/lib/ProfileData/InstrProfWriter.cpp b/lib/ProfileData/InstrProfWriter.cpp index b3402a6ea956c..9efea78ed2a89 100644 --- a/lib/ProfileData/InstrProfWriter.cpp +++ b/lib/ProfileData/InstrProfWriter.cpp @@ -69,8 +69,7 @@ public: write(P[K].D[I]); } } else { - raw_string_ostream &SOStream = - static_cast<llvm::raw_string_ostream &>(OS); + raw_string_ostream &SOStream = static_cast<raw_string_ostream &>(OS); std::string &Data = SOStream.str(); // with flush for (int K = 
0; K < NItems; K++) { for (int I = 0; I < P[K].N; I++) { @@ -91,14 +90,14 @@ public: class InstrProfRecordWriterTrait { public: - typedef StringRef key_type; - typedef StringRef key_type_ref; + using key_type = StringRef; + using key_type_ref = StringRef; - typedef const InstrProfWriter::ProfilingData *const data_type; - typedef const InstrProfWriter::ProfilingData *const data_type_ref; + using data_type = const InstrProfWriter::ProfilingData *const; + using data_type_ref = const InstrProfWriter::ProfilingData *const; - typedef uint64_t hash_value_type; - typedef uint64_t offset_type; + using hash_value_type = uint64_t; + using offset_type = uint64_t; support::endianness ValueProfDataEndianness = support::little; InstrProfSummaryBuilder *SummaryBuilder; @@ -363,17 +362,19 @@ void InstrProfWriter::writeRecordInText(const InstrProfRecord &Func, OS << "\n"; } -void InstrProfWriter::writeText(raw_fd_ostream &OS) { +Error InstrProfWriter::writeText(raw_fd_ostream &OS) { if (ProfileKind == PF_IRLevel) OS << "# IR level Instrumentation Flag\n:ir\n"; InstrProfSymtab Symtab; for (const auto &I : FunctionData) if (shouldEncodeData(I.getValue())) - Symtab.addFuncName(I.getKey()); + if (Error E = Symtab.addFuncName(I.getKey())) + return E; Symtab.finalizeSymtab(); for (const auto &I : FunctionData) if (shouldEncodeData(I.getValue())) for (const auto &Func : I.getValue()) writeRecordInText(Func.second, Symtab, OS); + return Error::success(); } diff --git a/lib/Support/APFloat.cpp b/lib/Support/APFloat.cpp index f36c25a0ce914..deb76cb565d1e 100644 --- a/lib/Support/APFloat.cpp +++ b/lib/Support/APFloat.cpp @@ -37,10 +37,6 @@ using namespace llvm; -// TODO: Remove these and use APInt qualified types directly. -typedef APInt::WordType integerPart; -const unsigned int integerPartWidth = APInt::APINT_BITS_PER_WORD; - /// A macro used to combine two fcCategory enums into one key which can be used /// in a switch statement to classify how the interaction of two APFloat's /// categories affects an operation. @@ -51,7 +47,7 @@ const unsigned int integerPartWidth = APInt::APINT_BITS_PER_WORD; /* Assumed in hexadecimal significand parsing, and conversion to hexadecimal strings. */ -static_assert(integerPartWidth % 4 == 0, "Part width must be divisible by 4!"); +static_assert(APFloatBase::integerPartWidth % 4 == 0, "Part width must be divisible by 4!"); namespace llvm { /* Represents floating point arithmetic semantics. */ @@ -153,8 +149,7 @@ namespace llvm { const unsigned int maxExponent = 16383; const unsigned int maxPrecision = 113; const unsigned int maxPowerOfFiveExponent = maxExponent + maxPrecision - 1; - const unsigned int maxPowerOfFiveParts = 2 + ((maxPowerOfFiveExponent * 815) - / (351 * integerPartWidth)); + const unsigned int maxPowerOfFiveParts = 2 + ((maxPowerOfFiveExponent * 815) / (351 * APFloatBase::integerPartWidth)); unsigned int APFloatBase::semanticsPrecision(const fltSemantics &semantics) { return semantics.precision; @@ -180,7 +175,7 @@ namespace llvm { static inline unsigned int partCountForBits(unsigned int bits) { - return ((bits) + integerPartWidth - 1) / integerPartWidth; + return ((bits) + APFloatBase::integerPartWidth - 1) / APFloatBase::integerPartWidth; } /* Returns 0U-9U. Return values >= 10U are not digits. */ @@ -420,7 +415,7 @@ trailingHexadecimalFraction(StringRef::iterator p, StringRef::iterator end, /* Return the fraction lost were a bignum truncated losing the least significant BITS bits. 
*/ static lostFraction -lostFractionThroughTruncation(const integerPart *parts, +lostFractionThroughTruncation(const APFloatBase::integerPart *parts, unsigned int partCount, unsigned int bits) { @@ -433,7 +428,7 @@ lostFractionThroughTruncation(const integerPart *parts, return lfExactlyZero; if (bits == lsb + 1) return lfExactlyHalf; - if (bits <= partCount * integerPartWidth && + if (bits <= partCount * APFloatBase::integerPartWidth && APInt::tcExtractBit(parts, bits - 1)) return lfMoreThanHalf; @@ -442,7 +437,7 @@ lostFractionThroughTruncation(const integerPart *parts, /* Shift DST right BITS bits noting lost fraction. */ static lostFraction -shiftRight(integerPart *dst, unsigned int parts, unsigned int bits) +shiftRight(APFloatBase::integerPart *dst, unsigned int parts, unsigned int bits) { lostFraction lost_fraction; @@ -489,22 +484,22 @@ HUerrBound(bool inexactMultiply, unsigned int HUerr1, unsigned int HUerr2) /* The number of ulps from the boundary (zero, or half if ISNEAREST) when the least significant BITS are truncated. BITS cannot be zero. */ -static integerPart -ulpsFromBoundary(const integerPart *parts, unsigned int bits, bool isNearest) -{ +static APFloatBase::integerPart +ulpsFromBoundary(const APFloatBase::integerPart *parts, unsigned int bits, + bool isNearest) { unsigned int count, partBits; - integerPart part, boundary; + APFloatBase::integerPart part, boundary; assert(bits != 0); bits--; - count = bits / integerPartWidth; - partBits = bits % integerPartWidth + 1; + count = bits / APFloatBase::integerPartWidth; + partBits = bits % APFloatBase::integerPartWidth + 1; - part = parts[count] & (~(integerPart) 0 >> (integerPartWidth - partBits)); + part = parts[count] & (~(APFloatBase::integerPart) 0 >> (APFloatBase::integerPartWidth - partBits)); if (isNearest) - boundary = (integerPart) 1 << (partBits - 1); + boundary = (APFloatBase::integerPart) 1 << (partBits - 1); else boundary = 0; @@ -518,32 +513,30 @@ ulpsFromBoundary(const integerPart *parts, unsigned int bits, bool isNearest) if (part == boundary) { while (--count) if (parts[count]) - return ~(integerPart) 0; /* A lot. */ + return ~(APFloatBase::integerPart) 0; /* A lot. */ return parts[0]; } else if (part == boundary - 1) { while (--count) if (~parts[count]) - return ~(integerPart) 0; /* A lot. */ + return ~(APFloatBase::integerPart) 0; /* A lot. */ return -parts[0]; } - return ~(integerPart) 0; /* A lot. */ + return ~(APFloatBase::integerPart) 0; /* A lot. */ } /* Place pow(5, power) in DST, and return the number of parts used. DST must be at least one part larger than size of the answer. 
*/ static unsigned int -powerOf5(integerPart *dst, unsigned int power) -{ - static const integerPart firstEightPowers[] = { 1, 5, 25, 125, 625, 3125, - 15625, 78125 }; - integerPart pow5s[maxPowerOfFiveParts * 2 + 5]; +powerOf5(APFloatBase::integerPart *dst, unsigned int power) { + static const APFloatBase::integerPart firstEightPowers[] = { 1, 5, 25, 125, 625, 3125, 15625, 78125 }; + APFloatBase::integerPart pow5s[maxPowerOfFiveParts * 2 + 5]; pow5s[0] = 78125 * 5; unsigned int partsCount[16] = { 1 }; - integerPart scratch[maxPowerOfFiveParts], *p1, *p2, *pow5; + APFloatBase::integerPart scratch[maxPowerOfFiveParts], *p1, *p2, *pow5; unsigned int result; assert(power <= maxExponent); @@ -572,7 +565,7 @@ powerOf5(integerPart *dst, unsigned int power) } if (power & 1) { - integerPart *tmp; + APFloatBase::integerPart *tmp; APInt::tcFullMultiply(p2, p1, pow5, result, pc); result += pc; @@ -608,14 +601,14 @@ static const char NaNU[] = "NAN"; significant nibble. Write out exactly COUNT hexdigits, return COUNT. */ static unsigned int -partAsHex (char *dst, integerPart part, unsigned int count, +partAsHex (char *dst, APFloatBase::integerPart part, unsigned int count, const char *hexDigitChars) { unsigned int result = count; - assert(count != 0 && count <= integerPartWidth / 4); + assert(count != 0 && count <= APFloatBase::integerPartWidth / 4); - part >>= (integerPartWidth - 4 * count); + part >>= (APFloatBase::integerPartWidth - 4 * count); while (count--) { dst[count] = hexDigitChars[part & 0xf]; part >>= 4; @@ -889,11 +882,11 @@ unsigned int IEEEFloat::partCount() const { return partCountForBits(semantics->precision + 1); } -const integerPart *IEEEFloat::significandParts() const { +const IEEEFloat::integerPart *IEEEFloat::significandParts() const { return const_cast<IEEEFloat *>(this)->significandParts(); } -integerPart *IEEEFloat::significandParts() { +IEEEFloat::integerPart *IEEEFloat::significandParts() { if (partCount() > 1) return significand.parts; else @@ -916,7 +909,7 @@ void IEEEFloat::incrementSignificand() { } /* Add the significand of the RHS. Returns the carry flag. */ -integerPart IEEEFloat::addSignificand(const IEEEFloat &rhs) { +IEEEFloat::integerPart IEEEFloat::addSignificand(const IEEEFloat &rhs) { integerPart *parts; parts = significandParts(); @@ -929,8 +922,8 @@ integerPart IEEEFloat::addSignificand(const IEEEFloat &rhs) { /* Subtract the significand of the RHS with a borrow flag. Returns the borrow flag. 
*/ -integerPart IEEEFloat::subtractSignificand(const IEEEFloat &rhs, - integerPart borrow) { +IEEEFloat::integerPart IEEEFloat::subtractSignificand(const IEEEFloat &rhs, + integerPart borrow) { integerPart *parts; parts = significandParts(); diff --git a/lib/Support/APInt.cpp b/lib/Support/APInt.cpp index e9716e3b1e872..c558ddd82161d 100644 --- a/lib/Support/APInt.cpp +++ b/lib/Support/APInt.cpp @@ -546,10 +546,7 @@ unsigned APInt::countLeadingZerosSlowCase() const { return Count; } -unsigned APInt::countLeadingOnes() const { - if (isSingleWord()) - return llvm::countLeadingOnes(U.VAL << (APINT_BITS_PER_WORD - BitWidth)); - +unsigned APInt::countLeadingOnesSlowCase() const { unsigned highWordBits = BitWidth % APINT_BITS_PER_WORD; unsigned shift; if (!highWordBits) { @@ -573,9 +570,7 @@ unsigned APInt::countLeadingOnes() const { return Count; } -unsigned APInt::countTrailingZeros() const { - if (isSingleWord()) - return std::min(unsigned(llvm::countTrailingZeros(U.VAL)), BitWidth); +unsigned APInt::countTrailingZerosSlowCase() const { unsigned Count = 0; unsigned i = 0; for (; i < getNumWords() && U.pVal[i] == 0; ++i) diff --git a/lib/Support/BinaryStreamReader.cpp b/lib/Support/BinaryStreamReader.cpp index bfb658cfa0b74..e00527f2519e1 100644 --- a/lib/Support/BinaryStreamReader.cpp +++ b/lib/Support/BinaryStreamReader.cpp @@ -109,6 +109,12 @@ Error BinaryStreamReader::readStreamRef(BinaryStreamRef &Ref, uint32_t Length) { return Error::success(); } +Error BinaryStreamReader::readSubstream(BinarySubstreamRef &Stream, + uint32_t Size) { + Stream.Offset = getOffset(); + return readStreamRef(Stream.StreamData, Size); +} + Error BinaryStreamReader::skip(uint32_t Amount) { if (Amount > bytesRemaining()) return make_error<BinaryStreamError>(stream_error_code::stream_too_short); diff --git a/lib/Support/CachePruning.cpp b/lib/Support/CachePruning.cpp index aca1236395655..60d0964f27646 100644 --- a/lib/Support/CachePruning.cpp +++ b/lib/Support/CachePruning.cpp @@ -82,7 +82,7 @@ llvm::parseCachePruningPolicy(StringRef PolicyStr) { if (Value.back() != '%') return make_error<StringError>("'" + Value + "' must be a percentage", inconvertibleErrorCode()); - StringRef SizeStr = Value.slice(0, Value.size() - 1); + StringRef SizeStr = Value.drop_back(); uint64_t Size; if (SizeStr.getAsInteger(0, Size)) return make_error<StringError>("'" + SizeStr + "' not an integer", @@ -91,7 +91,28 @@ llvm::parseCachePruningPolicy(StringRef PolicyStr) { return make_error<StringError>("'" + SizeStr + "' must be between 0 and 100", inconvertibleErrorCode()); - Policy.PercentageOfAvailableSpace = Size; + Policy.MaxSizePercentageOfAvailableSpace = Size; + } else if (Key == "cache_size_bytes") { + uint64_t Mult = 1; + switch (tolower(Value.back())) { + case 'k': + Mult = 1024; + Value = Value.drop_back(); + break; + case 'm': + Mult = 1024 * 1024; + Value = Value.drop_back(); + break; + case 'g': + Mult = 1024 * 1024 * 1024; + Value = Value.drop_back(); + break; + } + uint64_t Size; + if (Value.getAsInteger(0, Size)) + return make_error<StringError>("'" + Value + "' not an integer", + inconvertibleErrorCode()); + Policy.MaxSizeBytes = Size * Mult; } else { return make_error<StringError>("Unknown key: '" + Key + "'", inconvertibleErrorCode()); @@ -115,11 +136,12 @@ bool llvm::pruneCache(StringRef Path, CachePruningPolicy Policy) { if (!isPathDir) return false; - Policy.PercentageOfAvailableSpace = - std::min(Policy.PercentageOfAvailableSpace, 100u); + Policy.MaxSizePercentageOfAvailableSpace = + 
std::min(Policy.MaxSizePercentageOfAvailableSpace, 100u); if (Policy.Expiration == seconds(0) && - Policy.PercentageOfAvailableSpace == 0) { + Policy.MaxSizePercentageOfAvailableSpace == 0 && + Policy.MaxSizeBytes == 0) { DEBUG(dbgs() << "No pruning settings set, exit early\n"); // Nothing will be pruned, early exit return false; @@ -157,7 +179,8 @@ bool llvm::pruneCache(StringRef Path, CachePruningPolicy Policy) { writeTimestampFile(TimestampFile); } - bool ShouldComputeSize = (Policy.PercentageOfAvailableSpace > 0); + bool ShouldComputeSize = + (Policy.MaxSizePercentageOfAvailableSpace > 0 || Policy.MaxSizeBytes > 0); // Keep track of space std::set<std::pair<uint64_t, std::string>> FileSizes; @@ -216,14 +239,22 @@ bool llvm::pruneCache(StringRef Path, CachePruningPolicy Policy) { } sys::fs::space_info SpaceInfo = ErrOrSpaceInfo.get(); auto AvailableSpace = TotalSize + SpaceInfo.free; - auto FileAndSize = FileSizes.rbegin(); + + if (Policy.MaxSizePercentageOfAvailableSpace == 0) + Policy.MaxSizePercentageOfAvailableSpace = 100; + if (Policy.MaxSizeBytes == 0) + Policy.MaxSizeBytes = AvailableSpace; + auto TotalSizeTarget = std::min<uint64_t>( + AvailableSpace * Policy.MaxSizePercentageOfAvailableSpace / 100ull, + Policy.MaxSizeBytes); + DEBUG(dbgs() << "Occupancy: " << ((100 * TotalSize) / AvailableSpace) - << "% target is: " << Policy.PercentageOfAvailableSpace - << "\n"); + << "% target is: " << Policy.MaxSizePercentageOfAvailableSpace + << "%, " << Policy.MaxSizeBytes << " bytes\n"); + + auto FileAndSize = FileSizes.rbegin(); // Remove the oldest accessed files first, till we get below the threshold - while (((100 * TotalSize) / AvailableSpace) > - Policy.PercentageOfAvailableSpace && - FileAndSize != FileSizes.rend()) { + while (TotalSize > TotalSizeTarget && FileAndSize != FileSizes.rend()) { // Remove the file. sys::fs::remove(FileAndSize->second); // Update size diff --git a/lib/Support/CommandLine.cpp b/lib/Support/CommandLine.cpp index de0ca940b405f..0345a5e3d2a1d 100644 --- a/lib/Support/CommandLine.cpp +++ b/lib/Support/CommandLine.cpp @@ -24,6 +24,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/Twine.h" #include "llvm/Config/config.h" @@ -1522,13 +1523,9 @@ bool parser<unsigned long long>::parse(Option &O, StringRef ArgName, // parser<double>/parser<float> implementation // static bool parseDouble(Option &O, StringRef Arg, double &Value) { - SmallString<32> TmpStr(Arg.begin(), Arg.end()); - const char *ArgStart = TmpStr.c_str(); - char *End; - Value = strtod(ArgStart, &End); - if (*End != 0) - return O.error("'" + Arg + "' value invalid for floating point argument!"); - return false; + if (to_float(Arg, Value)) + return false; + return O.error("'" + Arg + "' value invalid for floating point argument!"); } bool parser<double>::parse(Option &O, StringRef ArgName, StringRef Arg, diff --git a/lib/Support/DataExtractor.cpp b/lib/Support/DataExtractor.cpp index 53c10bcc562e3..0199b300ba72d 100644 --- a/lib/Support/DataExtractor.cpp +++ b/lib/Support/DataExtractor.cpp @@ -68,6 +68,13 @@ uint16_t *DataExtractor::getU16(uint32_t *offset_ptr, uint16_t *dst, Data.data()); } +uint32_t DataExtractor::getU24(uint32_t *offset_ptr) const { + uint24_t ExtractedVal = + getU<uint24_t>(offset_ptr, this, IsLittleEndian, Data.data()); + // The 3 bytes are in the correct byte order for the host. 
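The new getU24 above funnels through the generic getU<uint24_t> helper and then swaps the three bytes into host order. Stripped of those helpers, the read amounts to the following (illustrative standalone function, not the DataExtractor API):

  #include <cstdint>

  // Widen 3 bytes at 'p' to uint32_t in the requested byte order; the
  // top byte of the result is always zero.
  static uint32_t readU24(const uint8_t *p, bool littleEndian) {
    if (littleEndian)
      return uint32_t(p[0]) | uint32_t(p[1]) << 8 | uint32_t(p[2]) << 16;
    return uint32_t(p[0]) << 16 | uint32_t(p[1]) << 8 | uint32_t(p[2]);
  }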
+ return ExtractedVal.getAsUint32(sys::IsLittleEndianHost); +} + uint32_t DataExtractor::getU32(uint32_t *offset_ptr) const { return getU<uint32_t>(offset_ptr, this, IsLittleEndian, Data.data()); } diff --git a/lib/Support/GraphWriter.cpp b/lib/Support/GraphWriter.cpp index f70b77da8de47..e04bd8bb3b9a1 100644 --- a/lib/Support/GraphWriter.cpp +++ b/lib/Support/GraphWriter.cpp @@ -1,4 +1,4 @@ -//===-- GraphWriter.cpp - Implements GraphWriter support routines ---------===// +//===- GraphWriter.cpp - Implements GraphWriter support routines ----------===// // // The LLVM Compiler Infrastructure // @@ -12,10 +12,22 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/GraphWriter.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Config/config.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/ErrorOr.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Program.h" +#include "llvm/Support/raw_ostream.h" +#include <cassert> +#include <system_error> +#include <string> +#include <vector> + using namespace llvm; static cl::opt<bool> ViewBackground("view-background", cl::Hidden, @@ -99,8 +111,10 @@ static bool ExecGraphViewer(StringRef ExecPath, std::vector<const char *> &args, } namespace { + struct GraphSession { std::string LogBuffer; + bool TryFindProgram(StringRef Names, std::string &ProgramPath) { raw_string_ostream Log(LogBuffer); SmallVector<StringRef, 8> parts; @@ -115,7 +129,8 @@ struct GraphSession { return false; } }; -} // namespace + +} // end anonymous namespace static const char *getProgramName(GraphProgram::Name program) { switch (program) { diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp index 320aede79fbb0..2687a67556d3e 100644 --- a/lib/Support/Triple.cpp +++ b/lib/Support/Triple.cpp @@ -174,6 +174,7 @@ StringRef Triple::getOSTypeName(OSType Kind) { switch (Kind) { case UnknownOS: return "unknown"; + case Ananas: return "ananas"; case CloudABI: return "cloudabi"; case Darwin: return "darwin"; case DragonFly: return "dragonfly"; @@ -455,6 +456,7 @@ static Triple::VendorType parseVendor(StringRef VendorName) { static Triple::OSType parseOS(StringRef OSName) { return StringSwitch<Triple::OSType>(OSName) + .StartsWith("ananas", Triple::Ananas) .StartsWith("cloudabi", Triple::CloudABI) .StartsWith("darwin", Triple::Darwin) .StartsWith("dragonfly", Triple::DragonFly) diff --git a/lib/Support/Unix/Host.inc b/lib/Support/Unix/Host.inc index 457217125a222..0ba6a25aa198d 100644 --- a/lib/Support/Unix/Host.inc +++ b/lib/Support/Unix/Host.inc @@ -45,5 +45,11 @@ std::string sys::getDefaultTargetTriple() { TargetTripleString += getOSVersion(); } + // Override the default target with an environment variable named by LLVM_TARGET_TRIPLE_ENV. 
+#if defined(LLVM_TARGET_TRIPLE_ENV) + if (const char *EnvTriple = std::getenv(LLVM_TARGET_TRIPLE_ENV)) + TargetTripleString = EnvTriple; +#endif + return Triple::normalize(TargetTripleString); } diff --git a/lib/Support/Unix/Memory.inc b/lib/Support/Unix/Memory.inc index edbc7938f0cbf..dd39ef935bf92 100644 --- a/lib/Support/Unix/Memory.inc +++ b/lib/Support/Unix/Memory.inc @@ -195,6 +195,10 @@ Memory::AllocateRWX(size_t NumBytes, const MemoryBlock* NearBlock, #if defined(__APPLE__) && (defined(__arm__) || defined(__arm64__)) void *pa = ::mmap(start, PageSize*NumPages, PROT_READ|PROT_EXEC, flags, fd, 0); +#elif defined(__NetBSD__) && defined(PROT_MPROTECT) + void *pa = + ::mmap(start, PageSize * NumPages, + PROT_READ | PROT_WRITE | PROT_MPROTECT(PROT_EXEC), flags, fd, 0); #else void *pa = ::mmap(start, PageSize*NumPages, PROT_READ|PROT_WRITE|PROT_EXEC, flags, fd, 0); diff --git a/lib/Support/Unix/Program.inc b/lib/Support/Unix/Program.inc index 2df0eaff47e52..1704fa4799428 100644 --- a/lib/Support/Unix/Program.inc +++ b/lib/Support/Unix/Program.inc @@ -449,11 +449,22 @@ bool llvm::sys::commandLineFitsWithinSystemLimits(StringRef Program, ArrayRef<co size_t ArgLength = Program.size() + 1; for (ArrayRef<const char*>::iterator I = Args.begin(), E = Args.end(); I != E; ++I) { - ArgLength += strlen(*I) + 1; + size_t length = strlen(*I); + + // Ensure that we do not exceed the MAX_ARG_STRLEN constant on Linux, which + // does not have a constant unlike what the man pages would have you + // believe. Since this limit is pretty high, perform the check + // unconditionally rather than trying to be aggressive and limiting it to + // Linux only. + if (length >= (32 * 4096)) + return false; + + ArgLength += length + 1; if (ArgLength > size_t(HalfArgMax)) { return false; } } + return true; } } diff --git a/lib/Support/Windows/Host.inc b/lib/Support/Windows/Host.inc index fe89fe0aad8c4..7e196cf0ce18a 100644 --- a/lib/Support/Windows/Host.inc +++ b/lib/Support/Windows/Host.inc @@ -18,5 +18,13 @@ using namespace llvm; std::string sys::getDefaultTargetTriple() { - return Triple::normalize(LLVM_DEFAULT_TARGET_TRIPLE); + const char *Triple = LLVM_DEFAULT_TARGET_TRIPLE; + + // Override the default target with an environment variable named by LLVM_TARGET_TRIPLE_ENV. 
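Looking back at the Unix/Program.inc hunk above: it layers a per-argument cap (Linux's MAX_ARG_STRLEN, 32 pages) onto the existing whole-command-line budget. The combined check, restated as a standalone sketch where halfArgMax stands in for the precomputed half of ARG_MAX:

  #include <cstddef>
  #include <cstring>

  // Reject the command line if any single argument hits MAX_ARG_STRLEN
  // or the running total (with NUL terminators) exceeds the budget.
  static bool fitsWithinLimits(const char *const *args, size_t numArgs,
                               size_t halfArgMax) {
    const size_t maxArgStrlen = 32 * 4096; // MAX_ARG_STRLEN on 4K pages
    size_t total = 0;
    for (size_t i = 0; i != numArgs; ++i) {
      size_t len = std::strlen(args[i]);
      if (len >= maxArgStrlen)
        return false;            // one argument alone is too long
      total += len + 1;
      if (total > halfArgMax)
        return false;
    }
    return true;
  }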
+#if defined(LLVM_TARGET_TRIPLE_ENV) + if (const char *EnvTriple = std::getenv(LLVM_TARGET_TRIPLE_ENV)) + Triple = EnvTriple; +#endif + + return Triple::normalize(Triple); } diff --git a/lib/Support/YAMLParser.cpp b/lib/Support/YAMLParser.cpp index 01ae3214453dc..e2f21a56a810a 100644 --- a/lib/Support/YAMLParser.cpp +++ b/lib/Support/YAMLParser.cpp @@ -1,4 +1,4 @@ -//===--- YAMLParser.cpp - Simple YAML parser ------------------------------===// +//===- YAMLParser.cpp - Simple YAML parser --------------------------------===// // // The LLVM Compiler Infrastructure // @@ -13,15 +13,29 @@ #include "llvm/Support/YAMLParser.h" #include "llvm/ADT/AllocatorList.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/None.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/SMLoc.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_ostream.h" +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <map> +#include <memory> +#include <string> +#include <system_error> +#include <utility> using namespace llvm; using namespace yaml; @@ -37,7 +51,7 @@ enum UnicodeEncodingForm { /// EncodingInfo - Holds the encoding type and length of the byte order mark if /// it exists. Length is in {0, 2, 3, 4}. -typedef std::pair<UnicodeEncodingForm, unsigned> EncodingInfo; +using EncodingInfo = std::pair<UnicodeEncodingForm, unsigned>; /// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode /// encoding form of \a Input. @@ -46,7 +60,7 @@ typedef std::pair<UnicodeEncodingForm, unsigned> EncodingInfo; /// @returns An EncodingInfo indicating the Unicode encoding form of the input /// and how long the byte order mark is if one exists. static EncodingInfo getUnicodeEncoding(StringRef Input) { - if (Input.size() == 0) + if (Input.empty()) return std::make_pair(UEF_Unknown, 0); switch (uint8_t(Input[0])) { @@ -95,8 +109,6 @@ static EncodingInfo getUnicodeEncoding(StringRef Input) { return std::make_pair(UEF_UTF8, 0); } -namespace llvm { -namespace yaml { /// Pin the vtables to this file. void Node::anchor() {} void NullNode::anchor() {} @@ -107,6 +119,9 @@ void MappingNode::anchor() {} void SequenceNode::anchor() {} void AliasNode::anchor() {} +namespace llvm { +namespace yaml { + /// Token - A single YAML token. struct Token { enum TokenKind { @@ -133,7 +148,7 @@ struct Token { TK_Alias, TK_Anchor, TK_Tag - } Kind; + } Kind = TK_Error; /// A string of length 0 or more whose begin() points to the logical location /// of the token in the input. @@ -142,14 +157,16 @@ struct Token { /// The value of a block scalar node. std::string Value; - Token() : Kind(TK_Error) {} + Token() = default; }; -} -} -typedef llvm::BumpPtrList<Token> TokenQueueT; +} // end namespace yaml +} // end namespace llvm + +using TokenQueueT = BumpPtrList<Token>; namespace { + /// @brief This struct is used to track simple keys. /// /// Simple keys are handled by creating an entry in SimpleKeys for each Token @@ -170,12 +187,13 @@ struct SimpleKey { return Tok == Other.Tok; } }; -} + +} // end anonymous namespace /// @brief The Unicode scalar value of a UTF-8 minimal well-formed code unit /// subsequence and the subsequence's length in code units (uint8_t). /// A length of 0 represents an error. 
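For context on the UTF8Decoded pair retyped just below: decoding one minimal well-formed subsequence walks a lead byte and up to three continuation bytes. A compact sketch of that shape (overlong and surrogate rejection omitted; the Scanner's real decodeUTF8 is stricter):

  #include <cstddef>
  #include <cstdint>
  #include <utility>

  using UTF8Decoded = std::pair<uint32_t, unsigned>;

  // Returns {code point, length in bytes}; a length of 0 reports an error.
  static UTF8Decoded decodeOne(const uint8_t *p, const uint8_t *end) {
    const UTF8Decoded error(0, 0);
    if (p == end)
      return error;
    unsigned len = *p < 0x80 ? 1
                 : (*p >> 5) == 0x6 ? 2       // 110xxxxx
                 : (*p >> 4) == 0xE ? 3       // 1110xxxx
                 : (*p >> 3) == 0x1E ? 4 : 0; // 11110xxx
    if (len == 0 || static_cast<size_t>(end - p) < len)
      return error;
    static const uint8_t leadMask[] = {0, 0x7F, 0x1F, 0x0F, 0x07};
    uint32_t cp = p[0] & leadMask[len];
    for (unsigned i = 1; i != len; ++i) {
      if ((p[i] & 0xC0) != 0x80) // not a continuation byte
        return error;
      cp = (cp << 6) | (p[i] & 0x3F);
    }
    return {cp, len};
  }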
-typedef std::pair<uint32_t, unsigned> UTF8Decoded; +using UTF8Decoded = std::pair<uint32_t, unsigned>; static UTF8Decoded decodeUTF8(StringRef Range) { StringRef::iterator Position= Range.begin(); @@ -229,6 +247,7 @@ static UTF8Decoded decodeUTF8(StringRef Range) { namespace llvm { namespace yaml { + /// @brief Scans YAML tokens from a MemoryBuffer. class Scanner { public: @@ -350,7 +369,8 @@ private: /// ns-char. StringRef::iterator skip_ns_char(StringRef::iterator Position); - typedef StringRef::iterator (Scanner::*SkipWhileFunc)(StringRef::iterator); + using SkipWhileFunc = StringRef::iterator (Scanner::*)(StringRef::iterator); + /// @brief Skip minimal well-formed code unit subsequences until Func /// returns its input. /// @@ -655,10 +675,10 @@ bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) { } bool yaml::scanTokens(StringRef Input) { - llvm::SourceMgr SM; - llvm::yaml::Scanner scanner(Input, SM); - for (;;) { - llvm::yaml::Token T = scanner.getNext(); + SourceMgr SM; + Scanner scanner(Input, SM); + while (true) { + Token T = scanner.getNext(); if (T.Kind == Token::TK_StreamEnd) break; else if (T.Kind == Token::TK_Error) @@ -1744,7 +1764,7 @@ Stream::Stream(MemoryBufferRef InputBuffer, SourceMgr &SM, bool ShowColors, std::error_code *EC) : scanner(new Scanner(InputBuffer, SM, ShowColors, EC)), CurrentDoc() {} -Stream::~Stream() {} +Stream::~Stream() = default; bool Stream::failed() { return scanner->failed(); } @@ -1851,8 +1871,6 @@ bool Node::failed() const { return Doc->failed(); } - - StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const { // TODO: Handle newlines properly. We need to remove leading whitespace. if (Value[0] == '"') { // Double quoted. diff --git a/lib/Support/YAMLTraits.cpp b/lib/Support/YAMLTraits.cpp index c410b1d560860..601084f9eae3c 100644 --- a/lib/Support/YAMLTraits.cpp +++ b/lib/Support/YAMLTraits.cpp @@ -8,17 +8,27 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/YAMLTraits.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Errc.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" #include "llvm/Support/LineIterator.h" +#include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/YAMLParser.h" #include "llvm/Support/raw_ostream.h" -#include <cctype> +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <cstdlib> #include <cstring> +#include <string> +#include <vector> + using namespace llvm; using namespace yaml; @@ -26,11 +36,9 @@ using namespace yaml; // IO //===----------------------------------------------------------------------===// -IO::IO(void *Context) : Ctxt(Context) { -} +IO::IO(void *Context) : Ctxt(Context) {} -IO::~IO() { -} +IO::~IO() = default; void *IO::getContext() { return Ctxt; @@ -46,15 +54,13 @@ void IO::setContext(void *Context) { Input::Input(StringRef InputContent, void *Ctxt, SourceMgr::DiagHandlerTy DiagHandler, void *DiagHandlerCtxt) - : IO(Ctxt), Strm(new Stream(InputContent, SrcMgr, false, &EC)), - CurrentNode(nullptr) { + : IO(Ctxt), Strm(new Stream(InputContent, SrcMgr, false, &EC)) { if (DiagHandler) SrcMgr.setDiagHandler(DiagHandler, DiagHandlerCtxt); DocIterator = Strm->begin(); } -Input::~Input() { -} +Input::~Input() = default; std::error_code Input::error() { return EC; } @@ -398,13 +404,9 @@ bool Input::canElideEmptySequence() 
{ //===----------------------------------------------------------------------===// Output::Output(raw_ostream &yout, void *context, int WrapColumn) - : IO(context), Out(yout), WrapColumn(WrapColumn), Column(0), - ColumnAtFlowStart(0), ColumnAtMapFlowStart(0), NeedBitValueComma(false), - NeedFlowSequenceComma(false), EnumerationMatchFound(false), - NeedsNewLine(false), WriteDefaultValues(false) {} + : IO(context), Out(yout), WrapColumn(WrapColumn) {} -Output::~Output() { -} +Output::~Output() = default; bool Output::outputting() { return true; @@ -911,12 +913,9 @@ void ScalarTraits<double>::output(const double &Val, void *, raw_ostream &Out) { } StringRef ScalarTraits<double>::input(StringRef Scalar, void *, double &Val) { - SmallString<32> buff(Scalar.begin(), Scalar.end()); - char *end; - Val = strtod(buff.c_str(), &end); - if (*end != '\0') - return "invalid floating point number"; - return StringRef(); + if (to_float(Scalar, Val)) + return StringRef(); + return "invalid floating point number"; } void ScalarTraits<float>::output(const float &Val, void *, raw_ostream &Out) { @@ -924,12 +923,9 @@ void ScalarTraits<float>::output(const float &Val, void *, raw_ostream &Out) { } StringRef ScalarTraits<float>::input(StringRef Scalar, void *, float &Val) { - SmallString<32> buff(Scalar.begin(), Scalar.end()); - char *end; - Val = strtod(buff.c_str(), &end); - if (*end != '\0') - return "invalid floating point number"; - return StringRef(); + if (to_float(Scalar, Val)) + return StringRef(); + return "invalid floating point number"; } void ScalarTraits<Hex8>::output(const Hex8 &Val, void *, raw_ostream &Out) { diff --git a/lib/Support/raw_ostream.cpp b/lib/Support/raw_ostream.cpp index 1abc8ed8683d5..9480cd46d28fc 100644 --- a/lib/Support/raw_ostream.cpp +++ b/lib/Support/raw_ostream.cpp @@ -548,7 +548,11 @@ void raw_fd_ostream::write_impl(const char *Ptr, size_t Size) { pos += Size; #ifndef LLVM_ON_WIN32 +#if defined(__linux__) + bool ShouldWriteInChunks = true; +#else bool ShouldWriteInChunks = false; +#endif #else // Writing a large size of output to Windows console returns ENOMEM. 
It seems // that, prior to Windows 8, WriteFile() is redirecting to WriteConsole(), and diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h index 3e0e3978b90b5..37b9690d0434a 100644 --- a/lib/Target/AArch64/AArch64.h +++ b/lib/Target/AArch64/AArch64.h @@ -31,6 +31,7 @@ class MachineFunctionPass; FunctionPass *createAArch64DeadRegisterDefinitions(); FunctionPass *createAArch64RedundantCopyEliminationPass(); +FunctionPass *createAArch64CondBrTuning(); FunctionPass *createAArch64ConditionalCompares(); FunctionPass *createAArch64AdvSIMDScalar(); FunctionPass *createAArch64ISelDag(AArch64TargetMachine &TM, @@ -55,6 +56,7 @@ void initializeAArch64A53Fix835769Pass(PassRegistry&); void initializeAArch64A57FPLoadBalancingPass(PassRegistry&); void initializeAArch64AdvSIMDScalarPass(PassRegistry&); void initializeAArch64CollectLOHPass(PassRegistry&); +void initializeAArch64CondBrTuningPass(PassRegistry &); void initializeAArch64ConditionalComparesPass(PassRegistry&); void initializeAArch64ConditionOptimizerPass(PassRegistry&); void initializeAArch64DeadRegisterDefinitionsPass(PassRegistry&); diff --git a/lib/Target/AArch64/AArch64CondBrTuning.cpp b/lib/Target/AArch64/AArch64CondBrTuning.cpp new file mode 100644 index 0000000000000..f27bc97ec3f3e --- /dev/null +++ b/lib/Target/AArch64/AArch64CondBrTuning.cpp @@ -0,0 +1,336 @@ +//===-- AArch64CondBrTuning.cpp --- Conditional branch tuning for AArch64 -===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file contains a pass that transforms CBZ/CBNZ/TBZ/TBNZ instructions +/// into a conditional branch (B.cond), when the NZCV flags can be set for +/// "free". This is preferred on targets that have more flexibility when +/// scheduling B.cond instructions as compared to CBZ/CBNZ/TBZ/TBNZ (assuming +/// all other variables are equal). This can also reduce register pressure. +/// +/// A few examples: +/// +/// 1) add w8, w0, w1 -> cmn w0, w1 ; CMN is an alias of ADDS. +/// cbz w8, .LBB_2 -> b.eq .LBB0_2 +/// +/// 2) add w8, w0, w1 -> adds w8, w0, w1 ; w8 has multiple uses. +/// cbz w8, .LBB1_2 -> b.eq .LBB1_2 +/// +/// 3) sub w8, w0, w1 -> subs w8, w0, w1 ; w8 has multiple uses. 
+/// tbz w8, #31, .LBB6_2 -> b.ge .LBB6_2 +/// +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64Subtarget.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineTraceMetrics.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-cond-br-tuning" +#define AARCH64_CONDBR_TUNING_NAME "AArch64 Conditional Branch Tuning" + +namespace { +class AArch64CondBrTuning : public MachineFunctionPass { + const AArch64InstrInfo *TII; + const TargetRegisterInfo *TRI; + + MachineRegisterInfo *MRI; + +public: + static char ID; + AArch64CondBrTuning() : MachineFunctionPass(ID) { + initializeAArch64CondBrTuningPass(*PassRegistry::getPassRegistry()); + } + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnMachineFunction(MachineFunction &MF) override; + StringRef getPassName() const override { return AARCH64_CONDBR_TUNING_NAME; } + +private: + MachineInstr *getOperandDef(const MachineOperand &MO); + MachineInstr *convertToFlagSetting(MachineInstr &MI, bool IsFlagSetting); + MachineInstr *convertToCondBr(MachineInstr &MI); + bool tryToTuneBranch(MachineInstr &MI, MachineInstr &DefMI); +}; +} // end anonymous namespace + +char AArch64CondBrTuning::ID = 0; + +INITIALIZE_PASS(AArch64CondBrTuning, "aarch64-cond-br-tuning", + AARCH64_CONDBR_TUNING_NAME, false, false) + +void AArch64CondBrTuning::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +MachineInstr *AArch64CondBrTuning::getOperandDef(const MachineOperand &MO) { + if (!TargetRegisterInfo::isVirtualRegister(MO.getReg())) + return nullptr; + return MRI->getUniqueVRegDef(MO.getReg()); +} + +MachineInstr *AArch64CondBrTuning::convertToFlagSetting(MachineInstr &MI, + bool IsFlagSetting) { + // If this is already the flag setting version of the instruction (e.g., SUBS) + // just make sure the implicit-def of NZCV isn't marked dead. + if (IsFlagSetting) { + for (unsigned I = MI.getNumExplicitOperands(), E = MI.getNumOperands(); + I != E; ++I) { + MachineOperand &MO = MI.getOperand(I); + if (MO.isReg() && MO.isDead() && MO.getReg() == AArch64::NZCV) + MO.setIsDead(false); + } + return &MI; + } + bool Is64Bit; + unsigned NewOpc = TII->convertToFlagSettingOpc(MI.getOpcode(), Is64Bit); + unsigned NewDestReg = MI.getOperand(0).getReg(); + if (MRI->hasOneNonDBGUse(MI.getOperand(0).getReg())) + NewDestReg = Is64Bit ? 
AArch64::XZR : AArch64::WZR; + + MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), + TII->get(NewOpc), NewDestReg); + for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I) + MIB.add(MI.getOperand(I)); + + return MIB; +} + +MachineInstr *AArch64CondBrTuning::convertToCondBr(MachineInstr &MI) { + AArch64CC::CondCode CC; + MachineBasicBlock *TargetMBB = TII->getBranchDestBlock(MI); + switch (MI.getOpcode()) { + default: + llvm_unreachable("Unexpected opcode!"); + + case AArch64::CBZW: + case AArch64::CBZX: + CC = AArch64CC::EQ; + break; + case AArch64::CBNZW: + case AArch64::CBNZX: + CC = AArch64CC::NE; + break; + case AArch64::TBZW: + case AArch64::TBZX: + CC = AArch64CC::GE; + break; + case AArch64::TBNZW: + case AArch64::TBNZX: + CC = AArch64CC::LT; + break; + } + return BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AArch64::Bcc)) + .addImm(CC) + .addMBB(TargetMBB); +} + +bool AArch64CondBrTuning::tryToTuneBranch(MachineInstr &MI, + MachineInstr &DefMI) { + // We don't want NZCV bits live across blocks. + if (MI.getParent() != DefMI.getParent()) + return false; + + bool IsFlagSetting = true; + unsigned MIOpc = MI.getOpcode(); + MachineInstr *NewCmp = nullptr, *NewBr = nullptr; + switch (DefMI.getOpcode()) { + default: + return false; + case AArch64::ADDWri: + case AArch64::ADDWrr: + case AArch64::ADDWrs: + case AArch64::ADDWrx: + case AArch64::ANDWri: + case AArch64::ANDWrr: + case AArch64::ANDWrs: + case AArch64::BICWrr: + case AArch64::BICWrs: + case AArch64::SUBWri: + case AArch64::SUBWrr: + case AArch64::SUBWrs: + case AArch64::SUBWrx: + IsFlagSetting = false; + case AArch64::ADDSWri: + case AArch64::ADDSWrr: + case AArch64::ADDSWrs: + case AArch64::ADDSWrx: + case AArch64::ANDSWri: + case AArch64::ANDSWrr: + case AArch64::ANDSWrs: + case AArch64::BICSWrr: + case AArch64::BICSWrs: + case AArch64::SUBSWri: + case AArch64::SUBSWrr: + case AArch64::SUBSWrs: + case AArch64::SUBSWrx: + switch (MIOpc) { + default: + llvm_unreachable("Unexpected opcode!"); + + case AArch64::CBZW: + case AArch64::CBNZW: + case AArch64::TBZW: + case AArch64::TBNZW: + // Check to see if the TBZ/TBNZ is checking the sign bit. + if ((MIOpc == AArch64::TBZW || MIOpc == AArch64::TBNZW) && + MI.getOperand(1).getImm() != 31) + return false; + + // There must not be any instruction between DefMI and MI that clobbers or + // reads NZCV. 
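The requirement just stated, which the loop that follows implements, is a straight linear scan between the defining instruction and the branch. Modeled standalone over simplified records (not the MachineInstr/TargetRegisterInfo API):

  #include <cstddef>
  #include <vector>

  struct Instr {
    bool readsNZCV;
    bool writesNZCV;
  };

  // True when everything strictly between Def and Use leaves NZCV alone,
  // so Def can safely become the flag-setting form feeding a B.cond.
  static bool flagsFreeBetween(const std::vector<Instr> &block,
                               size_t def, size_t use) {
    for (size_t i = def + 1; i < use; ++i)
      if (block[i].readsNZCV || block[i].writesNZCV)
        return false;
    return true;
  }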
+      MachineBasicBlock::iterator I(DefMI), E(MI);
+      for (I = std::next(I); I != E; ++I) {
+        if (I->modifiesRegister(AArch64::NZCV, TRI) ||
+            I->readsRegister(AArch64::NZCV, TRI))
+          return false;
+      }
+      DEBUG(dbgs() << "  Replacing instructions:\n    ");
+      DEBUG(DefMI.print(dbgs()));
+      DEBUG(dbgs() << "    ");
+      DEBUG(MI.print(dbgs()));
+
+      NewCmp = convertToFlagSetting(DefMI, IsFlagSetting);
+      NewBr = convertToCondBr(MI);
+      break;
+    }
+    break;
+
+  case AArch64::ADDXri:
+  case AArch64::ADDXrr:
+  case AArch64::ADDXrs:
+  case AArch64::ADDXrx:
+  case AArch64::ANDXri:
+  case AArch64::ANDXrr:
+  case AArch64::ANDXrs:
+  case AArch64::BICXrr:
+  case AArch64::BICXrs:
+  case AArch64::SUBXri:
+  case AArch64::SUBXrr:
+  case AArch64::SUBXrs:
+  case AArch64::SUBXrx:
+    IsFlagSetting = false;
+  case AArch64::ADDSXri:
+  case AArch64::ADDSXrr:
+  case AArch64::ADDSXrs:
+  case AArch64::ADDSXrx:
+  case AArch64::ANDSXri:
+  case AArch64::ANDSXrr:
+  case AArch64::ANDSXrs:
+  case AArch64::BICSXrr:
+  case AArch64::BICSXrs:
+  case AArch64::SUBSXri:
+  case AArch64::SUBSXrr:
+  case AArch64::SUBSXrs:
+  case AArch64::SUBSXrx:
+    switch (MIOpc) {
+    default:
+      llvm_unreachable("Unexpected opcode!");
+
+    case AArch64::CBZX:
+    case AArch64::CBNZX:
+    case AArch64::TBZX:
+    case AArch64::TBNZX: {
+      // Check to see if the TBZ/TBNZ is checking the sign bit.
+      if ((MIOpc == AArch64::TBZX || MIOpc == AArch64::TBNZX) &&
+          MI.getOperand(1).getImm() != 63)
+        return false;
+      // There must not be any instruction between DefMI and MI that clobbers or
+      // reads NZCV.
+      MachineBasicBlock::iterator I(DefMI), E(MI);
+      for (I = std::next(I); I != E; ++I) {
+        if (I->modifiesRegister(AArch64::NZCV, TRI) ||
+            I->readsRegister(AArch64::NZCV, TRI))
+          return false;
+      }
+      DEBUG(dbgs() << "  Replacing instructions:\n    ");
+      DEBUG(DefMI.print(dbgs()));
+      DEBUG(dbgs() << "    ");
+      DEBUG(MI.print(dbgs()));
+
+      NewCmp = convertToFlagSetting(DefMI, IsFlagSetting);
+      NewBr = convertToCondBr(MI);
+      break;
+    }
+    }
+    break;
+  }
+  assert(NewCmp && NewBr && "Expected new instructions.");
+
+  DEBUG(dbgs() << "    with instruction:\n    ");
+  DEBUG(NewCmp->print(dbgs()));
+  DEBUG(dbgs() << "    ");
+  DEBUG(NewBr->print(dbgs()));
+
+  // If this was a flag setting version of the instruction, we use the original
+  // instruction by just clearing the dead flag on the implicit-def of NZCV.
+  // Therefore, we should not erase this instruction.
+  if (!IsFlagSetting)
+    DefMI.eraseFromParent();
+  MI.eraseFromParent();
+  return true;
+}
+
+bool AArch64CondBrTuning::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  DEBUG(dbgs() << "********** AArch64 Conditional Branch Tuning  **********\n"
+               << "********** Function: " << MF.getName() << '\n');
+
+  TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
+  TRI = MF.getSubtarget().getRegisterInfo();
+  MRI = &MF.getRegInfo();
+
+  bool Changed = false;
+  for (MachineBasicBlock &MBB : MF) {
+    bool LocalChange = false;
+    for (MachineBasicBlock::iterator I = MBB.getFirstTerminator(),
+                                     E = MBB.end();
+         I != E; ++I) {
+      MachineInstr &MI = *I;
+      switch (MI.getOpcode()) {
+      default:
+        break;
+      case AArch64::CBZW:
+      case AArch64::CBZX:
+      case AArch64::CBNZW:
+      case AArch64::CBNZX:
+      case AArch64::TBZW:
+      case AArch64::TBZX:
+      case AArch64::TBNZW:
+      case AArch64::TBNZX:
+        MachineInstr *DefMI = getOperandDef(MI.getOperand(0));
+        LocalChange = (DefMI && tryToTuneBranch(MI, *DefMI));
+        break;
+      }
+      // If the optimization was successful, we can't optimize any other
+      // branches because doing so would clobber the NZCV flags.
+      if (LocalChange) {
+        Changed = true;
+        break;
+      }
+    }
+  }
+  return Changed;
+}
+
+FunctionPass *llvm::createAArch64CondBrTuning() {
+  return new AArch64CondBrTuning();
+}
diff --git a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
index 544f67433fd53..ee54550c9900b 100644
--- a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
+++ b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
@@ -13,7 +13,9 @@
 #include "AArch64.h"
 #include "AArch64RegisterInfo.h"
+#include "AArch64Subtarget.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
@@ -84,6 +86,51 @@ void AArch64DeadRegisterDefinitions::processMachineBasicBlock(
       DEBUG(dbgs() << "  Ignoring, XZR or WZR already used by the instruction\n");
       continue;
     }
+    if (MF.getSubtarget<AArch64Subtarget>().hasLSE()) {
+      // XZ/WZ for LSE can only be used when acquire semantics are not used;
+      // LDOPAL WZ is an invalid opcode.
+ switch (MI.getOpcode()) { + case AArch64::CASALb: + case AArch64::CASALh: + case AArch64::CASALs: + case AArch64::CASALd: + case AArch64::SWPALb: + case AArch64::SWPALh: + case AArch64::SWPALs: + case AArch64::SWPALd: + case AArch64::LDADDALb: + case AArch64::LDADDALh: + case AArch64::LDADDALs: + case AArch64::LDADDALd: + case AArch64::LDEORALb: + case AArch64::LDEORALh: + case AArch64::LDEORALs: + case AArch64::LDEORALd: + case AArch64::LDSETALb: + case AArch64::LDSETALh: + case AArch64::LDSETALs: + case AArch64::LDSETALd: + case AArch64::LDSMINALb: + case AArch64::LDSMINALh: + case AArch64::LDSMINALs: + case AArch64::LDSMINALd: + case AArch64::LDSMAXALb: + case AArch64::LDSMAXALh: + case AArch64::LDSMAXALs: + case AArch64::LDSMAXALd: + case AArch64::LDUMINALb: + case AArch64::LDUMINALh: + case AArch64::LDUMINALs: + case AArch64::LDUMINALd: + case AArch64::LDUMAXALb: + case AArch64::LDUMAXALh: + case AArch64::LDUMAXALs: + case AArch64::LDUMAXALd: + continue; + default: + break; + } + } const MCInstrDesc &Desc = MI.getDesc(); for (int I = 0, E = Desc.getNumDefs(); I != E; ++I) { MachineOperand &MO = MI.getOperand(I); diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 8c2c0a564c302..04687847c1a30 100644 --- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -201,7 +201,7 @@ private: bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, unsigned Width); - void SelectCMP_SWAP(SDNode *N); + bool SelectCMP_SWAP(SDNode *N); }; } // end anonymous namespace @@ -2609,9 +2609,13 @@ bool AArch64DAGToDAGISel::tryWriteRegister(SDNode *N) { } /// We've got special pseudo-instructions for these -void AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) { +bool AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) { unsigned Opcode; EVT MemTy = cast<MemSDNode>(N)->getMemoryVT(); + + // Leave IR for LSE if subtarget supports it. + if (Subtarget->hasLSE()) return false; + if (MemTy == MVT::i8) Opcode = AArch64::CMP_SWAP_8; else if (MemTy == MVT::i16) @@ -2637,6 +2641,8 @@ void AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) { ReplaceUses(SDValue(N, 0), SDValue(CmpSwap, 0)); ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 2)); CurDAG->RemoveDeadNode(N); + + return true; } void AArch64DAGToDAGISel::Select(SDNode *Node) { @@ -2660,8 +2666,9 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; case ISD::ATOMIC_CMP_SWAP: - SelectCMP_SWAP(Node); - return; + if (SelectCMP_SWAP(Node)) + return; + break; case ISD::READ_REGISTER: if (tryReadRegister(Node)) diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 083ca2156598f..2965106fd2708 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -10563,11 +10563,20 @@ AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { TargetLowering::AtomicExpansionKind AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { unsigned Size = AI->getType()->getPrimitiveSizeInBits(); - return Size <= 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None; + if (Size > 128) return AtomicExpansionKind::None; + // Nand not supported in LSE. + if (AI->getOperation() == AtomicRMWInst::Nand) return AtomicExpansionKind::LLSC; + // Currently leaving And and Sub to LLSC + if ((AI->getOperation() == AtomicRMWInst::And) || (AI->getOperation() == AtomicRMWInst::Sub)) + return AtomicExpansionKind::LLSC; + // Leave 128 bits to LLSC. 
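This shouldExpandAtomicRMWInIR hunk (its final return continues just below) encodes a small decision table: oversize operations are left alone, Nand has no LSE encoding, And and Sub are deliberately kept on load/store-exclusive loops, and everything else goes native when LSE is present. Restated under simplified stand-in types:

  enum class RMWOp { Add, Sub, And, Nand, Or, Xor, Max, Min };
  enum class Expansion { None, LLSC };

  // Sketch of the selection policy; the real code inspects the
  // AtomicRMWInst and the subtarget.
  static Expansion expandAtomicRMW(bool hasLSE, RMWOp op, unsigned bits) {
    if (bits > 128)
      return Expansion::None;  // too wide to expand here
    if (op == RMWOp::Nand || op == RMWOp::And || op == RMWOp::Sub)
      return Expansion::LLSC;  // no LSE encoding / intentionally LLSC
    return (hasLSE && bits < 128) ? Expansion::None : Expansion::LLSC;
  }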
+ return (Subtarget->hasLSE() && Size < 128) ? AtomicExpansionKind::None : AtomicExpansionKind::LLSC; } bool AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR( AtomicCmpXchgInst *AI) const { + // If subtarget has LSE, leave cmpxchg intact for codegen. + if (Subtarget->hasLSE()) return false; // At -O0, fast-regalloc cannot cope with the live vregs necessary to // implement cmpxchg without spilling. If the address being exchanged is also // on the stack and close enough to the spill slot, this can lead to a diff --git a/lib/Target/AArch64/AArch64InstrAtomics.td b/lib/Target/AArch64/AArch64InstrAtomics.td index 71826bec6b11f..de283b70210fe 100644 --- a/lib/Target/AArch64/AArch64InstrAtomics.td +++ b/lib/Target/AArch64/AArch64InstrAtomics.td @@ -405,3 +405,49 @@ def CMP_SWAP_128 : Pseudo<(outs GPR64:$RdLo, GPR64:$RdHi, GPR32:$scratch), (ins GPR64:$addr, GPR64:$desiredLo, GPR64:$desiredHi, GPR64:$newLo, GPR64:$newHi), []>, Sched<[WriteAtomic]>; + +// v8.1 Atomic instructions: +def : Pat<(atomic_load_add_8 GPR64:$Rn, GPR32:$Rs), (LDADDALb GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_add_16 GPR64:$Rn, GPR32:$Rs), (LDADDALh GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_add_32 GPR64:$Rn, GPR32:$Rs), (LDADDALs GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_add_64 GPR64:$Rn, GPR64:$Rs), (LDADDALd GPR64:$Rs, GPR64sp:$Rn)>; + +def : Pat<(atomic_load_or_8 GPR64:$Rn, GPR32:$Rs), (LDSETALb GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_or_16 GPR64:$Rn, GPR32:$Rs), (LDSETALh GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_or_32 GPR64:$Rn, GPR32:$Rs), (LDSETALs GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_or_64 GPR64:$Rn, GPR64:$Rs), (LDSETALd GPR64:$Rs, GPR64sp:$Rn)>; + +def : Pat<(atomic_load_xor_8 GPR64:$Rn, GPR32:$Rs), (LDEORALb GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_xor_16 GPR64:$Rn, GPR32:$Rs), (LDEORALh GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_xor_32 GPR64:$Rn, GPR32:$Rs), (LDEORALs GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_xor_64 GPR64:$Rn, GPR64:$Rs), (LDEORALd GPR64:$Rs, GPR64sp:$Rn)>; + +def : Pat<(atomic_load_max_8 GPR64:$Rn, GPR32:$Rs), (LDSMAXALb GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_max_16 GPR64:$Rn, GPR32:$Rs), (LDSMAXALh GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_max_32 GPR64:$Rn, GPR32:$Rs), (LDSMAXALs GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_max_64 GPR64:$Rn, GPR64:$Rs), (LDSMAXALd GPR64:$Rs, GPR64sp:$Rn)>; + +def : Pat<(atomic_load_umax_8 GPR64:$Rn, GPR32:$Rs), (LDUMAXALb GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_umax_16 GPR64:$Rn, GPR32:$Rs), (LDUMAXALh GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_umax_32 GPR64:$Rn, GPR32:$Rs), (LDUMAXALs GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_umax_64 GPR64:$Rn, GPR64:$Rs), (LDUMAXALd GPR64:$Rs, GPR64sp:$Rn)>; + +def : Pat<(atomic_load_min_8 GPR64:$Rn, GPR32:$Rs), (LDSMINALb GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_min_16 GPR64:$Rn, GPR32:$Rs), (LDSMINALh GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_min_32 GPR64:$Rn, GPR32:$Rs), (LDSMINALs GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_min_64 GPR64:$Rn, GPR64:$Rs), (LDSMINALd GPR64:$Rs, GPR64sp:$Rn)>; + +def : Pat<(atomic_load_umin_8 GPR64:$Rn, GPR32:$Rs), (LDUMINALb GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_umin_16 GPR64:$Rn, GPR32:$Rs), (LDUMINALh GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_umin_32 GPR64:$Rn, GPR32:$Rs), (LDUMINALs GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_umin_64 GPR64:$Rn, GPR64:$Rs), (LDUMINALd GPR64:$Rs, GPR64sp:$Rn)>; + +def : 
Pat<(atomic_cmp_swap_8 GPR64:$Rn, GPR32:$Rold, GPR32:$Rnew), (CASALb GPR32:$Rold, GPR32:$Rnew, GPR64sp:$Rn)>; +def : Pat<(atomic_cmp_swap_16 GPR64:$Rn, GPR32:$Rold, GPR32:$Rnew), (CASALh GPR32:$Rold, GPR32:$Rnew, GPR64sp:$Rn)>; +def : Pat<(atomic_cmp_swap_32 GPR64:$Rn, GPR32:$Rold, GPR32:$Rnew), (CASALs GPR32:$Rold, GPR32:$Rnew, GPR64sp:$Rn)>; +def : Pat<(atomic_cmp_swap_64 GPR64:$Rn, GPR64:$Rold, GPR64:$Rnew), (CASALd GPR64:$Rold, GPR64:$Rnew, GPR64sp:$Rn)>; + +def : Pat<(atomic_swap_8 GPR64:$Rn, GPR32:$Rs), (SWPALb GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_swap_16 GPR64:$Rn, GPR32:$Rs), (SWPALh GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_swap_32 GPR64:$Rn, GPR32:$Rs), (SWPALs GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_swap_64 GPR64:$Rn, GPR64:$Rs), (SWPALd GPR64:$Rs, GPR64sp:$Rn)>; diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp index eea012382150c..314e89bbca863 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -1036,7 +1036,7 @@ static bool UpdateOperandRegClass(MachineInstr &Instr) { /// \brief Return the opcode that does not set flags when possible - otherwise /// return the original opcode. The caller is responsible to do the actual /// substitution and legality checking. -static unsigned convertFlagSettingOpcode(const MachineInstr &MI) { +static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) { // Don't convert all compare instructions, because for some the zero register // encoding becomes the sp register. bool MIDefinesZeroReg = false; @@ -1145,7 +1145,7 @@ bool AArch64InstrInfo::optimizeCompareInstr( return true; } unsigned Opc = CmpInstr.getOpcode(); - unsigned NewOpc = convertFlagSettingOpcode(CmpInstr); + unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr); if (NewOpc == Opc) return false; const MCInstrDesc &MCID = get(NewOpc); @@ -3318,7 +3318,7 @@ static bool getMaddPatterns(MachineInstr &Root, // When NZCV is live bail out. if (Cmp_NZCV == -1) return false; - unsigned NewOpc = convertFlagSettingOpcode(Root); + unsigned NewOpc = convertToNonFlagSettingOpc(Root); // When opcode can't change bail out. // CHECKME: do we miss any cases for opcode conversion? if (NewOpc == Opc) diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h index 59f3405fe439a..58e9ce583d44c 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.h +++ b/lib/Target/AArch64/AArch64InstrInfo.h @@ -119,6 +119,44 @@ public: } } + /// \brief Return the opcode that set flags when possible. The caller is + /// responsible for ensuring the opc has a flag setting equivalent. 
+ static unsigned convertToFlagSettingOpc(unsigned Opc, bool &Is64Bit) { + switch (Opc) { + default: + llvm_unreachable("Opcode has no flag setting equivalent!"); + // 32-bit cases: + case AArch64::ADDWri: Is64Bit = false; return AArch64::ADDSWri; + case AArch64::ADDWrr: Is64Bit = false; return AArch64::ADDSWrr; + case AArch64::ADDWrs: Is64Bit = false; return AArch64::ADDSWrs; + case AArch64::ADDWrx: Is64Bit = false; return AArch64::ADDSWrx; + case AArch64::ANDWri: Is64Bit = false; return AArch64::ANDSWri; + case AArch64::ANDWrr: Is64Bit = false; return AArch64::ANDSWrr; + case AArch64::ANDWrs: Is64Bit = false; return AArch64::ANDSWrs; + case AArch64::BICWrr: Is64Bit = false; return AArch64::BICSWrr; + case AArch64::BICWrs: Is64Bit = false; return AArch64::BICSWrs; + case AArch64::SUBWri: Is64Bit = false; return AArch64::SUBSWri; + case AArch64::SUBWrr: Is64Bit = false; return AArch64::SUBSWrr; + case AArch64::SUBWrs: Is64Bit = false; return AArch64::SUBSWrs; + case AArch64::SUBWrx: Is64Bit = false; return AArch64::SUBSWrx; + // 64-bit cases: + case AArch64::ADDXri: Is64Bit = true; return AArch64::ADDSXri; + case AArch64::ADDXrr: Is64Bit = true; return AArch64::ADDSXrr; + case AArch64::ADDXrs: Is64Bit = true; return AArch64::ADDSXrs; + case AArch64::ADDXrx: Is64Bit = true; return AArch64::ADDSXrx; + case AArch64::ANDXri: Is64Bit = true; return AArch64::ANDSXri; + case AArch64::ANDXrr: Is64Bit = true; return AArch64::ANDSXrr; + case AArch64::ANDXrs: Is64Bit = true; return AArch64::ANDSXrs; + case AArch64::BICXrr: Is64Bit = true; return AArch64::BICSXrr; + case AArch64::BICXrs: Is64Bit = true; return AArch64::BICSXrs; + case AArch64::SUBXri: Is64Bit = true; return AArch64::SUBSXri; + case AArch64::SUBXrr: Is64Bit = true; return AArch64::SUBSXrr; + case AArch64::SUBXrs: Is64Bit = true; return AArch64::SUBSXrs; + case AArch64::SUBXrx: Is64Bit = true; return AArch64::SUBSXrx; + } + } + + /// Return true if this is a load/store that can be potentially paired/merged. bool isCandidateToMergeOrPair(MachineInstr &MI) const; diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index 9243eb91cc1ac..005f2d51e4036 100644 --- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -795,6 +795,7 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, int LoadSize = getMemScale(*LoadI); int StoreSize = getMemScale(*StoreI); unsigned LdRt = getLdStRegOp(*LoadI).getReg(); + const MachineOperand &StMO = getLdStRegOp(*StoreI); unsigned StRt = getLdStRegOp(*StoreI).getReg(); bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt); @@ -807,7 +808,13 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, // Remove the load, if the destination register of the loads is the same // register for stored value. if (StRt == LdRt && LoadSize == 8) { - StoreI->clearRegisterKills(StRt, TRI); + for (MachineInstr &MI : make_range(StoreI->getIterator(), + LoadI->getIterator())) { + if (MI.killsRegister(StRt, TRI)) { + MI.clearRegisterKills(StRt, TRI); + break; + } + } DEBUG(dbgs() << "Remove load instruction:\n "); DEBUG(LoadI->print(dbgs())); DEBUG(dbgs() << "\n"); @@ -819,7 +826,7 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt) .addReg(IsStoreXReg ? 
AArch64::XZR : AArch64::WZR) - .addReg(StRt) + .add(StMO) .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); } else { // FIXME: Currently we disable this transformation in big-endian targets as @@ -860,14 +867,14 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri), DestReg) - .addReg(StRt) + .add(StMO) .addImm(AndMaskEncoded); } else { BitExtMI = BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri), DestReg) - .addReg(StRt) + .add(StMO) .addImm(Immr) .addImm(Imms); } @@ -876,7 +883,10 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, // Clear kill flags between store and load. for (MachineInstr &MI : make_range(StoreI->getIterator(), BitExtMI->getIterator())) - MI.clearRegisterKills(StRt, TRI); + if (MI.killsRegister(StRt, TRI)) { + MI.clearRegisterKills(StRt, TRI); + break; + } DEBUG(dbgs() << "Promoting load by replacing :\n "); DEBUG(StoreI->print(dbgs())); diff --git a/lib/Target/AArch64/AArch64MacroFusion.cpp b/lib/Target/AArch64/AArch64MacroFusion.cpp index 3b71d529db59b..ccc9d2ad1b482 100644 --- a/lib/Target/AArch64/AArch64MacroFusion.cpp +++ b/lib/Target/AArch64/AArch64MacroFusion.cpp @@ -7,37 +7,27 @@ // //===----------------------------------------------------------------------===// // -// \file This file contains the AArch64 implementation of the DAG scheduling mutation -// to pair instructions back to back. +/// \file This file contains the AArch64 implementation of the DAG scheduling +/// mutation to pair instructions back to back. // //===----------------------------------------------------------------------===// #include "AArch64MacroFusion.h" #include "AArch64Subtarget.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Support/CommandLine.h" +#include "llvm/CodeGen/MacroFusion.h" #include "llvm/Target/TargetInstrInfo.h" -#define DEBUG_TYPE "misched" - -STATISTIC(NumFused, "Number of instr pairs fused"); - using namespace llvm; -static cl::opt<bool> EnableMacroFusion("aarch64-misched-fusion", cl::Hidden, - cl::desc("Enable scheduling for macro fusion."), cl::init(true)); - namespace { -/// \brief Verify that the instr pair, FirstMI and SecondMI, should be fused -/// together. Given an anchor instr, when the other instr is unspecified, then -/// check if the anchor instr may be part of a fused pair at all. +/// \brief Check if the instr pair, FirstMI and SecondMI, should be fused +/// together. Given SecondMI, when FirstMI is unspecified, then check if +/// SecondMI may be part of a fused pair at all. static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, const TargetSubtargetInfo &TSI, const MachineInstr *FirstMI, - const MachineInstr *SecondMI) { - assert((FirstMI || SecondMI) && "At least one instr must be specified"); - + const MachineInstr &SecondMI) { const AArch64InstrInfo &II = static_cast<const AArch64InstrInfo&>(TII); const AArch64Subtarget &ST = static_cast<const AArch64Subtarget&>(TSI); @@ -45,9 +35,7 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, unsigned FirstOpcode = FirstMI ? FirstMI->getOpcode() : static_cast<unsigned>(AArch64::INSTRUCTION_LIST_END); - unsigned SecondOpcode = - SecondMI ? SecondMI->getOpcode() - : static_cast<unsigned>(AArch64::INSTRUCTION_LIST_END); + unsigned SecondOpcode = SecondMI.getOpcode(); if (ST.hasArithmeticBccFusion()) // Fuse CMN, CMP, TST followed by Bcc. 
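The rework in the next hunk re-keys shouldScheduleAdjacent on SecondMI, so the scheduler can ask whether an instruction could terminate a fused pair before any partner is known (FirstOpcode then degenerates to INSTRUCTION_LIST_END). The shape of that predicate for the MOVZ/MOVK literal pairing, with stand-in enumerators:

  enum Opcode { MOVZWi, MOVKWi, Other, ListEnd };

  // Keyed on the second instruction of the candidate pair: with
  // firstOp == ListEnd the query means "could secondOp end any pair?".
  static bool shouldFuseLiteral(Opcode firstOp, Opcode secondOp) {
    switch (secondOp) {
    case MOVKWi: // upper half of a 32-bit immediate
      return firstOp == MOVZWi || firstOp == ListEnd;
    default:
      return false;
    }
  }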
@@ -128,158 +116,49 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, if (ST.hasFuseAES()) // Fuse AES crypto operations. - switch(FirstOpcode) { + switch(SecondOpcode) { // AES encode. - case AArch64::AESErr: - return SecondOpcode == AArch64::AESMCrr || - SecondOpcode == AArch64::INSTRUCTION_LIST_END; + case AArch64::AESMCrr : + return FirstOpcode == AArch64::AESErr || + FirstOpcode == AArch64::INSTRUCTION_LIST_END; // AES decode. - case AArch64::AESDrr: - return SecondOpcode == AArch64::AESIMCrr || - SecondOpcode == AArch64::INSTRUCTION_LIST_END; + case AArch64::AESIMCrr: + return FirstOpcode == AArch64::AESDrr || + FirstOpcode == AArch64::INSTRUCTION_LIST_END; } if (ST.hasFuseLiterals()) // Fuse literal generation operations. - switch (FirstOpcode) { + switch (SecondOpcode) { // PC relative address. - case AArch64::ADRP: - return SecondOpcode == AArch64::ADDXri || - SecondOpcode == AArch64::INSTRUCTION_LIST_END; + case AArch64::ADDXri: + return FirstOpcode == AArch64::ADRP || + FirstOpcode == AArch64::INSTRUCTION_LIST_END; // 32 bit immediate. - case AArch64::MOVZWi: - return (SecondOpcode == AArch64::MOVKWi && - SecondMI->getOperand(3).getImm() == 16) || - SecondOpcode == AArch64::INSTRUCTION_LIST_END; - // Lower half of 64 bit immediate. - case AArch64::MOVZXi: - return (SecondOpcode == AArch64::MOVKXi && - SecondMI->getOperand(3).getImm() == 16) || - SecondOpcode == AArch64::INSTRUCTION_LIST_END; - // Upper half of 64 bit immediate. + case AArch64::MOVKWi: + return (FirstOpcode == AArch64::MOVZWi && + SecondMI.getOperand(3).getImm() == 16) || + FirstOpcode == AArch64::INSTRUCTION_LIST_END; + // Lower and upper half of 64 bit immediate. case AArch64::MOVKXi: - return FirstMI->getOperand(3).getImm() == 32 && - ((SecondOpcode == AArch64::MOVKXi && - SecondMI->getOperand(3).getImm() == 48) || - SecondOpcode == AArch64::INSTRUCTION_LIST_END); + return FirstOpcode == AArch64::INSTRUCTION_LIST_END || + (FirstOpcode == AArch64::MOVZXi && + SecondMI.getOperand(3).getImm() == 16) || + (FirstOpcode == AArch64::MOVKXi && + FirstMI->getOperand(3).getImm() == 32 && + SecondMI.getOperand(3).getImm() == 48); } return false; } -/// \brief Implement the fusion of instr pairs in the scheduling DAG, -/// anchored at the instr in AnchorSU.. -static bool scheduleAdjacentImpl(ScheduleDAGMI *DAG, SUnit &AnchorSU) { - const MachineInstr *AnchorMI = AnchorSU.getInstr(); - if (!AnchorMI || AnchorMI->isPseudo() || AnchorMI->isTransient()) - return false; - - // If the anchor instr is the ExitSU, then consider its predecessors; - // otherwise, its successors. - bool Preds = (&AnchorSU == &DAG->ExitSU); - SmallVectorImpl<SDep> &AnchorDeps = Preds ? AnchorSU.Preds : AnchorSU.Succs; - - const MachineInstr *FirstMI = Preds ? nullptr : AnchorMI; - const MachineInstr *SecondMI = Preds ? AnchorMI : nullptr; - - // Check if the anchor instr may be fused. - if (!shouldScheduleAdjacent(*DAG->TII, DAG->MF.getSubtarget(), - FirstMI, SecondMI)) - return false; - - // Explorer for fusion candidates among the dependencies of the anchor instr. - for (SDep &Dep : AnchorDeps) { - // Ignore dependencies that don't enforce ordering. - if (Dep.isWeak()) - continue; - - SUnit &DepSU = *Dep.getSUnit(); - // Ignore the ExitSU if the dependents are successors. - if (!Preds && &DepSU == &DAG->ExitSU) - continue; - - const MachineInstr *DepMI = DepSU.getInstr(); - if (!DepMI || DepMI->isPseudo() || DepMI->isTransient()) - continue; - - FirstMI = Preds ? DepMI : AnchorMI; - SecondMI = Preds ? 
AnchorMI : DepMI; - if (!shouldScheduleAdjacent(*DAG->TII, DAG->MF.getSubtarget(), - FirstMI, SecondMI)) - continue; - - // Create a single weak edge between the adjacent instrs. The only effect is - // to cause bottom-up scheduling to heavily prioritize the clustered instrs. - SUnit &FirstSU = Preds ? DepSU : AnchorSU; - SUnit &SecondSU = Preds ? AnchorSU : DepSU; - DAG->addEdge(&SecondSU, SDep(&FirstSU, SDep::Cluster)); - - // Adjust the latency between the anchor instr and its - // predecessors/successors. - for (SDep &IDep : AnchorDeps) - if (IDep.getSUnit() == &DepSU) - IDep.setLatency(0); - - // Adjust the latency between the dependent instr and its - // successors/predecessors. - for (SDep &IDep : Preds ? DepSU.Succs : DepSU.Preds) - if (IDep.getSUnit() == &AnchorSU) - IDep.setLatency(0); - - DEBUG(dbgs() << DAG->MF.getName() << "(): Macro fuse "; - FirstSU.print(dbgs(), DAG); dbgs() << " - "; - SecondSU.print(dbgs(), DAG); dbgs() << " / "; - dbgs() << DAG->TII->getName(FirstMI->getOpcode()) << " - " << - DAG->TII->getName(SecondMI->getOpcode()) << '\n'; ); - - if (&SecondSU != &DAG->ExitSU) - // Make instructions dependent on FirstSU also dependent on SecondSU to - // prevent them from being scheduled between FirstSU and and SecondSU. - for (SUnit::const_succ_iterator - SI = FirstSU.Succs.begin(), SE = FirstSU.Succs.end(); - SI != SE; ++SI) { - if (!SI->getSUnit() || SI->getSUnit() == &SecondSU) - continue; - DEBUG(dbgs() << " Copy Succ "; - SI->getSUnit()->print(dbgs(), DAG); dbgs() << '\n';); - DAG->addEdge(SI->getSUnit(), SDep(&SecondSU, SDep::Artificial)); - } - - ++NumFused; - return true; - } - - return false; -} - -/// \brief Post-process the DAG to create cluster edges between instrs that may -/// be fused by the processor into a single operation. -class AArch64MacroFusion : public ScheduleDAGMutation { -public: - AArch64MacroFusion() {} - - void apply(ScheduleDAGInstrs *DAGInstrs) override; -}; - -void AArch64MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) { - ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs); - - // For each of the SUnits in the scheduling block, try to fuse the instr in it - // with one in its successors. - for (SUnit &ISU : DAG->SUnits) - scheduleAdjacentImpl(DAG, ISU); - - // Try to fuse the instr in the ExitSU with one in its predecessors. - scheduleAdjacentImpl(DAG, DAG->ExitSU); -} - } // end namespace namespace llvm { std::unique_ptr<ScheduleDAGMutation> createAArch64MacroFusionDAGMutation () { - return EnableMacroFusion ? make_unique<AArch64MacroFusion>() : nullptr; + return createMacroFusionDAGMutation(shouldScheduleAdjacent); } } // end namespace llvm diff --git a/lib/Target/AArch64/AArch64MacroFusion.h b/lib/Target/AArch64/AArch64MacroFusion.h index e5efedd9fbfd9..32d90d4c40d6f 100644 --- a/lib/Target/AArch64/AArch64MacroFusion.h +++ b/lib/Target/AArch64/AArch64MacroFusion.h @@ -2,23 +2,18 @@ // // The LLVM Compiler Infrastructure // -// \fileThis file is distributed under the University of Illinois Open Source +// This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // -// This file contains the AArch64 definition of the DAG scheduling mutation -// to pair instructions back to back. +/// \file This file contains the AArch64 definition of the DAG scheduling +/// mutation to pair instructions back to back. 
// //===----------------------------------------------------------------------===// -#include "AArch64InstrInfo.h" #include "llvm/CodeGen/MachineScheduler.h" -//===----------------------------------------------------------------------===// -// AArch64MacroFusion - DAG post-processing to encourage fusion of macro ops. -//===----------------------------------------------------------------------===// - namespace llvm { /// Note that you have to add: diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp index 9b3899e0681cf..69124dbd0f838 100644 --- a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp @@ -469,10 +469,6 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { getCopyMapping(DstRB.getID(), SrcRB.getID(), Size), /*NumOperands*/ 2); } - case TargetOpcode::G_SEQUENCE: - // FIXME: support this, but the generic code is really not going to do - // anything sane. - return getInvalidInstructionMapping(); default: break; } diff --git a/lib/Target/AArch64/AArch64SchedA57.td b/lib/Target/AArch64/AArch64SchedA57.td index 303398ea0b7f3..5d1608ef04afa 100644 --- a/lib/Target/AArch64/AArch64SchedA57.td +++ b/lib/Target/AArch64/AArch64SchedA57.td @@ -13,7 +13,7 @@ //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// -// The Cortex-A57 is a traditional superscaler microprocessor with a +// The Cortex-A57 is a traditional superscalar microprocessor with a // conservative 3-wide in-order stage for decode and dispatch. Combined with the // much wider out-of-order issue stage, this produced a need to carefully // schedule micro-ops so that all three decoded each cycle are successfully diff --git a/lib/Target/AArch64/AArch64SchedFalkorDetails.td b/lib/Target/AArch64/AArch64SchedFalkorDetails.td index 3d737402022d8..0aeb1f3e30584 100644 --- a/lib/Target/AArch64/AArch64SchedFalkorDetails.td +++ b/lib/Target/AArch64/AArch64SchedFalkorDetails.td @@ -32,8 +32,8 @@ //===----------------------------------------------------------------------===// // Define 0 micro-op types -def FalkorWr_none_1cyc : SchedWriteRes<[]> { - let Latency = 1; +def FalkorWr_LdStInc_none_3cyc : SchedWriteRes<[]> { + let Latency = 3; let NumMicroOps = 0; } def FalkorWr_none_3cyc : SchedWriteRes<[]> { @@ -505,7 +505,8 @@ def FalkorWr_4VXVY_4ST_4VSD_0cyc: SchedWriteRes<[FalkorUnitVXVY, FalkorUnitST, let NumMicroOps = 12; } -// Forwarding logic is modeled for multiply add/accumulate. +// Forwarding logic is modeled for multiply add/accumulate and +// load/store base register increment. 
// ----------------------------------------------------------------------------- def FalkorReadIMA32 : SchedReadAdvance<3, [FalkorWr_IMUL32_1X_2cyc]>; def FalkorReadIMA64 : SchedReadAdvance<4, [FalkorWr_IMUL64_1X_4cyc, FalkorWr_IMUL64_1X_5cyc]>; @@ -513,9 +514,13 @@ def FalkorReadVMA : SchedReadAdvance<3, [FalkorWr_VMUL32_1VXVY_4cyc, FalkorWr def FalkorReadFMA32 : SchedReadAdvance<1, [FalkorWr_FMUL32_1VXVY_5cyc, FalkorWr_FMUL32_2VXVY_5cyc]>; def FalkorReadFMA64 : SchedReadAdvance<2, [FalkorWr_FMUL64_1VXVY_6cyc, FalkorWr_FMUL64_2VXVY_6cyc]>; +def FalkorReadIncLd : SchedReadAdvance<2, [FalkorWr_LdStInc_none_3cyc]>; +def FalkorReadIncSt : SchedReadAdvance<1, [FalkorWr_LdStInc_none_3cyc]>; + // SchedPredicates and WriteVariants for Immediate Zero and LSLFast/ASRFast // ----------------------------------------------------------------------------- -def FalkorImmZPred : SchedPredicate<[{MI->getOperand(1).getImm() == 0}]>; +def FalkorImmZPred : SchedPredicate<[{MI->getOperand(1).isImm() && + MI->getOperand(1).getImm() == 0}]>; def FalkorOp1ZrReg : SchedPredicate<[{MI->getOperand(1).getReg() == AArch64::WZR || MI->getOperand(1).getReg() == AArch64::XZR}]>; @@ -770,84 +775,113 @@ def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], // SIMD Load Instructions // ----------------------------------------------------------------------------- -def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))_POST$")>; -def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc], (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; -def : InstRW<[FalkorWr_1LD_3cyc], (instrs LD2i64)>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc], (instrs LD2i64_POST)>; - -def : InstRW<[FalkorWr_1LD_1VXVY_4cyc], (instregex "^LD1i(8|16|32)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_1VXVY_4cyc], (instregex "^LD1i(8|16|32)_POST$")>; - -def : InstRW<[FalkorWr_1LD_1none_3cyc], (instregex "^LD1Twov(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_1none_3cyc], (instregex "^LD1Twov(8b|4h|2s|1d)_POST$")>; -def : InstRW<[FalkorWr_1LD_1none_3cyc], (instregex "^LD2Twov(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_1none_3cyc], (instregex "^LD2Twov(8b|4h|2s|1d)_POST$")>; -def : InstRW<[FalkorWr_1LD_1none_3cyc], (instregex "^LD2Rv(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_1none_3cyc], (instregex "^LD2Rv(8b|4h|2s|1d)_POST$")>; - -def : InstRW<[FalkorWr_2LD_3cyc], (instregex "^LD1Twov(16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc], (instregex "^LD1Twov(16b|8h|4s|2d)_POST$")>; -def : InstRW<[FalkorWr_2LD_3cyc], (instregex "^LD2Twov(16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc], (instregex "^LD2Twov(16b|8h|4s|2d)_POST$")>; -def : InstRW<[FalkorWr_2LD_3cyc], (instregex "^LD2Rv(16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc], (instregex "^LD2Rv(16b|8h|4s|2d)_POST$")>; -def : InstRW<[FalkorWr_2LD_3cyc], (instrs LD3i64)>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc], (instrs LD3i64_POST)>; -def : InstRW<[FalkorWr_2LD_3cyc], (instrs LD4i64)>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc], (instrs LD4i64_POST)>; - -def : InstRW<[FalkorWr_1LD_2VXVY_4cyc], (instregex "^LD2i(8|16|32)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_2VXVY_4cyc], (instregex 
"^LD2i(8|16|32)_POST$")>; - -def : InstRW<[FalkorWr_2LD_1none_3cyc], (instregex "^LD1Threev(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_1none_3cyc], (instregex "^LD1Threev(8b|4h|2s|1d)_POST$")>; -def : InstRW<[FalkorWr_2LD_1none_3cyc], (instregex "^LD3Rv(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_1none_3cyc], (instregex "^LD3Rv(8b|4h|2s|1d)_POST$")>; - -def : InstRW<[FalkorWr_3LD_3cyc], (instregex "^LD1Threev(16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_3LD_3cyc], (instregex "^LD1Threev(16b|8h|4s|2d)_POST$")>; -def : InstRW<[FalkorWr_3LD_3cyc], (instrs LD3Threev2d)>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_3LD_3cyc], (instrs LD3Threev2d_POST)>; -def : InstRW<[FalkorWr_3LD_3cyc], (instregex "^LD3Rv(16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_3LD_3cyc], (instregex "^LD3Rv(16b|8h|4s|2d)_POST$")>; - -def : InstRW<[FalkorWr_1LD_3VXVY_4cyc], (instregex "^LD3i(8|16|32)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3VXVY_4cyc], (instregex "^LD3i(8|16|32)_POST$")>; - -def : InstRW<[FalkorWr_2LD_2none_3cyc], (instregex "^LD1Fourv(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2none_3cyc], (instregex "^LD1Fourv(8b|4h|2s|1d)_POST$")>; -def : InstRW<[FalkorWr_2LD_2none_3cyc], (instregex "^LD4Rv(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2none_3cyc], (instregex "^LD4Rv(8b|4h|2s|1d)_POST$")>; - -def : InstRW<[FalkorWr_4LD_3cyc], (instregex "^LD1Fourv(16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_4LD_3cyc], (instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>; -def : InstRW<[FalkorWr_4LD_3cyc], (instrs LD4Fourv2d)>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_4LD_3cyc], (instrs LD4Fourv2d_POST)>; -def : InstRW<[FalkorWr_4LD_3cyc], (instregex "^LD4Rv(16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_4LD_3cyc], (instregex "^LD4Rv(16b|8h|4s|2d)_POST$")>; - -def : InstRW<[FalkorWr_1LD_4VXVY_4cyc], (instregex "^LD4i(8|16|32)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_4VXVY_4cyc], (instregex "^LD4i(8|16|32)_POST$")>; - -def : InstRW<[FalkorWr_2LD_2VXVY_1none_4cyc], (instregex "^LD3Threev(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2VXVY_1none_4cyc], - (instregex "^LD3Threev(8b|4h|2s|1d)_POST$")>; - -def : InstRW<[FalkorWr_2LD_2VXVY_2none_4cyc], (instregex "^LD4Fourv(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2VXVY_2none_4cyc], - (instregex "^LD4Fourv(8b|4h|2s|1d)_POST$")>; - -def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc], (instregex "^LD3Threev(16b|8h|4s)$")>; - -def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc], (instregex "^LD4Fourv(16b|8h|4s)$")>; - -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc], - (instregex "^LD3Threev(16b|8h|4s)_POST$")>; - -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2VXVY_2LD_1XYZ_2VXVY_4cyc], - (instregex "^LD4Fourv(16b|8h|4s)_POST$")>; +def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd], + (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))_POST$")>; +def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd], + (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], (instrs LD2i64)>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, 
FalkorWr_1LD_3cyc, FalkorReadIncLd], + (instrs LD2i64_POST)>; + +def : InstRW<[FalkorWr_1LD_1VXVY_4cyc, FalkorReadIncLd], (instregex "^LD1i(8|16|32)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1VXVY_4cyc, FalkorReadIncLd], + (instregex "^LD1i(8|16|32)_POST$")>; + +def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD1Twov(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1none_3cyc, FalkorReadIncLd], + (instregex "^LD1Twov(8b|4h|2s|1d)_POST$")>; +def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD2Twov(8b|4h|2s)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1none_3cyc, FalkorReadIncLd], + (instregex "^LD2Twov(8b|4h|2s)_POST$")>; +def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD2Rv(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1none_3cyc, FalkorReadIncLd], + (instregex "^LD2Rv(8b|4h|2s|1d)_POST$")>; + +def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instregex "^LD1Twov(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd], + (instregex "^LD1Twov(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instregex "^LD2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd], + (instregex "^LD2Twov(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instregex "^LD2Rv(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd], + (instregex "^LD2Rv(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instrs LD3i64)>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd], + (instrs LD3i64_POST)>; +def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instrs LD4i64)>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd], + (instrs LD4i64_POST)>; + +def : InstRW<[FalkorWr_1LD_2VXVY_4cyc, FalkorReadIncLd], (instregex "^LD2i(8|16|32)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_2VXVY_4cyc, FalkorReadIncLd], + (instregex "^LD2i(8|16|32)_POST$")>; + +def : InstRW<[FalkorWr_2LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD1Threev(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_1none_3cyc, FalkorReadIncLd], + (instregex "^LD1Threev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[FalkorWr_2LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD3Rv(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_1none_3cyc, FalkorReadIncLd], + (instregex "^LD3Rv(8b|4h|2s|1d)_POST$")>; + +def : InstRW<[FalkorWr_3LD_3cyc, FalkorReadIncLd], (instregex "^LD1Threev(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_3LD_3cyc, FalkorReadIncLd], + (instregex "^LD1Threev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_3LD_3cyc, FalkorReadIncLd], (instrs LD3Threev2d)>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_3LD_3cyc, FalkorReadIncLd], + (instrs LD3Threev2d_POST)>; +def : InstRW<[FalkorWr_3LD_3cyc, FalkorReadIncLd], (instregex "^LD3Rv(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_3LD_3cyc, FalkorReadIncLd], + (instregex "^LD3Rv(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[FalkorWr_1LD_3VXVY_4cyc, FalkorReadIncLd], (instregex "^LD3i(8|16|32)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3VXVY_4cyc, FalkorReadIncLd], + (instregex "^LD3i(8|16|32)_POST$")>; + +def : InstRW<[FalkorWr_2LD_2none_3cyc, FalkorReadIncLd], (instregex 
"^LD1Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2none_3cyc, FalkorReadIncLd], + (instregex "^LD1Fourv(8b|4h|2s|1d)_POST$")>; +def : InstRW<[FalkorWr_2LD_2none_3cyc, FalkorReadIncLd], (instregex "^LD4Rv(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2none_3cyc, FalkorReadIncLd], + (instregex "^LD4Rv(8b|4h|2s|1d)_POST$")>; + +def : InstRW<[FalkorWr_4LD_3cyc, FalkorReadIncLd], (instregex "^LD1Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_4LD_3cyc, FalkorReadIncLd], + (instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_4LD_3cyc, FalkorReadIncLd], (instrs LD4Fourv2d)>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_4LD_3cyc, FalkorReadIncLd], + (instrs LD4Fourv2d_POST)>; +def : InstRW<[FalkorWr_4LD_3cyc, FalkorReadIncLd], (instregex "^LD4Rv(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_4LD_3cyc, FalkorReadIncLd], + (instregex "^LD4Rv(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[FalkorWr_1LD_4VXVY_4cyc, FalkorReadIncLd], (instregex "^LD4i(8|16|32)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_4VXVY_4cyc, FalkorReadIncLd], + (instregex "^LD4i(8|16|32)_POST$")>; + +def : InstRW<[FalkorWr_2LD_2VXVY_1none_4cyc, FalkorReadIncLd], + (instregex "^LD3Threev(8b|4h|2s)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2VXVY_1none_4cyc, FalkorReadIncLd], + (instregex "^LD3Threev(8b|4h|2s)_POST$")>; + +def : InstRW<[FalkorWr_2LD_2VXVY_2none_4cyc, FalkorReadIncLd], + (instregex "^LD4Fourv(8b|4h|2s)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2VXVY_2none_4cyc, FalkorReadIncLd], + (instregex "^LD4Fourv(8b|4h|2s)_POST$")>; + +def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc, FalkorReadIncLd], + (instregex "^LD3Threev(16b|8h|4s)$")>; + +def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc, FalkorReadIncLd], + (instregex "^LD4Fourv(16b|8h|4s)$")>; + +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc, FalkorReadIncLd], + (instregex "^LD3Threev(16b|8h|4s)_POST$")>; + +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2VXVY_2LD_1XYZ_2VXVY_4cyc, FalkorReadIncLd], + (instregex "^LD4Fourv(16b|8h|4s)_POST$")>; // Arithmetic and Logical Instructions // ----------------------------------------------------------------------------- @@ -929,87 +963,105 @@ def : InstRW<[FalkorWr_5VXVY_7cyc], (instregex "^TBX(v8i8Four|v16i8Four)$")>; // SIMD Store Instructions // ----------------------------------------------------------------------------- -def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instregex "^STR(Q|D|S|H|B)ui$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1VSD_1ST_0cyc], +def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^STR(Q|D|S|H|B)ui$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt], (instregex "^STR(Q|D|S|H|B)(post|pre)$")>; -def : InstRW<[FalkorWr_STRVro], (instregex "^STR(D|S|H|B)ro(W|X)$")>; -def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instregex "^STPQi$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2VSD_2ST_0cyc], +def : InstRW<[FalkorWr_STRVro, ReadDefault, FalkorReadIncSt], + (instregex "^STR(D|S|H|B)ro(W|X)$")>; +def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], + (instregex "^STPQi$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2VSD_2ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], (instregex "^STPQ(post|pre)$")>; -def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instregex 
"^STP(D|S)(i)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1VSD_1ST_0cyc], +def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], + (instregex "^STP(D|S)(i)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], (instregex "^STP(D|S)(post|pre)$")>; -def : InstRW<[FalkorWr_STRQro], (instregex "^STRQro(W|X)$")>; -def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instregex "^STUR(Q|D|S|B|H)i$")>; -def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instrs STNPDi, STNPSi)>; -def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instrs STNPQi)>; - -def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instregex "^ST1(One(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64)|One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1VSD_1ST_0cyc], +def : InstRW<[FalkorWr_STRQro, ReadDefault, FalkorReadIncSt], + (instregex "^STRQro(W|X)$")>; +def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^STUR(Q|D|S|B|H)i$")>; +def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], + (instrs STNPDi, STNPSi)>; +def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], + (instrs STNPQi)>; + +def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST1(One(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64)|One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt], (instregex "^ST1(One(v8b|v4h|v2s|v1d)_POST|(i8|i16|i32|i64)_POST)$")>; -def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instregex "^ST2(Two(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64))$")>; -def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VSD_1ST_0cyc], +def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST2(Two(v8b|v4h|v2s)|(i8|i16|i32|i64))$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt], (instregex "^ST1(One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))_POST$")>; -def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VSD_1ST_0cyc], - (instregex "^ST2(Two(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64))_POST$")>; - -def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instregex "^ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))$")>; -def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instregex "^ST2Two(v16b|v8h|v4s|v2d)$")>; -def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instregex "^ST3(i8|i16|i32|i64)$")>; -def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instregex "^ST4(i8|i16|i32|i64)$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST2(Two(v8b|v4h|v2s)|(i8|i16|i32|i64))_POST$")>; + +def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))$")>; +def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST2Two(v16b|v8h|v4s|v2d)$")>; +def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST3(i8|i16|i32|i64)$")>; +def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST4(i8|i16|i32|i64)$")>; // FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). -def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc], +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt], (instregex "^ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))_POST$")>; // FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). 
-def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc], +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt], (instregex "^ST2Two(v16b|v8h|v4s|v2d)_POST$")>; // FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). -def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc], +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt], (instregex "^ST3(i8|i16|i32|i64)_POST$")>; // FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). -def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc], +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt], (instregex "^ST4(i8|i16|i32|i64)_POST$")>; -def : InstRW<[FalkorWr_1VXVY_2ST_2VSD_0cyc], - (instregex "^ST3Three(v8b|v4h|v2s|v1d)$")>; +def : InstRW<[FalkorWr_1VXVY_2ST_2VSD_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST3Three(v8b|v4h|v2s)$")>; // FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). -def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VXVY_2ST_2VSD_0cyc], - (instregex "^ST3Three(v8b|v4h|v2s|v1d)_POST$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VXVY_2ST_2VSD_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST3Three(v8b|v4h|v2s)_POST$")>; -def : InstRW<[FalkorWr_3VSD_3ST_0cyc], (instregex "^ST1Three(v16b|v8h|v4s|v2d)$")>; -def : InstRW<[FalkorWr_3VSD_3ST_0cyc], (instrs ST3Threev2d)>; +def : InstRW<[FalkorWr_3VSD_3ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST1Three(v16b|v8h|v4s|v2d)$")>; +def : InstRW<[FalkorWr_3VSD_3ST_0cyc, ReadDefault, FalkorReadIncSt], + (instrs ST3Threev2d)>; // FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). -def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_3VSD_3ST_0cyc], +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_3VSD_3ST_0cyc, ReadDefault, FalkorReadIncSt], (instregex "^ST1Three(v16b|v8h|v4s|v2d)_POST$")>; // FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). -def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_3VSD_3ST_0cyc], +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_3VSD_3ST_0cyc, ReadDefault, FalkorReadIncSt], (instrs ST3Threev2d_POST)>; -def : InstRW<[FalkorWr_2VXVY_2ST_2VSD_0cyc], - (instregex "^ST4Four(v8b|v4h|v2s|v1d)$")>; +def : InstRW<[FalkorWr_2VXVY_2ST_2VSD_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST4Four(v8b|v4h|v2s)$")>; // FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). -def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VXVY_2ST_2VSD_0cyc], - (instregex "^ST4Four(v8b|v4h|v2s|v1d)_POST$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VXVY_2ST_2VSD_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST4Four(v8b|v4h|v2s)_POST$")>; -def : InstRW<[FalkorWr_4VSD_4ST_0cyc], (instregex "^ST1Four(v16b|v8h|v4s|v2d)$")>; -def : InstRW<[FalkorWr_4VSD_4ST_0cyc], (instrs ST4Fourv2d)>; +def : InstRW<[FalkorWr_4VSD_4ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST1Four(v16b|v8h|v4s|v2d)$")>; +def : InstRW<[FalkorWr_4VSD_4ST_0cyc, ReadDefault, FalkorReadIncSt], + (instrs ST4Fourv2d)>; // FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). -def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VSD_4ST_0cyc], +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VSD_4ST_0cyc, ReadDefault, FalkorReadIncSt], (instregex "^ST1Four(v16b|v8h|v4s|v2d)_POST$")>; // FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). 
-def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VSD_4ST_0cyc], +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VSD_4ST_0cyc, ReadDefault, FalkorReadIncSt], (instrs ST4Fourv2d_POST)>; -def : InstRW<[FalkorWr_2VXVY_4ST_4VSD_0cyc], +def : InstRW<[FalkorWr_2VXVY_4ST_4VSD_0cyc, ReadDefault, FalkorReadIncSt], (instregex "^ST3Three(v16b|v8h|v4s)$")>; // FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). -def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VXVY_4ST_4VSD_0cyc], +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VXVY_4ST_4VSD_0cyc, ReadDefault, FalkorReadIncSt], (instregex "^ST3Three(v16b|v8h|v4s)_POST$")>; -def : InstRW<[FalkorWr_4VXVY_4ST_4VSD_0cyc], +def : InstRW<[FalkorWr_4VXVY_4ST_4VSD_0cyc, ReadDefault, FalkorReadIncSt], (instregex "^ST4Four(v16b|v8h|v4s)$")>; // FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). -def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VXVY_4ST_4VSD_0cyc], +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VXVY_4ST_4VSD_0cyc, ReadDefault, FalkorReadIncSt], (instregex "^ST4Four(v16b|v8h|v4s)_POST$")>; // Branch Instructions @@ -1033,22 +1085,25 @@ def : InstRW<[FalkorWr_4VXVY_3cyc], (instrs SHA256SU1rrr)>; // FP Load Instructions // ----------------------------------------------------------------------------- -def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDR((Q|D|S|H|B)ui|(Q|D|S)l)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc], +def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], + (instregex "^LDR((Q|D|S|H|B)ui|(Q|D|S)l)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd], (instregex "^LDR(Q|D|S|H|B)(post|pre)$")>; -def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDUR(Q|D|S|H|B)i$")>; -def : InstRW<[FalkorWr_LDRro], (instregex "^LDR(Q|D|H|S|B)ro(W|X)$")>; -def : InstRW<[FalkorWr_2LD_3cyc, FalkorWr_none_3cyc], +def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], + (instregex "^LDUR(Q|D|S|H|B)i$")>; +def : InstRW<[FalkorWr_LDRro, FalkorReadIncLd], + (instregex "^LDR(Q|D|H|S|B)ro(W|X)$")>; +def : InstRW<[FalkorWr_2LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd], (instrs LDNPQi)>; -def : InstRW<[FalkorWr_2LD_3cyc, FalkorWr_none_3cyc], +def : InstRW<[FalkorWr_2LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd], (instrs LDPQi)>; -def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc], +def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd], (instregex "LDNP(D|S)i$")>; -def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc], +def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd], (instregex "LDP(D|S)i$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc], +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd], (instregex "LDP(D|S)(pre|post)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc, FalkorWr_none_3cyc], +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd], (instregex "^LDPQ(pre|post)$")>; // FP Data Processing Instructions @@ -1106,31 +1161,41 @@ def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(S|U)CVTF(v2i64|v4i32|v2f64|v // ----------------------------------------------------------------------------- def : InstRW<[FalkorWr_1ST_0cyc], (instrs PRFMui, PRFMl)>; def : InstRW<[FalkorWr_1ST_0cyc], (instrs PRFUMi)>; -def : InstRW<[FalkorWr_1LD_3cyc, FalkorWr_none_3cyc], +def : InstRW<[FalkorWr_1LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd], (instregex "^LDNP(W|X)i$")>; -def : 
InstRW<[FalkorWr_1LD_3cyc, FalkorWr_none_3cyc], +def : InstRW<[FalkorWr_1LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd], (instregex "^LDP(W|X)i$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc, FalkorWr_none_3cyc], +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd], (instregex "^LDP(W|X)(post|pre)$")>; -def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDR(BB|HH|W|X)ui$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc], +def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], + (instregex "^LDR(BB|HH|W|X)ui$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd], (instregex "^LDR(BB|HH|W|X)(post|pre)$")>; -def : InstRW<[FalkorWr_LDRro], (instregex "^LDR(BB|HH|W|X)ro(W|X)$")>; -def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDR(W|X)l$")>; -def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDTR(B|H|W|X)i$")>; -def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDUR(BB|HH|W|X)i$")>; +def : InstRW<[FalkorWr_LDRro, FalkorReadIncLd], + (instregex "^LDR(BB|HH|W|X)ro(W|X)$")>; +def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], + (instregex "^LDR(W|X)l$")>; +def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], + (instregex "^LDTR(B|H|W|X)i$")>; +def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], + (instregex "^LDUR(BB|HH|W|X)i$")>; def : InstRW<[FalkorWr_PRFMro], (instregex "^PRFMro(W|X)$")>; -def : InstRW<[FalkorWr_1LD_4cyc, FalkorWr_none_4cyc], +def : InstRW<[FalkorWr_1LD_4cyc, FalkorWr_none_4cyc, FalkorReadIncLd], (instrs LDPSWi)>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_4cyc, FalkorWr_none_4cyc], +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_4cyc, FalkorWr_none_4cyc, FalkorReadIncLd], (instregex "^LDPSW(post|pre)$")>; -def : InstRW<[FalkorWr_1LD_4cyc], (instregex "^LDRS(BW|BX|HW|HX|W)ui$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_4cyc], +def : InstRW<[FalkorWr_1LD_4cyc, FalkorReadIncLd], + (instregex "^LDRS(BW|BX|HW|HX|W)ui$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_4cyc, FalkorReadIncLd], (instregex "^LDRS(BW|BX|HW|HX|W)(post|pre)$")>; -def : InstRW<[FalkorWr_LDRSro], (instregex "^LDRS(BW|BX|HW|HX|W)ro(W|X)$")>; -def : InstRW<[FalkorWr_1LD_4cyc], (instrs LDRSWl)>; -def : InstRW<[FalkorWr_1LD_4cyc], (instregex "^LDTRS(BW|BX|HW|HX|W)i$")>; -def : InstRW<[FalkorWr_1LD_4cyc], (instregex "^LDURS(BW|BX|HW|HX|W)i$")>; +def : InstRW<[FalkorWr_LDRSro, FalkorReadIncLd], + (instregex "^LDRS(BW|BX|HW|HX|W)ro(W|X)$")>; +def : InstRW<[FalkorWr_1LD_4cyc, FalkorReadIncLd], + (instrs LDRSWl)>; +def : InstRW<[FalkorWr_1LD_4cyc, FalkorReadIncLd], + (instregex "^LDTRS(BW|BX|HW|HX|W)i$")>; +def : InstRW<[FalkorWr_1LD_4cyc, FalkorReadIncLd], + (instregex "^LDURS(BW|BX|HW|HX|W)i$")>; // Miscellaneous Data-Processing Instructions // ----------------------------------------------------------------------------- @@ -1178,32 +1243,46 @@ def : InstRW<[FalkorWr_1none_0cyc], (instrs BRK, DCPS1, DCPS2, DCPS3, HINT, HL def : InstRW<[FalkorWr_1ST_0cyc], (instrs SYSxt, SYSLxt)>; def : InstRW<[FalkorWr_1Z_0cyc], (instrs MSRpstateImm1, MSRpstateImm4)>; -def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^(LDAR(B|H|W|X)|LDAXP(W|X)|LDAXR(B|H|W|X)|LDXP(W|X)|LDXR(B|H|W|X))$")>; +def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], + (instregex "^(LDAR(B|H|W|X)|LDAXR(B|H|W|X)|LDXR(B|H|W|X))$")>; +def : InstRW<[FalkorWr_1LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd], + (instregex "^(LDAXP(W|X)|LDXP(W|X))$")>; def : InstRW<[FalkorWr_1LD_3cyc], (instrs MRS, MOVbaseTLS)>; def : InstRW<[FalkorWr_1LD_1Z_3cyc], (instrs DRPS)>; 
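The FalkorReadIncLd and FalkorReadIncSt operands threaded through the load/store entries above and below pair a SchedReadAdvance with the 3-cycle FalkorWr_LdStInc_none_3cyc write that models the base-register post-increment: a consumer reading the updated base through FalkorReadIncLd (advance 2) sees an effective latency of 3 - 2 = 1 cycle, and through FalkorReadIncSt (advance 1) sees 2 cycles. A minimal C++ sketch of that arithmetic, assuming a hypothetical helper name and a clamp to zero rather than LLVM's exact internals:

    #include <algorithm>

    // Latency observed by a consumer when a write with SchedWriteRes latency
    // WriteLatency is read through a SchedReadAdvance of AdvanceCycles.
    static int effectiveOperandLatency(int WriteLatency, int AdvanceCycles) {
      // The read-advance lets the consumer issue early; clamp so the model
      // never reports a negative latency.
      return std::max(WriteLatency - AdvanceCycles, 0);
    }

    // FalkorWr_LdStInc_none_3cyc (Latency = 3) read via:
    //   FalkorReadIncLd (advance 2) -> effectiveOperandLatency(3, 2) == 1
    //   FalkorReadIncSt (advance 1) -> effectiveOperandLatency(3, 1) == 2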
def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instrs MSR)>; -def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instrs STNPWi, STNPXi)>; +def : InstRW<[FalkorWr_1SD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], + (instrs STNPWi, STNPXi)>; def : InstRW<[FalkorWr_2LD_1Z_3cyc], (instrs ERET)>; def : InstRW<[FalkorWr_1ST_1SD_1LD_3cyc], (instregex "^LDC.*$")>; -def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc], (instregex "^STLR(B|H|W|X)$")>; -def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc], (instregex "^STXP(W|X)$")>; -def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc], (instregex "^STXR(B|H|W|X)$")>; - -def : InstRW<[FalkorWr_2LD_1ST_1SD_3cyc], (instregex "^STLXP(W|X)$")>; -def : InstRW<[FalkorWr_2LD_1ST_1SD_3cyc], (instregex "^STLXR(B|H|W|X)$")>; +def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^STLR(B|H|W|X)$")>; +def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], + (instregex "^STXP(W|X)$")>; +def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], + (instregex "^STXR(B|H|W|X)$")>; + +def : InstRW<[FalkorWr_2LD_1ST_1SD_3cyc, ReadDefault, ReadDefault, ReadDefault, FalkorReadIncSt], + (instregex "^STLXP(W|X)$")>; +def : InstRW<[FalkorWr_2LD_1ST_1SD_3cyc, ReadDefault, ReadDefault, FalkorReadIncSt], + (instregex "^STLXR(B|H|W|X)$")>; // Store Instructions // ----------------------------------------------------------------------------- -def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instregex "^STP(W|X)i$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1SD_1ST_0cyc], +def : InstRW<[FalkorWr_1SD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], + (instregex "^STP(W|X)i$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1SD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], (instregex "^STP(W|X)(post|pre)$")>; -def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instregex "^STR(BB|HH|W|X)ui$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1SD_1ST_0cyc], +def : InstRW<[FalkorWr_1SD_1ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^STR(BB|HH|W|X)ui$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1SD_1ST_0cyc, ReadDefault, FalkorReadIncSt], (instregex "^STR(BB|HH|W|X)(post|pre)$")>; -def : InstRW<[FalkorWr_STRro], (instregex "^STR(BB|HH|W|X)ro(W|X)$")>; -def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instregex "^STTR(B|H|W|X)i$")>; -def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instregex "^STUR(BB|HH|W|X)i$")>; +def : InstRW<[FalkorWr_STRro, ReadDefault, FalkorReadIncSt], + (instregex "^STR(BB|HH|W|X)ro(W|X)$")>; +def : InstRW<[FalkorWr_1SD_1ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^STTR(B|H|W|X)i$")>; +def : InstRW<[FalkorWr_1SD_1ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^STUR(BB|HH|W|X)i$")>; diff --git a/lib/Target/AArch64/AArch64SchedKryoDetails.td b/lib/Target/AArch64/AArch64SchedKryoDetails.td index 02cccccd3078c..cf4cdabb8cbfc 100644 --- a/lib/Target/AArch64/AArch64SchedKryoDetails.td +++ b/lib/Target/AArch64/AArch64SchedKryoDetails.td @@ -1374,7 +1374,9 @@ def KryoWrite_3cyc_LS_LS_400ln : let Latency = 3; let NumMicroOps = 2; } def : InstRW<[KryoWrite_3cyc_LS_LS_400ln], - (instregex "(LDAX?R(B|H|W|X)|LDAXP(W|X))")>; + (instregex "LDAX?R(B|H|W|X)")>; +def : InstRW<[KryoWrite_3cyc_LS_LS_400ln, WriteLDHi], + (instregex "LDAXP(W|X)")>; def KryoWrite_3cyc_LS_LS_401ln : SchedWriteRes<[KryoUnitLS, KryoUnitLS]> { let Latency = 3; let NumMicroOps = 2; @@ -1565,7 +1567,7 @@ def KryoWrite_3cyc_LS_258ln : SchedWriteRes<[KryoUnitLS]> { let Latency = 3; let NumMicroOps = 1; } -def : InstRW<[KryoWrite_3cyc_LS_258ln], +def : 
InstRW<[KryoWrite_3cyc_LS_258ln, WriteLDHi], (instregex "LDXP(W|X)")>; def KryoWrite_3cyc_LS_258_1ln : SchedWriteRes<[KryoUnitLS]> { diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp index d4a8cecdb29f1..6660f0babb8a6 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -47,6 +47,11 @@ static cl::opt<bool> EnableCCMP("aarch64-enable-ccmp", cl::desc("Enable the CCMP formation pass"), cl::init(true), cl::Hidden); +static cl::opt<bool> + EnableCondBrTuning("aarch64-enable-cond-br-tune", + cl::desc("Enable the conditional branch tuning pass"), + cl::init(true), cl::Hidden); + static cl::opt<bool> EnableMCR("aarch64-enable-mcr", cl::desc("Enable the machine combiner pass"), cl::init(true), cl::Hidden); @@ -429,6 +434,8 @@ bool AArch64PassConfig::addILPOpts() { addPass(createAArch64ConditionalCompares()); if (EnableMCR) addPass(&MachineCombinerID); + if (EnableCondBrTuning) + addPass(createAArch64CondBrTuning()); if (EnableEarlyIfConversion) addPass(&EarlyIfConverterID); if (EnableStPairSuppress) diff --git a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt index f0f50f29be0f3..02b12b5e90ca2 100644 --- a/lib/Target/AArch64/CMakeLists.txt +++ b/lib/Target/AArch64/CMakeLists.txt @@ -43,6 +43,7 @@ add_llvm_target(AArch64CodeGen AArch64AsmPrinter.cpp AArch64CleanupLocalDynamicTLSPass.cpp AArch64CollectLOH.cpp + AArch64CondBrTuning.cpp AArch64ConditionalCompares.cpp AArch64DeadRegisterDefinitionsPass.cpp AArch64ExpandPseudoInsts.cpp diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index 43a6fa9ce0896..3d075018904c0 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -43,26 +43,25 @@ public: const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override { const static MCFixupKindInfo Infos[AArch64::NumTargetFixupKinds] = { - // This table *must* be in the order that the fixup_* kinds are defined in - // AArch64FixupKinds.h. - // - // Name Offset (bits) Size (bits) Flags - { "fixup_aarch64_pcrel_adr_imm21", 0, 32, PCRelFlagVal }, - { "fixup_aarch64_pcrel_adrp_imm21", 0, 32, PCRelFlagVal }, - { "fixup_aarch64_add_imm12", 10, 12, 0 }, - { "fixup_aarch64_ldst_imm12_scale1", 10, 12, 0 }, - { "fixup_aarch64_ldst_imm12_scale2", 10, 12, 0 }, - { "fixup_aarch64_ldst_imm12_scale4", 10, 12, 0 }, - { "fixup_aarch64_ldst_imm12_scale8", 10, 12, 0 }, - { "fixup_aarch64_ldst_imm12_scale16", 10, 12, 0 }, - { "fixup_aarch64_ldr_pcrel_imm19", 5, 19, PCRelFlagVal }, - { "fixup_aarch64_movw", 5, 16, 0 }, - { "fixup_aarch64_pcrel_branch14", 5, 14, PCRelFlagVal }, - { "fixup_aarch64_pcrel_branch19", 5, 19, PCRelFlagVal }, - { "fixup_aarch64_pcrel_branch26", 0, 26, PCRelFlagVal }, - { "fixup_aarch64_pcrel_call26", 0, 26, PCRelFlagVal }, - { "fixup_aarch64_tlsdesc_call", 0, 0, 0 } - }; + // This table *must* be in the order that the fixup_* kinds are defined + // in AArch64FixupKinds.h. 
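 // As a rough walkthrough of how the three numeric columns are consumed
 // (inferred from applyFixup() below, so read it as a sketch rather than a
 // spec): "Offset" is the shift applied to the resolved value via
 // Value <<= Info.TargetOffset, "Size" is the width in bits of the patched
 // field, and PCRelFlagVal flags the PC-relative kinds. For instance,
 // fixup_aarch64_add_imm12 (Offset 10, Size 12) lands the 12-bit immediate
 // in bits [21:10] of the 32-bit instruction word.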
+ // + // Name Offset (bits) Size (bits) Flags + {"fixup_aarch64_pcrel_adr_imm21", 0, 32, PCRelFlagVal}, + {"fixup_aarch64_pcrel_adrp_imm21", 0, 32, PCRelFlagVal}, + {"fixup_aarch64_add_imm12", 10, 12, 0}, + {"fixup_aarch64_ldst_imm12_scale1", 10, 12, 0}, + {"fixup_aarch64_ldst_imm12_scale2", 10, 12, 0}, + {"fixup_aarch64_ldst_imm12_scale4", 10, 12, 0}, + {"fixup_aarch64_ldst_imm12_scale8", 10, 12, 0}, + {"fixup_aarch64_ldst_imm12_scale16", 10, 12, 0}, + {"fixup_aarch64_ldr_pcrel_imm19", 5, 19, PCRelFlagVal}, + {"fixup_aarch64_movw", 5, 16, 0}, + {"fixup_aarch64_pcrel_branch14", 5, 14, PCRelFlagVal}, + {"fixup_aarch64_pcrel_branch19", 5, 19, PCRelFlagVal}, + {"fixup_aarch64_pcrel_branch26", 0, 26, PCRelFlagVal}, + {"fixup_aarch64_pcrel_call26", 0, 26, PCRelFlagVal}, + {"fixup_aarch64_tlsdesc_call", 0, 0, 0}}; if (Kind < FirstTargetFixupKind) return MCAsmBackend::getFixupKindInfo(Kind); @@ -72,8 +71,9 @@ public: return Infos[Kind - FirstTargetFixupKind]; } - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel, MCContext &Ctx) const override; + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const override; bool mayNeedRelaxation(const MCInst &Inst) const override; bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, @@ -261,13 +261,15 @@ unsigned AArch64AsmBackend::getFixupKindContainereSizeInBytes(unsigned Kind) con } } -void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, - bool IsPCRel, MCContext &Ctx) const { +void AArch64AsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, + MutableArrayRef<char> Data, uint64_t Value, + bool IsPCRel) const { unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); if (!Value) return; // Doesn't change encoding. MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind()); + MCContext &Ctx = Asm.getContext(); // Apply any target-specific value adjustments. Value = adjustFixupValue(Fixup, Value, Ctx); @@ -275,7 +277,7 @@ void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data, Value <<= Info.TargetOffset; unsigned Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); + assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); // Used to point to big endian bytes. 
unsigned FulleSizeInBytes = getFixupKindContainereSizeInBytes(Fixup.getKind()); @@ -289,7 +291,7 @@ void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data, } } else { // Handle as big-endian - assert((Offset + FulleSizeInBytes) <= DataSize && "Invalid fixup size!"); + assert((Offset + FulleSizeInBytes) <= Data.size() && "Invalid fixup size!"); assert(NumBytes <= FulleSizeInBytes && "Invalid fixup size!"); for (unsigned i = 0; i != NumBytes; ++i) { unsigned Idx = FulleSizeInBytes - 1 - i; @@ -539,16 +541,14 @@ public: return createAArch64ELFObjectWriter(OS, OSABI, IsLittleEndian, IsILP32); } - void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout, - const MCFixup &Fixup, const MCFragment *DF, - const MCValue &Target, uint64_t &Value, - bool &IsResolved) override; + void processFixupValue(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, bool &IsResolved) override; }; -void ELFAArch64AsmBackend::processFixupValue( - const MCAssembler &Asm, const MCAsmLayout &Layout, const MCFixup &Fixup, - const MCFragment *DF, const MCValue &Target, uint64_t &Value, - bool &IsResolved) { +void ELFAArch64AsmBackend::processFixupValue(const MCAssembler &Asm, + const MCFixup &Fixup, + const MCValue &Target, + bool &IsResolved) { // The ADRP instruction adds some multiple of 0x1000 to the current PC & // ~0xfff. This means that the required offset to reach a symbol can vary by // up to one step depending on where the ADRP is in memory. For example: diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index 6ab2b9ef04598..7494e5decd6f6 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -238,6 +238,36 @@ def FeatureSDWA : SubtargetFeature<"sdwa", "Support SDWA (Sub-DWORD Addressing) extension" >; +def FeatureSDWAOmod : SubtargetFeature<"sdwa-omod", + "HasSDWAOmod", + "true", + "Support OMod with SDWA (Sub-DWORD Addressing) extension" +>; + +def FeatureSDWAScalar : SubtargetFeature<"sdwa-scalar", + "HasSDWAScalar", + "true", + "Support scalar register with SDWA (Sub-DWORD Addressing) extension" +>; + +def FeatureSDWASdst : SubtargetFeature<"sdwa-sdst", + "HasSDWASdst", + "true", + "Support scalar dst for VOPC with SDWA (Sub-DWORD Addressing) extension" +>; + +def FeatureSDWAMac : SubtargetFeature<"sdwa-mav", + "HasSDWAMac", + "true", + "Support v_mac_f32/f16 with SDWA (Sub-DWORD Addressing) extension" +>; + +def FeatureSDWAClampVOPC : SubtargetFeature<"sdwa-clamp-vopc", + "HasSDWAClampVOPC", + "true", + "Support clamp for VOPC with SDWA (Sub-DWORD Addressing) extension" +>; + def FeatureDPP : SubtargetFeature<"dpp", "HasDPP", "true", @@ -421,8 +451,8 @@ def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS", FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel, - FeatureScalarStores, FeatureInv2PiInlineImm, FeatureSDWA, - FeatureDPP + FeatureScalarStores, FeatureInv2PiInlineImm, + FeatureSDWA, FeatureSDWAClampVOPC, FeatureSDWAMac, FeatureDPP ] >; @@ -432,7 +462,8 @@ def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9", FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm, FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode, - FeatureFastFMAF32, FeatureSDWA, FeatureDPP, + FeatureFastFMAF32, FeatureDPP, + FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst, FeatureFlatInstOffsets, 
FeatureFlatGlobalInsts, FeatureFlatScratchInsts ] >; @@ -449,14 +480,14 @@ class SubtargetFeatureISAVersion <int Major, int Minor, int Stepping, def FeatureISAVersion6_0_0 : SubtargetFeatureISAVersion <6,0,0, [FeatureSouthernIslands, - FeatureFastFMAF32, + FeatureFastFMAF32, HalfRate64Ops, FeatureLDSBankCount32]>; def FeatureISAVersion6_0_1 : SubtargetFeatureISAVersion <6,0,1, [FeatureSouthernIslands, FeatureLDSBankCount32]>; - + def FeatureISAVersion7_0_0 : SubtargetFeatureISAVersion <7,0,0, [FeatureSeaIslands, FeatureLDSBankCount32]>; @@ -644,7 +675,11 @@ def isCIVI : Predicate < "Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">, AssemblerPredicate<"FeatureCIInsts">; -def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">; +def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">, + AssemblerPredicate<"FeatureFlatAddressSpace">; + +def HasFlatGlobalInsts : Predicate<"Subtarget->hasFlatGlobalInsts()">, + AssemblerPredicate<"FeatureFlatGlobalInsts">; def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">, AssemblerPredicate<"Feature16BitInsts">; diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 5586b513b5fca..96f819fd0e684 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -3527,18 +3527,25 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, //===----------------------------------------------------------------------===// SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, - const TargetRegisterClass *RC, - unsigned Reg, EVT VT) const { + const TargetRegisterClass *RC, + unsigned Reg, EVT VT, + const SDLoc &SL, + bool RawReg) const { MachineFunction &MF = DAG.getMachineFunction(); MachineRegisterInfo &MRI = MF.getRegInfo(); - unsigned VirtualRegister; + unsigned VReg; + if (!MRI.isLiveIn(Reg)) { - VirtualRegister = MRI.createVirtualRegister(RC); - MRI.addLiveIn(Reg, VirtualRegister); + VReg = MRI.createVirtualRegister(RC); + MRI.addLiveIn(Reg, VReg); } else { - VirtualRegister = MRI.getLiveInVirtReg(Reg); + VReg = MRI.getLiveInVirtReg(Reg); } - return DAG.getRegister(VirtualRegister, VT); + + if (RawReg) + return DAG.getRegister(VReg, VT); + + return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT); } uint32_t AMDGPUTargetLowering::getImplicitParameterOffset( @@ -3657,6 +3664,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(STORE_MSKOR) NODE_NAME_CASE(LOAD_CONSTANT) NODE_NAME_CASE(TBUFFER_STORE_FORMAT) + NODE_NAME_CASE(TBUFFER_STORE_FORMAT_X3) + NODE_NAME_CASE(TBUFFER_LOAD_FORMAT) NODE_NAME_CASE(ATOMIC_CMP_SWAP) NODE_NAME_CASE(ATOMIC_INC) NODE_NAME_CASE(ATOMIC_DEC) diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index 0d066cdbdff4d..a45234e2b39f2 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -216,10 +216,25 @@ public: /// \brief Helper function that adds Reg to the LiveIn list of the DAG's /// MachineFunction. /// - /// \returns a RegisterSDNode representing Reg. - virtual SDValue CreateLiveInRegister(SelectionDAG &DAG, - const TargetRegisterClass *RC, - unsigned Reg, EVT VT) const; + /// \returns a RegisterSDNode representing Reg if \p RawReg is true, otherwise + /// a copy from the register. 
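  /// A hypothetical call site, to make the \p RawReg distinction concrete
  /// (SGPR_32RegClass and Reg are illustrative placeholders here, not code
  /// from this patch):
  ///   // Copy chained off the entry node at SL:
  ///   SDValue Copy = CreateLiveInRegister(DAG, &AMDGPU::SGPR_32RegClass,
  ///                                       Reg, MVT::i32, SL);
  ///   // Raw RegisterSDNode for the live-in vreg; no copy is emitted:
  ///   SDValue Raw = CreateLiveInRegisterRaw(DAG, &AMDGPU::SGPR_32RegClass,
  ///                                         Reg, MVT::i32);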
+ SDValue CreateLiveInRegister(SelectionDAG &DAG, + const TargetRegisterClass *RC, + unsigned Reg, EVT VT, + const SDLoc &SL, + bool RawReg = false) const; + SDValue CreateLiveInRegister(SelectionDAG &DAG, + const TargetRegisterClass *RC, + unsigned Reg, EVT VT) const { + return CreateLiveInRegister(DAG, RC, Reg, VT, SDLoc(DAG.getEntryNode())); + } + + // Returns the raw live in register rather than a copy from it. + SDValue CreateLiveInRegisterRaw(SelectionDAG &DAG, + const TargetRegisterClass *RC, + unsigned Reg, EVT VT) const { + return CreateLiveInRegister(DAG, RC, Reg, VT, SDLoc(DAG.getEntryNode()), true); + } enum ImplicitParameter { FIRST_IMPLICIT, @@ -388,6 +403,8 @@ enum NodeType : unsigned { STORE_MSKOR, LOAD_CONSTANT, TBUFFER_STORE_FORMAT, + TBUFFER_STORE_FORMAT_X3, + TBUFFER_LOAD_FORMAT, ATOMIC_CMP_SWAP, ATOMIC_INC, ATOMIC_DEC, diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp index a01f5d37c7c16..69dc529861729 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp @@ -66,7 +66,9 @@ int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const { // This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td enum SIEncodingFamily { SI = 0, - VI = 1 + VI = 1, + SDWA = 2, + SDWA9 = 3 }; // Wrapper for Tablegen'd function. enum Subtarget is not defined in any @@ -101,7 +103,12 @@ static SIEncodingFamily subtargetEncodingFamily(const AMDGPUSubtarget &ST) { } int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const { - int MCOp = AMDGPU::getMCOpcode(Opcode, subtargetEncodingFamily(ST)); + SIEncodingFamily Gen = subtargetEncodingFamily(ST); + if (get(Opcode).TSFlags & SIInstrFlags::SDWA) + Gen = ST.getGeneration() == AMDGPUSubtarget::GFX9 ? SIEncodingFamily::SDWA9 + : SIEncodingFamily::SDWA; + + int MCOp = AMDGPU::getMCOpcode(Opcode, Gen); // -1 means that Opcode is already a native instruction. if (MCOp == -1) diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td index e286558ce60d7..bcf89bb78ad66 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -70,6 +70,10 @@ def AMDGPUElseBreakOp : SDTypeProfile<1, 2, [SDTCisVT<0, i64>, SDTCisVT<1, i64>, SDTCisVT<2, i64>] >; +def AMDGPUAddeSubeOp : SDTypeProfile<2, 3, + [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisVT<0, i32>, SDTCisVT<1, i1>, SDTCisVT<4, i1>] +>; + //===----------------------------------------------------------------------===// // AMDGPU DAG Nodes // @@ -179,6 +183,12 @@ def AMDGPUcarry : SDNode<"AMDGPUISD::CARRY", SDTIntBinOp, []>; // out = (src1 > src0) ? 1 : 0 def AMDGPUborrow : SDNode<"AMDGPUISD::BORROW", SDTIntBinOp, []>; +// TODO: remove AMDGPUadde/AMDGPUsube when ADDCARRY/SUBCARRY get their own +// nodes in TargetSelectionDAG.td. 
+def AMDGPUadde : SDNode<"ISD::ADDCARRY", AMDGPUAddeSubeOp, []>; + +def AMDGPUsube : SDNode<"ISD::SUBCARRY", AMDGPUAddeSubeOp, []>; + def AMDGPUSetCCOp : SDTypeProfile<1, 3, [ // setcc SDTCisVT<0, i64>, SDTCisSameAs<1, 2>, SDTCisVT<3, OtherVT> ]>; diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 790a69b843979..cc56216c355bf 100644 --- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -29,12 +29,20 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo() { using namespace TargetOpcode; const LLT S1= LLT::scalar(1); + const LLT V2S16 = LLT::vector(2, 16); const LLT S32 = LLT::scalar(32); const LLT S64 = LLT::scalar(64); const LLT P1 = LLT::pointer(1, 64); const LLT P2 = LLT::pointer(2, 64); setAction({G_ADD, S32}, Legal); + setAction({G_AND, S32}, Legal); + + setAction({G_BITCAST, V2S16}, Legal); + setAction({G_BITCAST, 1, S32}, Legal); + + setAction({G_BITCAST, S32}, Legal); + setAction({G_BITCAST, 1, V2S16}, Legal); // FIXME: i1 operands to intrinsics should always be legal, but other i1 // values may not be legal. We need to figure out how to distinguish @@ -61,6 +69,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo() { setAction({G_SELECT, S32}, Legal); setAction({G_SELECT, 1, S1}, Legal); + setAction({G_SHL, S32}, Legal); + setAction({G_STORE, S32}, Legal); setAction({G_STORE, 1, P1}, Legal); diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 8d157e2f98f24..ab5abf2039a5b 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -124,6 +124,11 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, HasScalarStores(false), HasInv2PiInlineImm(false), HasSDWA(false), + HasSDWAOmod(false), + HasSDWAScalar(false), + HasSDWASdst(false), + HasSDWAMac(false), + HasSDWAClampVOPC(false), HasDPP(false), FlatAddressSpace(false), FlatInstOffsets(false), diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index 5f4f20316a6ba..2b16289c723ef 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -149,6 +149,11 @@ protected: bool HasScalarStores; bool HasInv2PiInlineImm; bool HasSDWA; + bool HasSDWAOmod; + bool HasSDWAScalar; + bool HasSDWASdst; + bool HasSDWAMac; + bool HasSDWAClampVOPC; bool HasDPP; bool FlatAddressSpace; bool FlatInstOffsets; @@ -431,6 +436,26 @@ public: return HasSDWA; } + bool hasSDWAOmod() const { + return HasSDWAOmod; + } + + bool hasSDWAScalar() const { + return HasSDWAScalar; + } + + bool hasSDWASdst() const { + return HasSDWASdst; + } + + bool hasSDWAMac() const { + return HasSDWAMac; + } + + bool hasSDWAClampVOPC() const { + return HasSDWAClampVOPC; + } + /// \brief Returns the offset in bytes from the start of the input buffer /// of the first explicit kernel argument. 
unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const { diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index b644eba536fa4..04fe9f689806c 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -342,6 +342,14 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { PM.add(createAMDGPUExternalAAWrapperPass()); } }); + + Builder.addExtension( + PassManagerBuilder::EP_CGSCCOptimizerLate, + [](const PassManagerBuilder &, legacy::PassManagerBase &PM) { + // Add infer address spaces pass to the opt pipeline after inlining + // but before SROA to increase SROA opportunities. + PM.add(createInferAddressSpacesPass()); + }); } //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 0d6689bd04c4e..88245b01683a5 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -184,9 +184,9 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, } } -unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) { - if (Vec) - return 0; +unsigned AMDGPUTTIImpl::getHardwareNumberOfRegisters(bool Vec) const { + // The concept of vector registers doesn't really exist. Some packed vector + // operations operate on the normal 32-bit registers. // Number of VGPRs on SI. if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) @@ -195,8 +195,18 @@ unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) { return 4 * 128; // XXX - 4 channels. Should these count as vector instead? } +unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) const { + // This is really the number of registers to fill when vectorizing / + // interleaving loops, so we lie to avoid trying to use all registers. + return getHardwareNumberOfRegisters(Vec) >> 3; +} + unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) const { - return Vector ? 0 : 32; + return 32; +} + +unsigned AMDGPUTTIImpl::getMinVectorRegisterBitWidth() const { + return 32; } unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { @@ -247,11 +257,11 @@ bool AMDGPUTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) { // Disable unrolling if the loop is not vectorized. + // TODO: Enable this again. if (VF == 1) return 1; - // Semi-arbitrary large amount. 
- return 64; + return 8; } int AMDGPUTTIImpl::getArithmeticInstrCost( diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index a60b1bb1b59c7..485e20411ab49 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -75,8 +75,10 @@ public: return TTI::PSK_FastHardware; } - unsigned getNumberOfRegisters(bool Vector); - unsigned getRegisterBitWidth(bool Vector) const; + unsigned getHardwareNumberOfRegisters(bool Vector) const; + unsigned getNumberOfRegisters(bool Vector) const; + unsigned getRegisterBitWidth(bool Vector) const ; + unsigned getMinVectorRegisterBitWidth() const; unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const; bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 392e9d89bd9ba..7b8756050b752 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -152,6 +152,8 @@ public: ImmTyExpTgt, ImmTyExpCompr, ImmTyExpVM, + ImmTyDFMT, + ImmTyNFMT, ImmTyHwreg, ImmTyOff, ImmTySendMsg, @@ -260,6 +262,8 @@ public: return isOff() || isRegClass(AMDGPU::VGPR_32RegClassID); } + bool isSDWARegKind() const; + bool isImmTy(ImmTy ImmT) const { return isImm() && Imm.Type == ImmT; } @@ -292,6 +296,8 @@ public: bool isGLC() const { return isImmTy(ImmTyGLC); } bool isSLC() const { return isImmTy(ImmTySLC); } bool isTFE() const { return isImmTy(ImmTyTFE); } + bool isDFMT() const { return isImmTy(ImmTyDFMT) && isUInt<8>(getImm()); } + bool isNFMT() const { return isImmTy(ImmTyNFMT) && isUInt<8>(getImm()); } bool isBankMask() const { return isImmTy(ImmTyDppBankMask); } bool isRowMask() const { return isImmTy(ImmTyDppRowMask); } bool isBoundCtrl() const { return isImmTy(ImmTyDppBoundCtrl); } @@ -636,6 +642,8 @@ public: case ImmTyGLC: OS << "GLC"; break; case ImmTySLC: OS << "SLC"; break; case ImmTyTFE: OS << "TFE"; break; + case ImmTyDFMT: OS << "DFMT"; break; + case ImmTyNFMT: OS << "NFMT"; break; case ImmTyClampSI: OS << "ClampSI"; break; case ImmTyOModSI: OS << "OModSI"; break; case ImmTyDppCtrl: OS << "DppCtrl"; break; @@ -993,7 +1001,9 @@ private: void errorExpTgt(); OperandMatchResultTy parseExpTgtImpl(StringRef Str, uint8_t &Val); - bool validateOperandLimitations(const MCInst &Inst); + bool validateInstruction(const MCInst &Inst, const SMLoc &IDLoc); + bool validateConstantBusLimitations(const MCInst &Inst); + bool validateEarlyClobberLimitations(const MCInst &Inst); bool usesConstantBus(const MCInst &Inst, unsigned OpIdx); bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const; unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const; @@ -1029,6 +1039,8 @@ public: void cvtMubuf(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false); } void cvtMubufAtomic(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, false); } void cvtMubufAtomicReturn(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, true); } + void cvtMtbuf(MCInst &Inst, const OperandVector &Operands); + AMDGPUOperand::Ptr defaultGLC() const; AMDGPUOperand::Ptr defaultSLC() const; AMDGPUOperand::Ptr defaultTFE() const; @@ -1042,6 +1054,7 @@ public: AMDGPUOperand::Ptr defaultSMRDOffset20() const; AMDGPUOperand::Ptr defaultSMRDLiteralOffset() const; AMDGPUOperand::Ptr defaultOffsetU12() const; + AMDGPUOperand::Ptr defaultOffsetS13() 
const; OperandMatchResultTy parseOModOperand(OperandVector &Operands); @@ -1243,6 +1256,15 @@ bool AMDGPUOperand::isRegClass(unsigned RCID) const { return isRegKind() && AsmParser->getMRI()->getRegClass(RCID).contains(getReg()); } +bool AMDGPUOperand::isSDWARegKind() const { + if (AsmParser->isVI()) + return isVReg(); + else if (AsmParser->isGFX9()) + return isRegKind(); + else + return false; +} + uint64_t AMDGPUOperand::applyInputFPModifiers(uint64_t Val, unsigned Size) const { assert(isImmTy(ImmTyNone) && Imm.Mods.hasFPModifiers()); @@ -2083,7 +2105,7 @@ bool AMDGPUAsmParser::usesConstantBus(const MCInst &Inst, unsigned OpIdx) { isSGPR(mc2PseudoReg(MO.getReg()), getContext().getRegisterInfo()); } -bool AMDGPUAsmParser::validateOperandLimitations(const MCInst &Inst) { +bool AMDGPUAsmParser::validateConstantBusLimitations(const MCInst &Inst) { const unsigned Opcode = Inst.getOpcode(); const MCInstrDesc &Desc = MII.get(Opcode); unsigned ConstantBusUseCount = 0; @@ -2137,6 +2159,60 @@ bool AMDGPUAsmParser::validateOperandLimitations(const MCInst &Inst) { return ConstantBusUseCount <= 1; } +bool AMDGPUAsmParser::validateEarlyClobberLimitations(const MCInst &Inst) { + + const unsigned Opcode = Inst.getOpcode(); + const MCInstrDesc &Desc = MII.get(Opcode); + + const int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); + if (DstIdx == -1 || + Desc.getOperandConstraint(DstIdx, MCOI::EARLY_CLOBBER) == -1) { + return true; + } + + const MCRegisterInfo *TRI = getContext().getRegisterInfo(); + + const int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); + const int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); + const int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); + + assert(DstIdx != -1); + const MCOperand &Dst = Inst.getOperand(DstIdx); + assert(Dst.isReg()); + const unsigned DstReg = mc2PseudoReg(Dst.getReg()); + + const int SrcIndices[] = { Src0Idx, Src1Idx, Src2Idx }; + + for (int SrcIdx : SrcIndices) { + if (SrcIdx == -1) break; + const MCOperand &Src = Inst.getOperand(SrcIdx); + if (Src.isReg()) { + const unsigned SrcReg = mc2PseudoReg(Src.getReg()); + if (isRegIntersect(DstReg, SrcReg, TRI)) { + return false; + } + } + } + + return true; +} + +bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, + const SMLoc &IDLoc) { + if (!validateConstantBusLimitations(Inst)) { + Error(IDLoc, + "invalid operand (violates constant bus restrictions)"); + return false; + } + if (!validateEarlyClobberLimitations(Inst)) { + Error(IDLoc, + "destination must be different than all sources"); + return false; + } + + return true; +} + bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, @@ -2169,9 +2245,8 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, switch (Result) { default: break; case Match_Success: - if (!validateOperandLimitations(Inst)) { - return Error(IDLoc, - "invalid operand (violates constant bus restrictions)"); + if (!validateInstruction(Inst, IDLoc)) { + return true; } Inst.setLoc(IDLoc); Out.EmitInstruction(Inst, getSTI()); @@ -2554,11 +2629,21 @@ AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &Int) { return MatchOperand_ParseFail; Parser.Lex(); + + bool IsMinus = false; + if (getLexer().getKind() == AsmToken::Minus) { + Parser.Lex(); + IsMinus = true; + } + if (getLexer().isNot(AsmToken::Integer)) return MatchOperand_ParseFail; if (getParser().parseAbsoluteExpression(Int)) return 
MatchOperand_ParseFail; + + if (IsMinus) + Int = -Int; break; } } @@ -3743,6 +3828,44 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst, addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); } +void AMDGPUAsmParser::cvtMtbuf(MCInst &Inst, const OperandVector &Operands) { + OptionalImmIndexMap OptionalIdx; + + for (unsigned i = 1, e = Operands.size(); i != e; ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + + // Add the register arguments + if (Op.isReg()) { + Op.addRegOperands(Inst, 1); + continue; + } + + // Handle the case where soffset is an immediate + if (Op.isImm() && Op.getImmTy() == AMDGPUOperand::ImmTyNone) { + Op.addImmOperands(Inst, 1); + continue; + } + + // Handle tokens like 'offen' which are sometimes hard-coded into the + // asm string. There are no MCInst operands for these. + if (Op.isToken()) { + continue; + } + assert(Op.isImm()); + + // Handle optional arguments + OptionalIdx[Op.getImmTy()] = i; + } + + addOptionalImmOperand(Inst, Operands, OptionalIdx, + AMDGPUOperand::ImmTyOffset); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDFMT); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyNFMT); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); +} + //===----------------------------------------------------------------------===// // mimg //===----------------------------------------------------------------------===// @@ -3870,6 +3993,10 @@ AMDGPUOperand::Ptr AMDGPUAsmParser::defaultOffsetU12() const { return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset); } +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultOffsetS13() const { + return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset); +} + //===----------------------------------------------------------------------===// // vop3 //===----------------------------------------------------------------------===// @@ -3919,6 +4046,8 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = { {"offset1", AMDGPUOperand::ImmTyOffset1, false, nullptr}, {"gds", AMDGPUOperand::ImmTyGDS, true, nullptr}, {"offset", AMDGPUOperand::ImmTyOffset, false, nullptr}, + {"dfmt", AMDGPUOperand::ImmTyDFMT, false, nullptr}, + {"nfmt", AMDGPUOperand::ImmTyNFMT, false, nullptr}, {"glc", AMDGPUOperand::ImmTyGLC, true, nullptr}, {"slc", AMDGPUOperand::ImmTySLC, true, nullptr}, {"tfe", AMDGPUOperand::ImmTyTFE, true, nullptr}, @@ -4475,12 +4604,11 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, if (Inst.getOpcode() != AMDGPU::V_NOP_sdwa_gfx9 && Inst.getOpcode() != AMDGPU::V_NOP_sdwa_vi) { - // V_NOP_sdwa_vi has no optional sdwa arguments + // v_nop_sdwa_vi/gfx9 has no optional sdwa arguments switch (BasicInstType) { case SIInstrFlags::VOP1: addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0); - if (isGFX9() && - AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) { + if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) { addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI, 0); } addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD); @@ -4490,8 +4618,7 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, case SIInstrFlags::VOP2:
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0); - if (isGFX9() && - AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) { + if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) { addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI, 0); } addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD); @@ -4501,9 +4628,7 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, break; case SIInstrFlags::VOPC: - if (isVI()) { - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0); - } + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, SdwaSel::DWORD); break; diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td index 2aca65ac84303..2e96c14eaa320 100644 --- a/lib/Target/AMDGPU/BUFInstructions.td +++ b/lib/Target/AMDGPU/BUFInstructions.td @@ -57,6 +57,11 @@ class MUBUFAddr64Table <bit is_addr64, string suffix = ""> { string OpName = NAME # suffix; } +class MTBUFAddr64Table <bit is_addr64, string suffix = ""> { + bit IsAddr64 = is_addr64; + string OpName = NAME # suffix; +} + //===----------------------------------------------------------------------===// // MTBUF classes //===----------------------------------------------------------------------===// @@ -78,14 +83,31 @@ class MTBUF_Pseudo <string opName, dag outs, dag ins, let EXP_CNT = 1; let MTBUF = 1; let Uses = [EXEC]; - let hasSideEffects = 0; let SchedRW = [WriteVMEM]; + + let AsmMatchConverter = "cvtMtbuf"; + + bits<1> offen = 0; + bits<1> idxen = 0; + bits<1> addr64 = 0; + bits<1> has_vdata = 1; + bits<1> has_vaddr = 1; + bits<1> has_glc = 1; + bits<1> glc_value = 0; // the value for glc if no such operand + bits<4> dfmt_value = 1; // the value for dfmt if no such operand + bits<3> nfmt_value = 0; // the value for nfmt if no such operand + bits<1> has_srsrc = 1; + bits<1> has_soffset = 1; + bits<1> has_offset = 1; + bits<1> has_slc = 1; + bits<1> has_tfe = 1; + bits<1> has_dfmt = 1; + bits<1> has_nfmt = 1; } class MTBUF_Real <MTBUF_Pseudo ps> : - InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>, - Enc64 { + InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []> { let isPseudo = 0; let isCodeGenOnly = 0; @@ -97,57 +119,168 @@ class MTBUF_Real <MTBUF_Pseudo ps> : let DisableEncoding = ps.DisableEncoding; let TSFlags = ps.TSFlags; - bits<8> vdata; bits<12> offset; - bits<1> offen; - bits<1> idxen; - bits<1> glc; - bits<1> addr64; - bits<4> dfmt; - bits<3> nfmt; - bits<8> vaddr; - bits<7> srsrc; - bits<1> slc; - bits<1> tfe; - bits<8> soffset; - - let Inst{11-0} = offset; - let Inst{12} = offen; - let Inst{13} = idxen; - let Inst{14} = glc; - let Inst{22-19} = dfmt; - let Inst{25-23} = nfmt; - let Inst{31-26} = 0x3a; //encoding - let Inst{39-32} = vaddr; - let Inst{47-40} = vdata; - let Inst{52-48} = srsrc{6-2}; - let Inst{54} = slc; - let Inst{55} = tfe; - let Inst{63-56} = soffset; + bits<1> glc; + bits<4> dfmt; + bits<3> nfmt; + bits<8> vaddr; + bits<8> vdata; + bits<7> srsrc; + bits<1> slc; + bits<1> tfe; + bits<8> soffset; +} + +class getMTBUFInsDA<list<RegisterClass> vdataList, + list<RegisterClass> vaddrList=[]> { + RegisterClass 
vdataClass = !if(!empty(vdataList), ?, !head(vdataList)); + RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); + dag InsNoData = !if(!empty(vaddrList), + (ins SReg_128:$srsrc, SCSrc_b32:$soffset, + offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, slc:$slc, tfe:$tfe), + (ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, + offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, slc:$slc, tfe:$tfe) + ); + dag InsData = !if(!empty(vaddrList), + (ins vdataClass:$vdata, SReg_128:$srsrc, + SCSrc_b32:$soffset, offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, + slc:$slc, tfe:$tfe), + (ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc, + SCSrc_b32:$soffset, offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, + slc:$slc, tfe:$tfe) + ); + dag ret = !if(!empty(vdataList), InsNoData, InsData); } -class MTBUF_Load_Pseudo <string opName, RegisterClass regClass> : MTBUF_Pseudo < - opName, (outs regClass:$dst), - (ins u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, - i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, SReg_128:$srsrc, - i1imm:$slc, i1imm:$tfe, SCSrc_b32:$soffset), - " $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"# - " $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset"> { +class getMTBUFIns<int addrKind, list<RegisterClass> vdataList=[]> { + dag ret = + !if(!eq(addrKind, BUFAddrKind.Offset), getMTBUFInsDA<vdataList>.ret, + !if(!eq(addrKind, BUFAddrKind.OffEn), getMTBUFInsDA<vdataList, [VGPR_32]>.ret, + !if(!eq(addrKind, BUFAddrKind.IdxEn), getMTBUFInsDA<vdataList, [VGPR_32]>.ret, + !if(!eq(addrKind, BUFAddrKind.BothEn), getMTBUFInsDA<vdataList, [VReg_64]>.ret, + !if(!eq(addrKind, BUFAddrKind.Addr64), getMTBUFInsDA<vdataList, [VReg_64]>.ret, + (ins)))))); +} + +class getMTBUFAsmOps<int addrKind> { + string Pfx = + !if(!eq(addrKind, BUFAddrKind.Offset), "off, $srsrc, $dfmt, $nfmt, $soffset", + !if(!eq(addrKind, BUFAddrKind.OffEn), + "$vaddr, $srsrc, $dfmt, $nfmt, $soffset offen", + !if(!eq(addrKind, BUFAddrKind.IdxEn), + "$vaddr, $srsrc, $dfmt, $nfmt, $soffset idxen", + !if(!eq(addrKind, BUFAddrKind.BothEn), + "$vaddr, $srsrc, $dfmt, $nfmt, $soffset idxen offen", + !if(!eq(addrKind, BUFAddrKind.Addr64), + "$vaddr, $srsrc, $dfmt, $nfmt, $soffset addr64", + ""))))); + string ret = Pfx # "$offset"; +} + +class MTBUF_SetupAddr<int addrKind> { + bits<1> offen = !if(!eq(addrKind, BUFAddrKind.OffEn), 1, + !if(!eq(addrKind, BUFAddrKind.BothEn), 1 , 0)); + + bits<1> idxen = !if(!eq(addrKind, BUFAddrKind.IdxEn), 1, + !if(!eq(addrKind, BUFAddrKind.BothEn), 1 , 0)); + + bits<1> addr64 = !if(!eq(addrKind, BUFAddrKind.Addr64), 1, 0); + + bits<1> has_vaddr = !if(!eq(addrKind, BUFAddrKind.Offset), 0, 1); +} + +class MTBUF_Load_Pseudo <string opName, + int addrKind, + RegisterClass vdataClass, + list<dag> pattern=[], + // Workaround bug bz30254 + int addrKindCopy = addrKind> + : MTBUF_Pseudo<opName, + (outs vdataClass:$vdata), + getMTBUFIns<addrKindCopy>.ret, + " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe", + pattern>, + MTBUF_SetupAddr<addrKindCopy> { + let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret; let mayLoad = 1; let mayStore = 0; } -class MTBUF_Store_Pseudo <string opName, RegisterClass regClass> : MTBUF_Pseudo < - opName, (outs), - (ins regClass:$vdata, u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, - i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, - SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SCSrc_b32:$soffset), - " $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"# - " $nfmt, $vaddr, $srsrc, $slc, 
$tfe, $soffset"> { +multiclass MTBUF_Pseudo_Loads<string opName, RegisterClass vdataClass, + ValueType load_vt = i32, + SDPatternOperator ld = null_frag> { + + def _OFFSET : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, + [(set load_vt:$vdata, + (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i8:$dfmt, + i8:$nfmt, i1:$glc, i1:$slc, i1:$tfe)))]>, + MTBUFAddr64Table<0>; + + def _ADDR64 : MTBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, + [(set load_vt:$vdata, + (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, + i8:$dfmt, i8:$nfmt, i1:$glc, i1:$slc, i1:$tfe)))]>, + MTBUFAddr64Table<1>; + + def _OFFEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; + def _IDXEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; + def _BOTHEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; + + let DisableWQM = 1 in { + def _OFFSET_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass>; + def _OFFEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; + def _IDXEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; + def _BOTHEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; + } +} + +class MTBUF_Store_Pseudo <string opName, + int addrKind, + RegisterClass vdataClass, + list<dag> pattern=[], + // Workaround bug bz30254 + int addrKindCopy = addrKind, + RegisterClass vdataClassCopy = vdataClass> + : MTBUF_Pseudo<opName, + (outs), + getMTBUFIns<addrKindCopy, [vdataClassCopy]>.ret, + " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe", + pattern>, + MTBUF_SetupAddr<addrKindCopy> { + let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret; let mayLoad = 0; let mayStore = 1; } +multiclass MTBUF_Pseudo_Stores<string opName, RegisterClass vdataClass, + ValueType store_vt = i32, + SDPatternOperator st = null_frag> { + + def _OFFSET : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass, + [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, + i16:$offset, i8:$dfmt, i8:$nfmt, i1:$glc, + i1:$slc, i1:$tfe))]>, + MTBUFAddr64Table<0>; + + def _ADDR64 : MTBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, + [(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, + i16:$offset, i8:$dfmt, i8:$nfmt, i1:$glc, + i1:$slc, i1:$tfe))]>, + MTBUFAddr64Table<1>; + + def _OFFEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; + def _IDXEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; + def _BOTHEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; + + let DisableWQM = 1 in { + def _OFFSET_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass>; + def _OFFEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; + def _IDXEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; + def _BOTHEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; + } +} + + //===----------------------------------------------------------------------===// // MUBUF classes //===----------------------------------------------------------------------===// @@ -676,14 +809,14 @@ def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1", // MTBUF Instructions //===----------------------------------------------------------------------===// -//def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0, "tbuffer_load_format_x", []>; -//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <1, "tbuffer_load_format_xy", []>; -//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <2, "tbuffer_load_format_xyz", []>; -def 
TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Pseudo <"tbuffer_load_format_xyzw", VReg_128>; -def TBUFFER_STORE_FORMAT_X : MTBUF_Store_Pseudo <"tbuffer_store_format_x", VGPR_32>; -def TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Pseudo <"tbuffer_store_format_xy", VReg_64>; -def TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Pseudo <"tbuffer_store_format_xyz", VReg_128>; -def TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Pseudo <"tbuffer_store_format_xyzw", VReg_128>; +defm TBUFFER_LOAD_FORMAT_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_x", VGPR_32>; +defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_xy", VReg_64>; +defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", VReg_128>; +defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyzw", VReg_128>; +defm TBUFFER_STORE_FORMAT_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_x", VGPR_32>; +defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy", VReg_64>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_128>; +defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128>; } // End let SubtargetPredicate = isGCN @@ -1093,22 +1226,98 @@ defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, BUFFER_STORE_DWORDX4_OF // MTBUF Patterns //===----------------------------------------------------------------------===// -// TBUFFER_STORE_FORMAT_*, addr64=0 -class MTBUF_StoreResource <ValueType vt, int num_channels, MTBUF_Pseudo opcode> : Pat< - (SItbuffer_store v4i32:$rsrc, vt:$vdata, num_channels, i32:$vaddr, - i32:$soffset, imm:$inst_offset, imm:$dfmt, - imm:$nfmt, imm:$offen, imm:$idxen, - imm:$glc, imm:$slc, imm:$tfe), - (opcode - $vdata, (as_i16imm $inst_offset), (as_i1imm $offen), (as_i1imm $idxen), - (as_i1imm $glc), 0, (as_i8imm $dfmt), (as_i8imm $nfmt), $vaddr, $rsrc, - (as_i1imm $slc), (as_i1imm $tfe), $soffset) ->; +//===----------------------------------------------------------------------===// +// tbuffer_load/store_format patterns +//===----------------------------------------------------------------------===// + +multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, + string opcode> { + def : Pat< + (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset, + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)), + (!cast<MTBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset), + (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + >; + + def : Pat< + (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset, + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)), + (!cast<MTBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset), + (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + >; + + def : Pat< + (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset, + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)), + (!cast<MTBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset), + (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + >; -def : MTBUF_StoreResource <i32, 1, TBUFFER_STORE_FORMAT_X>; -def : MTBUF_StoreResource <v2i32, 2, TBUFFER_STORE_FORMAT_XY>; -def : MTBUF_StoreResource <v4i32, 3, TBUFFER_STORE_FORMAT_XYZ>; -def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>; + def : Pat< + (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset, + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)), + (!cast<MTBUF_Pseudo>(opcode # _BOTHEN) + (REG_SEQUENCE VReg_64, 
$vindex, sub0, $voffset, sub1), + $rsrc, $soffset, (as_i16imm $offset), + (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + >; +} + +defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, i32, "TBUFFER_LOAD_FORMAT_X">; +defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2i32, "TBUFFER_LOAD_FORMAT_XY">; +defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4i32, "TBUFFER_LOAD_FORMAT_XYZW">; +defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, f32, "TBUFFER_LOAD_FORMAT_X">; +defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2f32, "TBUFFER_LOAD_FORMAT_XY">; +defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4f32, "TBUFFER_LOAD_FORMAT_XYZW">; + +multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, + string opcode> { + def : Pat< + (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset, + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc), + (!cast<MTBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, + (as_i16imm $offset), (as_i8imm $dfmt), + (as_i8imm $nfmt), (as_i1imm $glc), + (as_i1imm $slc), 0) + >; + + def : Pat< + (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset, + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc), + (!cast<MTBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset, + (as_i16imm $offset), (as_i8imm $dfmt), + (as_i8imm $nfmt), (as_i1imm $glc), + (as_i1imm $slc), 0) + >; + + def : Pat< + (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset, + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc), + (!cast<MTBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset, + (as_i16imm $offset), (as_i8imm $dfmt), + (as_i8imm $nfmt), (as_i1imm $glc), + (as_i1imm $slc), 0) + >; + + def : Pat< + (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, + imm:$offset, imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc), + (!cast<MTBUF_Pseudo>(opcode # _BOTHEN_exact) + $vdata, + (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), + $rsrc, $soffset, (as_i16imm $offset), + (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + >; +} + +defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, i32, "TBUFFER_STORE_FORMAT_X">; +defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v2i32, "TBUFFER_STORE_FORMAT_XY">; +defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_x3, v4i32, "TBUFFER_STORE_FORMAT_XYZ">; +defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v4i32, "TBUFFER_STORE_FORMAT_XYZW">; +defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, f32, "TBUFFER_STORE_FORMAT_X">; +defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v2f32, "TBUFFER_STORE_FORMAT_XY">; +defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_x3, v4f32, "TBUFFER_STORE_FORMAT_XYZ">; +defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v4f32, "TBUFFER_STORE_FORMAT_XYZW">; } // End let Predicates = [isGCN] @@ -1224,21 +1433,44 @@ def BUFFER_WBINVL1_si : MUBUF_Real_si <0x71, BUFFER_WBINVL1>; class MTBUF_Real_si <bits<3> op, MTBUF_Pseudo ps> : MTBUF_Real<ps>, + Enc64, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.SI> { let AssemblerPredicate=isSICI; let DecoderNamespace="SICI"; - bits<1> addr64; - let Inst{15} = addr64; + let Inst{11-0} = !if(ps.has_offset, offset, ?); + let Inst{12} = ps.offen; + let Inst{13} = ps.idxen; + let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); + let Inst{15} = ps.addr64; let Inst{18-16} = op; + let Inst{22-19} = !if(ps.has_dfmt, dfmt, ps.dfmt_value); + let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value); + let Inst{31-26} = 0x3a; //encoding + let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); + let Inst{47-40} = !if(ps.has_vdata, vdata, ?); + 
let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?); + let Inst{54} = !if(ps.has_slc, slc, ?); + let Inst{55} = !if(ps.has_tfe, tfe, ?); + let Inst{63-56} = !if(ps.has_soffset, soffset, ?); } -def TBUFFER_LOAD_FORMAT_XYZW_si : MTBUF_Real_si <3, TBUFFER_LOAD_FORMAT_XYZW>; -def TBUFFER_STORE_FORMAT_X_si : MTBUF_Real_si <4, TBUFFER_STORE_FORMAT_X>; -def TBUFFER_STORE_FORMAT_XY_si : MTBUF_Real_si <5, TBUFFER_STORE_FORMAT_XY>; -def TBUFFER_STORE_FORMAT_XYZ_si : MTBUF_Real_si <6, TBUFFER_STORE_FORMAT_XYZ>; -def TBUFFER_STORE_FORMAT_XYZW_si : MTBUF_Real_si <7, TBUFFER_STORE_FORMAT_XYZW>; +multiclass MTBUF_Real_AllAddr_si<bits<3> op> { + def _OFFSET_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET")>; + def _ADDR64_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_ADDR64")>; + def _OFFEN_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN")>; + def _IDXEN_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN")>; + def _BOTHEN_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>; +} +defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_si <0>; +defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_si <1>; +//defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_si <2>; +defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_si <3>; +defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_si <4>; +defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_si <5>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_si <6>; +defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_si <7>; //===----------------------------------------------------------------------===// // CI @@ -1350,16 +1582,39 @@ def BUFFER_WBINVL1_VOL_vi : MUBUF_Real_vi <0x3f, BUFFER_WBINVL1_VOL>; class MTBUF_Real_vi <bits<4> op, MTBUF_Pseudo ps> : MTBUF_Real<ps>, + Enc64, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.VI> { let AssemblerPredicate=isVI; let DecoderNamespace="VI"; + let Inst{11-0} = !if(ps.has_offset, offset, ?); + let Inst{12} = ps.offen; + let Inst{13} = ps.idxen; + let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); let Inst{18-15} = op; + let Inst{22-19} = !if(ps.has_dfmt, dfmt, ps.dfmt_value); + let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value); + let Inst{31-26} = 0x3a; //encoding + let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); + let Inst{47-40} = !if(ps.has_vdata, vdata, ?); + let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?); + let Inst{54} = !if(ps.has_slc, slc, ?); + let Inst{55} = !if(ps.has_tfe, tfe, ?); + let Inst{63-56} = !if(ps.has_soffset, soffset, ?); } -def TBUFFER_LOAD_FORMAT_XYZW_vi : MTBUF_Real_vi <3, TBUFFER_LOAD_FORMAT_XYZW>; -def TBUFFER_STORE_FORMAT_X_vi : MTBUF_Real_vi <4, TBUFFER_STORE_FORMAT_X>; -def TBUFFER_STORE_FORMAT_XY_vi : MTBUF_Real_vi <5, TBUFFER_STORE_FORMAT_XY>; -def TBUFFER_STORE_FORMAT_XYZ_vi : MTBUF_Real_vi <6, TBUFFER_STORE_FORMAT_XYZ>; -def TBUFFER_STORE_FORMAT_XYZW_vi : MTBUF_Real_vi <7, TBUFFER_STORE_FORMAT_XYZW>; +multiclass MTBUF_Real_AllAddr_vi<bits<4> op> { + def _OFFSET_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET")>; + def _OFFEN_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN")>; + def _IDXEN_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN")>; + def _BOTHEN_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>; +} +defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_vi <0>; +defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_vi <1>; +//defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <2>; +defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <3>; +defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_vi <4>; +defm TBUFFER_STORE_FORMAT_XY : 
MTBUF_Real_AllAddr_vi <5>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <6>; +defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <7>; diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 88c92b9582fd0..04308fb3aaf64 100644 --- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -49,6 +49,17 @@ addOperand(MCInst &Inst, const MCOperand& Opnd) { MCDisassembler::SoftFail; } +static int insertNamedMCOperand(MCInst &MI, const MCOperand &Op, + uint16_t NameIdx) { + int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), NameIdx); + if (OpIdx != -1) { + auto I = MI.begin(); + std::advance(I, OpIdx); + MI.insert(I, Op); + } + return OpIdx; +} + static DecodeStatus decodeSoppBrTarget(MCInst &Inst, unsigned Imm, uint64_t Addr, const void *Decoder) { auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); @@ -106,12 +117,12 @@ static DecodeStatus decodeOperand_VSrcV216(MCInst &Inst, return addOperand(Inst, DAsm->decodeOperand_VSrcV216(Imm)); } -#define DECODE_SDWA9(DecName) \ -DECODE_OPERAND(decodeSDWA9##DecName, decodeSDWA9##DecName) +#define DECODE_SDWA(DecName) \ +DECODE_OPERAND(decodeSDWA##DecName, decodeSDWA##DecName) -DECODE_SDWA9(Src32) -DECODE_SDWA9(Src16) -DECODE_SDWA9(VopcDst) +DECODE_SDWA(Src32) +DECODE_SDWA(Src16) +DECODE_SDWA(VopcDst) #include "AMDGPUGenDisassemblerTables.inc" @@ -149,6 +160,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, raw_ostream &WS, raw_ostream &CS) const { CommentStream = &CS; + bool IsSDWA = false; // ToDo: AMDGPUDisassembler supports only VI ISA. if (!STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding]) @@ -170,10 +182,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, if (Res) break; Res = tryDecodeInst(DecoderTableSDWA64, MI, QW, Address); - if (Res) break; + if (Res) { IsSDWA = true; break; } Res = tryDecodeInst(DecoderTableSDWA964, MI, QW, Address); - if (Res) break; + if (Res) { IsSDWA = true; break; } } // Reinitialize Bytes as DPP64 could have eaten too much @@ -200,17 +212,36 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, MI.getOpcode() == AMDGPU::V_MAC_F32_e64_si || MI.getOpcode() == AMDGPU::V_MAC_F16_e64_vi)) { // Insert dummy unused src2_modifiers. - int Src2ModIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), - AMDGPU::OpName::src2_modifiers); - auto I = MI.begin(); - std::advance(I, Src2ModIdx); - MI.insert(I, MCOperand::createImm(0)); + insertNamedMCOperand(MI, MCOperand::createImm(0), + AMDGPU::OpName::src2_modifiers); } + if (Res && IsSDWA) + Res = convertSDWAInst(MI); + Size = Res ? 
(MaxInstBytesNum - Bytes.size()) : 0; return Res; } +DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const { + if (STI.getFeatureBits()[AMDGPU::FeatureGFX9]) { + if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sdst) != -1) + // VOPC - insert clamp + insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::clamp); + } else if (STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands]) { + int SDst = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sdst); + if (SDst != -1) { + // VOPC - insert VCC register as sdst + insertNamedMCOperand(MI, MCOperand::createReg(AMDGPU::VCC), + AMDGPU::OpName::sdst); + } else { + // VOP1/2 - insert omod if present in instruction + insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::omod); + } + } + return MCDisassembler::Success; +} + const char* AMDGPUDisassembler::getRegClassName(unsigned RegClassID) const { return getContext().getRegisterInfo()-> getRegClassName(&AMDGPUMCRegisterClasses[RegClassID]); @@ -524,8 +555,6 @@ MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val) c return createSRegOperand(getTtmpClassId(Width), Val - TTMP_MIN); } - assert(Width == OPW16 || Width == OPW32 || Width == OPW64); - if (INLINE_INTEGER_C_MIN <= Val && Val <= INLINE_INTEGER_C_MAX) return decodeIntImmed(Val); @@ -592,36 +621,43 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const { return errOperand(Val, "unknown operand encoding " + Twine(Val)); } -MCOperand AMDGPUDisassembler::decodeSDWA9Src(const OpWidthTy Width, - unsigned Val) const { +MCOperand AMDGPUDisassembler::decodeSDWASrc(const OpWidthTy Width, + unsigned Val) const { using namespace AMDGPU::SDWA; - if (SDWA9EncValues::SRC_VGPR_MIN <= Val && - Val <= SDWA9EncValues::SRC_VGPR_MAX) { - return createRegOperand(getVgprClassId(Width), - Val - SDWA9EncValues::SRC_VGPR_MIN); - } - if (SDWA9EncValues::SRC_SGPR_MIN <= Val && - Val <= SDWA9EncValues::SRC_SGPR_MAX) { - return createSRegOperand(getSgprClassId(Width), - Val - SDWA9EncValues::SRC_SGPR_MIN); - } + if (STI.getFeatureBits()[AMDGPU::FeatureGFX9]) { + if (SDWA9EncValues::SRC_VGPR_MIN <= Val && + Val <= SDWA9EncValues::SRC_VGPR_MAX) { + return createRegOperand(getVgprClassId(Width), + Val - SDWA9EncValues::SRC_VGPR_MIN); + } + if (SDWA9EncValues::SRC_SGPR_MIN <= Val && + Val <= SDWA9EncValues::SRC_SGPR_MAX) { + return createSRegOperand(getSgprClassId(Width), + Val - SDWA9EncValues::SRC_SGPR_MIN); + } - return decodeSpecialReg32(Val - SDWA9EncValues::SRC_SGPR_MIN); + return decodeSpecialReg32(Val - SDWA9EncValues::SRC_SGPR_MIN); + } else if (STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands]) { + return createRegOperand(getVgprClassId(Width), Val); + } + llvm_unreachable("unsupported target"); } -MCOperand AMDGPUDisassembler::decodeSDWA9Src16(unsigned Val) const { - return decodeSDWA9Src(OPW16, Val); +MCOperand AMDGPUDisassembler::decodeSDWASrc16(unsigned Val) const { + return decodeSDWASrc(OPW16, Val); } -MCOperand AMDGPUDisassembler::decodeSDWA9Src32(unsigned Val) const { - return decodeSDWA9Src(OPW32, Val); +MCOperand AMDGPUDisassembler::decodeSDWASrc32(unsigned Val) const { + return decodeSDWASrc(OPW32, Val); } -MCOperand AMDGPUDisassembler::decodeSDWA9VopcDst(unsigned Val) const { +MCOperand AMDGPUDisassembler::decodeSDWAVopcDst(unsigned Val) const { using namespace AMDGPU::SDWA; + assert(STI.getFeatureBits()[AMDGPU::FeatureGFX9] && + "SDWAVopcDst should be present only on GFX9"); if (Val & SDWA9EncValues::VOPC_DST_VCC_MASK) { Val &= 
SDWA9EncValues::VOPC_DST_SGPR_MASK; if (Val > AMDGPU::EncValues::SGPR_MAX) { diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index 5fa3cf1a223fa..3d71db909e20d 100644 --- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -65,6 +65,8 @@ public: uint64_t Inst, uint64_t Address) const; + DecodeStatus convertSDWAInst(MCInst &MI) const; + MCOperand decodeOperand_VGPR_32(unsigned Val) const; MCOperand decodeOperand_VS_32(unsigned Val) const; MCOperand decodeOperand_VS_64(unsigned Val) const; @@ -105,10 +107,10 @@ public: MCOperand decodeSpecialReg32(unsigned Val) const; MCOperand decodeSpecialReg64(unsigned Val) const; - MCOperand decodeSDWA9Src(const OpWidthTy Width, unsigned Val) const; - MCOperand decodeSDWA9Src16(unsigned Val) const; - MCOperand decodeSDWA9Src32(unsigned Val) const; - MCOperand decodeSDWA9VopcDst(unsigned Val) const; + MCOperand decodeSDWASrc(const OpWidthTy Width, unsigned Val) const; + MCOperand decodeSDWASrc16(unsigned Val) const; + MCOperand decodeSDWASrc32(unsigned Val) const; + MCOperand decodeSDWAVopcDst(unsigned Val) const; }; //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/FLATInstructions.td b/lib/Target/AMDGPU/FLATInstructions.td index 98eda288bcacb..edca6fcd812c8 100644 --- a/lib/Target/AMDGPU/FLATInstructions.td +++ b/lib/Target/AMDGPU/FLATInstructions.td @@ -31,8 +31,6 @@ class FLAT_Pseudo<string opName, dag outs, dag ins, let VM_CNT = 1; let LGKM_CNT = 1; - let Uses = [EXEC, FLAT_SCR]; // M0 - let UseNamedOperandTable = 1; let hasSideEffects = 0; let SchedRW = [WriteVMEM]; @@ -40,10 +38,16 @@ class FLAT_Pseudo<string opName, dag outs, dag ins, string Mnemonic = opName; string AsmOperands = asmOps; + bits<1> is_flat_global = 0; + bits<1> is_flat_scratch = 0; + bits<1> has_vdst = 1; bits<1> has_data = 1; bits<1> has_glc = 1; bits<1> glcValue = 0; + + // TODO: M0 if it could possibly access LDS (before gfx9? only)? + let Uses = !if(is_flat_global, [EXEC], [EXEC, FLAT_SCR]); } class FLAT_Real <bits<7> op, FLAT_Pseudo ps> : @@ -68,7 +72,10 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> : // Only valid on gfx9 bits<1> lds = 0; // XXX - What does this actually do? - bits<2> seg; // Segment, 00=flat, 01=scratch, 10=global, 11=reserved + + // Segment, 00=flat, 01=scratch, 10=global, 11=reserved + bits<2> seg = !if(ps.is_flat_global, 0b10, + !if(ps.is_flat_scratch, 0b01, 0)); // Signed offset. Highest bit ignored for flat and treated as 12-bit // unsigned for flat accesses.
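A minimal standalone sketch of the segment selection the new seg field encodes, mirroring the TableGen !if chain above (hypothetical C++ names, not LLVM code):

enum class FlatSeg : unsigned { Flat = 0b00, Scratch = 0b01, Global = 0b10 }; // 0b11 reserved

static FlatSeg selectSeg(bool IsFlatGlobal, bool IsFlatScratch) {
  // Mirrors the nested !if: global takes precedence, then scratch, else flat.
  if (IsFlatGlobal)
    return FlatSeg::Global;
  if (IsFlatScratch)
    return FlatSeg::Scratch;
  return FlatSeg::Flat;
}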
@@ -81,7 +88,7 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> : // Only valid on GFX9+ let Inst{12-0} = offset; let Inst{13} = lds; - let Inst{15-14} = 0; + let Inst{15-14} = seg; let Inst{16} = !if(ps.has_glc, glc, ps.glcValue); let Inst{17} = slc; @@ -106,6 +113,16 @@ class FLAT_Load_Pseudo <string opName, RegisterClass regClass, let mayLoad = 1; } +class FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass> : + FLAT_Load_Pseudo<opName, regClass, 1> { + let is_flat_global = 1; +} + +class FLAT_Scratch_Load_Pseudo<string opName, RegisterClass regClass> : + FLAT_Load_Pseudo<opName, regClass, 1> { + let is_flat_scratch = 1; +} + class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass, bit HasSignedOffset = 0> : FLAT_Pseudo< opName, @@ -119,6 +136,16 @@ class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass, let has_vdst = 0; } +class FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> : + FLAT_Store_Pseudo<opName, regClass, 1> { + let is_flat_global = 1; +} + +class FLAT_Scratch_Store_Pseudo<string opName, RegisterClass regClass> : + FLAT_Store_Pseudo<opName, regClass, 1> { + let is_flat_scratch = 1; +} + multiclass FLAT_Atomic_Pseudo< string opName, RegisterClass vdst_rc, @@ -306,6 +333,26 @@ defm FLAT_ATOMIC_FMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmax_x2", } // End SubtargetPredicate = isCI +let SubtargetPredicate = HasFlatGlobalInsts in { +def GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo <"global_load_ubyte", VGPR_32>; +def GLOBAL_LOAD_SBYTE : FLAT_Global_Load_Pseudo <"global_load_sbyte", VGPR_32>; +def GLOBAL_LOAD_USHORT : FLAT_Global_Load_Pseudo <"global_load_ushort", VGPR_32>; +def GLOBAL_LOAD_SSHORT : FLAT_Global_Load_Pseudo <"global_load_sshort", VGPR_32>; +def GLOBAL_LOAD_DWORD : FLAT_Global_Load_Pseudo <"global_load_dword", VGPR_32>; +def GLOBAL_LOAD_DWORDX2 : FLAT_Global_Load_Pseudo <"global_load_dwordx2", VReg_64>; +def GLOBAL_LOAD_DWORDX3 : FLAT_Global_Load_Pseudo <"global_load_dwordx3", VReg_96>; +def GLOBAL_LOAD_DWORDX4 : FLAT_Global_Load_Pseudo <"global_load_dwordx4", VReg_128>; + +def GLOBAL_STORE_BYTE : FLAT_Global_Store_Pseudo <"global_store_byte", VGPR_32>; +def GLOBAL_STORE_SHORT : FLAT_Global_Store_Pseudo <"global_store_short", VGPR_32>; +def GLOBAL_STORE_DWORD : FLAT_Global_Store_Pseudo <"global_store_dword", VGPR_32>; +def GLOBAL_STORE_DWORDX2 : FLAT_Global_Store_Pseudo <"global_store_dwordx2", VReg_64>; +def GLOBAL_STORE_DWORDX3 : FLAT_Global_Store_Pseudo <"global_store_dwordx3", VReg_96>; +def GLOBAL_STORE_DWORDX4 : FLAT_Global_Store_Pseudo <"global_store_dwordx4", VReg_128>; + +} // End SubtargetPredicate = HasFlatGlobalInsts + + //===----------------------------------------------------------------------===// // Flat Patterns //===----------------------------------------------------------------------===// @@ -557,3 +604,18 @@ defm FLAT_ATOMIC_XOR_X2 : FLAT_Real_Atomics_vi <0x6a, FLAT_ATOMIC_XOR_X2>; defm FLAT_ATOMIC_INC_X2 : FLAT_Real_Atomics_vi <0x6b, FLAT_ATOMIC_INC_X2>; defm FLAT_ATOMIC_DEC_X2 : FLAT_Real_Atomics_vi <0x6c, FLAT_ATOMIC_DEC_X2>; +def GLOBAL_LOAD_UBYTE_vi : FLAT_Real_vi <0x10, GLOBAL_LOAD_UBYTE>; +def GLOBAL_LOAD_SBYTE_vi : FLAT_Real_vi <0x11, GLOBAL_LOAD_SBYTE>; +def GLOBAL_LOAD_USHORT_vi : FLAT_Real_vi <0x12, GLOBAL_LOAD_USHORT>; +def GLOBAL_LOAD_SSHORT_vi : FLAT_Real_vi <0x13, GLOBAL_LOAD_SSHORT>; +def GLOBAL_LOAD_DWORD_vi : FLAT_Real_vi <0x14, GLOBAL_LOAD_DWORD>; +def GLOBAL_LOAD_DWORDX2_vi : FLAT_Real_vi <0x15, GLOBAL_LOAD_DWORDX2>; +def GLOBAL_LOAD_DWORDX4_vi : FLAT_Real_vi <0x17, 
GLOBAL_LOAD_DWORDX4>; +def GLOBAL_LOAD_DWORDX3_vi : FLAT_Real_vi <0x16, GLOBAL_LOAD_DWORDX3>; + +def GLOBAL_STORE_BYTE_vi : FLAT_Real_vi <0x18, GLOBAL_STORE_BYTE>; +def GLOBAL_STORE_SHORT_vi : FLAT_Real_vi <0x1a, GLOBAL_STORE_SHORT>; +def GLOBAL_STORE_DWORD_vi : FLAT_Real_vi <0x1c, GLOBAL_STORE_DWORD>; +def GLOBAL_STORE_DWORDX2_vi : FLAT_Real_vi <0x1d, GLOBAL_STORE_DWORDX2>; +def GLOBAL_STORE_DWORDX4_vi : FLAT_Real_vi <0x1f, GLOBAL_STORE_DWORDX4>; +def GLOBAL_STORE_DWORDX3_vi : FLAT_Real_vi <0x1e, GLOBAL_STORE_DWORDX3>; diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp index b84640230eeeb..7c31c8e397ba7 100644 --- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp +++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp @@ -72,6 +72,11 @@ void AMDGPUInstPrinter::printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, O << formatDec(MI->getOperand(OpNo).getImm() & 0xffff); } +void AMDGPUInstPrinter::printS16ImmDecOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << formatDec(static_cast<int16_t>(MI->getOperand(OpNo).getImm())); +} + void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -118,6 +123,16 @@ void AMDGPUInstPrinter::printOffset(const MCInst *MI, unsigned OpNo, } } +void AMDGPUInstPrinter::printOffsetS13(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + uint16_t Imm = MI->getOperand(OpNo).getImm(); + if (Imm != 0) { + O << ((OpNo == 0)? "offset:" : " offset:"); + printS16ImmDecOperand(MI, OpNo, O); + } +} + void AMDGPUInstPrinter::printOffset0(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -216,6 +231,24 @@ void AMDGPUInstPrinter::printExpVM(const MCInst *MI, unsigned OpNo, O << " vm"; } +void AMDGPUInstPrinter::printDFMT(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) { + O << " dfmt:"; + printU8ImmDecOperand(MI, OpNo, O); + } +} + +void AMDGPUInstPrinter::printNFMT(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) { + O << " nfmt:"; + printU8ImmDecOperand(MI, OpNo, O); + } +} + void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O, const MCRegisterInfo &MRI) { switch (RegNo) { @@ -379,7 +412,6 @@ void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O) { uint16_t Lo16 = static_cast<uint16_t>(Imm); - assert(Lo16 == static_cast<uint16_t>(Imm >> 16)); printImmediate16(Lo16, STI, O); } diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h index c8094c4b840a1..7bbf99a85f409 100644 --- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h +++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h @@ -42,6 +42,7 @@ private: void printU4ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printS16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printU32ImmOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printNamedBit(const MCInst *MI, unsigned OpNo, raw_ostream &O, @@ -52,6 +53,9 @@ private: void printMBUFOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); void 
printOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printOffsetS13(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printOffset0(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printOffset1(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, @@ -84,6 +88,10 @@ private: const MCSubtargetInfo &STI, raw_ostream &O); void printExpVM(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printDFMT(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printNFMT(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printRegOperand(unsigned RegNo, raw_ostream &O); void printVOPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp index 0a9c2b94c1eee..2b408ff10caae 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -30,14 +30,9 @@ public: unsigned getNumFixupKinds() const override { return AMDGPU::NumTargetFixupKinds; }; - void processFixupValue(const MCAssembler &Asm, - const MCAsmLayout &Layout, - const MCFixup &Fixup, const MCFragment *DF, - const MCValue &Target, uint64_t &Value, - bool &IsResolved) override; - - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel, MCContext &Ctx) const override; + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const override; bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, const MCRelaxableFragment *DF, const MCAsmLayout &Layout) const override { @@ -102,36 +97,11 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, } } -void AMDGPUAsmBackend::processFixupValue(const MCAssembler &Asm, - const MCAsmLayout &Layout, - const MCFixup &Fixup, const MCFragment *DF, - const MCValue &Target, uint64_t &Value, - bool &IsResolved) { - MCValue Res; - - // When we have complex expressions like: BB0_1 + (BB0_2 - 4), which are - // used for long branches, this function will be called with - // IsResolved = false and Value set to some pre-computed value. In - // the example above, the value would be: - // (BB0_1 + (BB0_2 - 4)) - CurrentOffsetFromStartOfFunction. - // This is not what we want. We just want the expression computation - // only. The reason the MC layer subtracts the current offset from the - // expression is because the fixup is of kind FK_PCRel_4. - // For these scenarios, evaluateAsValue gives us the computation that we - // want. - if (!IsResolved && Fixup.getValue()->evaluateAsValue(Res, Layout) && - Res.isAbsolute()) { - Value = Res.getConstant(); - IsResolved = true; - - } - if (IsResolved) - Value = adjustFixupValue(Fixup, Value, &Asm.getContext()); -} - -void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, - bool IsPCRel, MCContext &Ctx) const { +void AMDGPUAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, + MutableArrayRef<char> Data, uint64_t Value, + bool IsPCRel) const { + Value = adjustFixupValue(Fixup, Value, &Asm.getContext()); if (!Value) return; // Doesn't change encoding. 
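The byte loop in the hunk below amounts to OR-ing the already-adjusted fixup value into the bytes the fixup touches; a minimal standalone sketch under that assumption (hypothetical helper, little-endian, not the backend's actual code):

#include <cstddef>
#include <cstdint>

static void patchBytes(uint8_t *Data, size_t Offset, unsigned NumBytes,
                       uint64_t Value) {
  // Mask each byte of the fixup value into the fragment, least significant
  // byte first.
  for (unsigned I = 0; I != NumBytes; ++I)
    Data[Offset + I] |= uint8_t((Value >> (I * 8)) & 0xff);
}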
@@ -142,7 +112,7 @@ void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); uint32_t Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); + assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the bits from // the fixup value. diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h index a856b17a228f0..1b062064ace1c 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h @@ -52,15 +52,15 @@ public: return 0; } - virtual unsigned getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { + virtual unsigned getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { return 0; } - virtual unsigned getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { + virtual unsigned getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { return 0; } diff --git a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp index e02acf516c0db..376c9bfe5ccf2 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -70,13 +70,13 @@ public: SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const override; - unsigned getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const override; + unsigned getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; - unsigned getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const override; + unsigned getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; }; } // end anonymous namespace @@ -252,9 +252,7 @@ uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO, case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: { uint16_t Lo16 = static_cast<uint16_t>(Imm); - assert(Lo16 == static_cast<uint16_t>(Imm >> 16)); uint32_t Encoding = getLit16Encoding(Lo16, STI); - assert(Encoding != 255 && "packed constants can only be inline immediates"); return Encoding; } default: @@ -328,11 +326,11 @@ unsigned SIMCCodeEmitter::getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, } unsigned -SIMCCodeEmitter::getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { +SIMCCodeEmitter::getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { using namespace AMDGPU::SDWA; - + uint64_t RegEnc = 0; const MCOperand &MO = MI.getOperand(OpNo); @@ -347,9 +345,9 @@ SIMCCodeEmitter::getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo, } unsigned -SIMCCodeEmitter::getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { 
+SIMCCodeEmitter::getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { using namespace AMDGPU::SDWA; uint64_t RegEnc = 0; @@ -365,6 +363,25 @@ SIMCCodeEmitter::getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo, return RegEnc; } +static bool needsPCRel(const MCExpr *Expr) { + switch (Expr->getKind()) { + case MCExpr::SymbolRef: + return true; + case MCExpr::Binary: { + auto *BE = cast<MCBinaryExpr>(Expr); + if (BE->getOpcode() == MCBinaryExpr::Sub) + return false; + return needsPCRel(BE->getLHS()) || needsPCRel(BE->getRHS()); + } + case MCExpr::Unary: + return needsPCRel(cast<MCUnaryExpr>(Expr)->getSubExpr()); + case MCExpr::Target: + case MCExpr::Constant: + return false; + } + llvm_unreachable("invalid kind"); +} + uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO, SmallVectorImpl<MCFixup> &Fixups, @@ -373,12 +390,21 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, return MRI.getEncodingValue(MO.getReg()); if (MO.isExpr() && MO.getExpr()->getKind() != MCExpr::Constant) { - const auto *Expr = dyn_cast<MCSymbolRefExpr>(MO.getExpr()); + // FIXME: Whether this expression is PCRel or not should not depend on what + // the expression looks like. Given that this is just a general expression, + // it should probably be FK_Data_4 and whatever is producing + // + // s_add_u32 s2, s2, (extern_const_addrspace+16 + // + // And expecting a PCRel should instead produce + // + // .Ltmp1: + // s_add_u32 s2, s2, (extern_const_addrspace+16)-.Ltmp1 MCFixupKind Kind; - if (Expr && Expr->getSymbol().isExternal()) - Kind = FK_Data_4; - else + if (needsPCRel(MO.getExpr())) Kind = FK_PCRel_4; + else + Kind = FK_Data_4; Fixups.push_back(MCFixup::create(4, MO.getExpr(), Kind, MI.getLoc())); } diff --git a/lib/Target/AMDGPU/Processors.td b/lib/Target/AMDGPU/Processors.td index f6f2582aa11b3..d30d1d382588c 100644 --- a/lib/Target/AMDGPU/Processors.td +++ b/lib/Target/AMDGPU/Processors.td @@ -80,7 +80,7 @@ def : Proc<"cayman", R600_VLIW4_Itin, // Southern Islands //===----------------------------------------------------------------------===// -def : ProcessorModel<"gfx600", SIFullSpeedModel, +def : ProcessorModel<"gfx600", SIFullSpeedModel, [FeatureISAVersion6_0_0]>; def : ProcessorModel<"SI", SIFullSpeedModel, @@ -95,7 +95,7 @@ def : ProcessorModel<"gfx601", SIQuarterSpeedModel, [FeatureISAVersion6_0_1] >; -def : ProcessorModel<"pitcairn", SIQuarterSpeedModel, +def : ProcessorModel<"pitcairn", SIQuarterSpeedModel, [FeatureISAVersion6_0_1]>; def : ProcessorModel<"verde", SIQuarterSpeedModel, diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp index c55878f8bff0f..215791f4f92dd 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -584,23 +584,23 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const return LowerImplicitParameter(DAG, VT, DL, 8); case Intrinsic::r600_read_tgid_x: - return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T1_X, VT); + return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T1_X, VT); case Intrinsic::r600_read_tgid_y: - return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T1_Y, VT); + return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T1_Y, VT); case Intrinsic::r600_read_tgid_z: - return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T1_Z, VT); +
return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T1_Z, VT); case Intrinsic::r600_read_tidig_x: - return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T0_X, VT); + return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T0_X, VT); case Intrinsic::r600_read_tidig_y: - return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T0_Y, VT); + return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T0_Y, VT); case Intrinsic::r600_read_tidig_z: - return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T0_Z, VT); + return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T0_Z, VT); case Intrinsic::r600_recipsqrt_ieee: return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h index 5cd90323ff67b..3915c0e5bdbed 100644 --- a/lib/Target/AMDGPU/SIDefines.h +++ b/lib/Target/AMDGPU/SIDefines.h @@ -118,9 +118,9 @@ namespace AMDGPU { // Operand for source modifiers for VOP instructions OPERAND_INPUT_MODS, - // Operand for GFX9 SDWA instructions - OPERAND_SDWA9_SRC, - OPERAND_SDWA9_VOPC_DST, + // Operand for SDWA instructions + OPERAND_SDWA_SRC, + OPERAND_SDWA_VOPC_DST, /// Operand with 32-bit immediate that uses the constant bus. OPERAND_KIMM32, diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 5f5f25103c027..0a795c99f94e5 100644 --- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -174,6 +174,31 @@ static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC, return TRI.isSGPRClass(SrcRC) && TRI.hasVGPRs(DstRC); } +static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI, + const SIRegisterInfo *TRI, + const SIInstrInfo *TII) { + MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + auto &Src = MI.getOperand(1); + unsigned DstReg = MI.getOperand(0).getReg(); + unsigned SrcReg = Src.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || + !TargetRegisterInfo::isVirtualRegister(DstReg)) + return false; + + for (const auto &MO : MRI.reg_nodbg_operands(DstReg)) { + const auto *UseMI = MO.getParent(); + if (UseMI == &MI) + continue; + if (MO.isDef() || UseMI->getParent() != MI.getParent() || + UseMI->getOpcode() <= TargetOpcode::GENERIC_OP_END || + !TII->isOperandLegal(*UseMI, UseMI->getOperandNo(&MO), &Src)) + return false; + } + // Change VGPR to SGPR destination. + MRI.setRegClass(DstReg, TRI->getEquivalentSGPRClass(MRI.getRegClass(DstReg))); + return true; +} + // Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE. // // SGPRx = ... @@ -214,6 +239,9 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) return false; + if (tryChangeVGPRtoSGPRinCopy(CopyUse, TRI, TII)) + return true; + // TODO: Could have multiple extracts? 
unsigned SubReg = CopyUse.getOperand(1).getSubReg(); if (SubReg != AMDGPU::NoSubRegister) @@ -563,6 +591,8 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { break; } TII->moveToVALU(MI); + } else if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) { + tryChangeVGPRtoSGPRinCopy(MI, TRI, TII); } break; diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp index e10f1ed3762e8..f391f67a241f1 100644 --- a/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -13,6 +13,7 @@ #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" +#include "llvm/ADT/DepthFirstIterator.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -166,6 +167,8 @@ static bool updateOperand(FoldCandidate &Fold, if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) && TargetRegisterInfo::isVirtualRegister(New->getReg())) { Old.substVirtReg(New->getReg(), New->getSubReg(), TRI); + + Old.setIsUndef(New->isUndef()); return true; } @@ -470,7 +473,7 @@ static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI, return &Op; MachineInstr *Def = MRI.getVRegDef(Op.getReg()); - if (Def->isMoveImmediate()) { + if (Def && Def->isMoveImmediate()) { MachineOperand &ImmSrc = Def->getOperand(1); if (ImmSrc.isImm()) return &ImmSrc; @@ -921,12 +924,9 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { // level. bool IsIEEEMode = ST->enableIEEEBit(MF) || !MFI->hasNoSignedZerosFPMath(); - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - - MachineBasicBlock &MBB = *BI; + for (MachineBasicBlock *MBB : depth_first(&MF)) { MachineBasicBlock::iterator I, Next; - for (I = MBB.begin(); I != MBB.end(); I = Next) { + for (I = MBB->begin(); I != MBB->end(); I = Next) { Next = std::next(I); MachineInstr &MI = *I; diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp index b1bd14e421f02..08a64de385018 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -284,7 +284,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister; - if (ST.isAmdCodeObjectV2(MF) || ST.isMesaGfxShader(MF)) { + if (ST.isAmdCodeObjectV2(MF)) { PreloadedPrivateBufferReg = TRI->getPreloadedValue( MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); } @@ -363,14 +363,14 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, // Use relocations to get the pointer, and setup the other bits manually. 
uint64_t Rsrc23 = TII->getScratchRsrcWords23(); - if (MFI->hasPrivateMemoryInputPtr()) { + if (MFI->hasImplicitBufferPtr()) { unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) { const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64); BuildMI(MBB, I, DL, Mov64, Rsrc01) - .addReg(PreloadedPrivateBufferReg) + .addReg(MFI->getImplicitBufferPtrUserSGPR()) .addReg(ScratchRsrcReg, RegState::ImplicitDefine); } else { const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM); @@ -385,7 +385,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, MachineMemOperand::MODereferenceable, 0, 0); BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01) - .addReg(PreloadedPrivateBufferReg) + .addReg(MFI->getImplicitBufferPtrUserSGPR()) .addImm(0) // offset .addImm(0) // glc .addMemOperand(MMO) @@ -417,14 +417,69 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, void SIFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { - const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - if (MFI->isEntryFunction()) + const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); + if (FuncInfo->isEntryFunction()) { emitEntryFunctionPrologue(MF, MBB); + return; + } + + const MachineFrameInfo &MFI = MF.getFrameInfo(); + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const SIInstrInfo *TII = ST.getInstrInfo(); + + unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg(); + unsigned FramePtrReg = FuncInfo->getFrameOffsetReg(); + + MachineBasicBlock::iterator MBBI = MBB.begin(); + DebugLoc DL; + + bool NeedFP = hasFP(MF); + if (NeedFP) { + // If we need a base pointer, set it up here. It's whatever the value of + // the stack pointer is at this point. Any variable size objects will be + // allocated after this, so we can still use the base pointer to reference + // locals. + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg) + .addReg(StackPtrReg) + .setMIFlag(MachineInstr::FrameSetup); + } + + uint32_t NumBytes = MFI.getStackSize(); + if (NumBytes != 0 && hasSP(MF)) { + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg) + .addReg(StackPtrReg) + .addImm(NumBytes * ST.getWavefrontSize()) + .setMIFlag(MachineInstr::FrameSetup); + } } void SIFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { + const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); + if (FuncInfo->isEntryFunction()) + return; + unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg(); + if (StackPtrReg == AMDGPU::NoRegister) + return; + + const MachineFrameInfo &MFI = MF.getFrameInfo(); + uint32_t NumBytes = MFI.getStackSize(); + + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const SIInstrInfo *TII = ST.getInstrInfo(); + MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); + DebugLoc DL; + + // FIXME: Clarify the distinction between having no SP set and having one. + // For callee functions, it's really whether we need SP to be accurate or not. 
+ + if (NumBytes != 0 && hasSP(MF)) { + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg) + .addReg(StackPtrReg) + .addImm(NumBytes * ST.getWavefrontSize()) + .setMIFlag(MachineInstr::FrameDestroy); + } } static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) { @@ -557,3 +612,19 @@ void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF, WorkItemIDObjectIdx, &AMDGPU::VGPR_32RegClass, TRI); } } + +bool SIFrameLowering::hasFP(const MachineFunction &MF) const { + // All stack operations are relative to the frame offset SGPR. + // TODO: Still want to eliminate sometimes. + const MachineFrameInfo &MFI = MF.getFrameInfo(); + + // XXX - Is this only called after frame is finalized? Should be able to check + // frame size. + return MFI.hasStackObjects() && !allStackObjectsAreDead(MFI); +} + +bool SIFrameLowering::hasSP(const MachineFunction &MF) const { + // All stack operations are relative to the frame offset SGPR. + const MachineFrameInfo &MFI = MF.getFrameInfo(); + return MFI.hasCalls() || MFI.hasVarSizedObjects(); +} diff --git a/lib/Target/AMDGPU/SIFrameLowering.h b/lib/Target/AMDGPU/SIFrameLowering.h index e17adbe273614..d4dfa1c7eaa86 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.h +++ b/lib/Target/AMDGPU/SIFrameLowering.h @@ -60,6 +60,10 @@ private: /// \brief Emits debugger prologue. void emitDebuggerPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const; + +public: + bool hasFP(const MachineFunction &MF) const override; + bool hasSP(const MachineFunction &MF) const; }; } // end namespace llvm diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 441f1ef4bd04c..d0f4e00994de1 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -211,6 +211,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::UADDO, MVT::i32, Legal); setOperationAction(ISD::USUBO, MVT::i32, Legal); + setOperationAction(ISD::ADDCARRY, MVT::i32, Legal); + setOperationAction(ISD::SUBCARRY, MVT::i32, Legal); + // We only support LOAD/STORE and vector manipulation ops for vectors // with > 4 elements. for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, @@ -471,6 +474,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT, VT, Custom); } + setTargetDAGCombine(ISD::ADD); + setTargetDAGCombine(ISD::ADDCARRY); + setTargetDAGCombine(ISD::SUB); + setTargetDAGCombine(ISD::SUBCARRY); setTargetDAGCombine(ISD::FADD); setTargetDAGCombine(ISD::FSUB); setTargetDAGCombine(ISD::FMINNUM); @@ -1061,10 +1068,10 @@ static void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) { - if (Info.hasPrivateMemoryInputPtr()) { - unsigned PrivateMemoryPtrReg = Info.addPrivateMemoryPtr(TRI); - MF.addLiveIn(PrivateMemoryPtrReg, &AMDGPU::SGPR_64RegClass); - CCInfo.AllocateReg(PrivateMemoryPtrReg); + if (Info.hasImplicitBufferPtr()) { + unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI); + MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass); + CCInfo.AllocateReg(ImplicitBufferPtrReg); } // FIXME: How should these inputs interact with inreg / custom SGPR inputs? 
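The non-entry prologue and epilogue above adjust the stack pointer by NumBytes * ST.getWavefrontSize() rather than by the raw frame size: scratch is allocated per lane but stepped through a single per-wave offset, so each frame contributes once per lane in the wave. A standalone sketch of that arithmetic (illustrative only; the 64-lane example is an assumption about the subtarget, not something the patch states):

    #include <cstdint>

    // A wave entering a frame advances SP by the per-lane frame size times
    // the lane count, matching the S_ADD_U32/S_SUB_U32 immediates above.
    uint32_t scaledStackAdjustment(uint32_t FrameBytes, uint32_t WavefrontSize) {
      return FrameBytes * WavefrontSize;
    }

    // Example: a 16-byte frame on a 64-lane wave moves SP by 1024 bytes.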
@@ -1227,7 +1234,7 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM, } } - if (NeedSP){ + if (NeedSP) { unsigned ReservedStackPtrOffsetReg = TRI.reservedStackPtrOffsetReg(MF); Info.setStackPtrOffsetReg(ReservedStackPtrOffsetReg); @@ -2998,7 +3005,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, switch (IntrinsicID) { case Intrinsic::amdgcn_implicit_buffer_ptr: { - unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); + if (getSubtarget()->isAmdCodeObjectV2(MF)) + return emitNonHSAIntrinsicError(DAG, DL, VT); + + unsigned Reg = TRI->getPreloadedValue(MF, + SIRegisterInfo::IMPLICIT_BUFFER_PTR); return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT); } case Intrinsic::amdgcn_dispatch_ptr: @@ -3288,6 +3299,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const { unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); SDLoc DL(Op); + MachineFunction &MF = DAG.getMachineFunction(); + switch (IntrID) { case Intrinsic::amdgcn_atomic_inc: case Intrinsic::amdgcn_atomic_dec: { @@ -3313,7 +3326,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(5), // glc Op.getOperand(6) // slc }; - MachineFunction &MF = DAG.getMachineFunction(); SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ? @@ -3328,6 +3340,29 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, MMO); } + case Intrinsic::amdgcn_tbuffer_load: { + SDValue Ops[] = { + Op.getOperand(0), // Chain + Op.getOperand(2), // rsrc + Op.getOperand(3), // vindex + Op.getOperand(4), // voffset + Op.getOperand(5), // soffset + Op.getOperand(6), // offset + Op.getOperand(7), // dfmt + Op.getOperand(8), // nfmt + Op.getOperand(9), // glc + Op.getOperand(10) // slc + }; + + EVT VT = Op.getOperand(2).getValueType(); + + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOLoad, + VT.getStoreSize(), VT.getStoreSize()); + return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, + Op->getVTList(), Ops, VT, MMO); + } // Basic sample. 
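The amdgcn_tbuffer_load lowering just above follows the standard memory-intrinsic recipe: gather the operands in the order the target node expects, then attach a MachineMemOperand so later passes can see the size and kind of the access even though the address is opaque. A condensed sketch of that recipe, with illustrative names (not code from the patch):

    #include "llvm/CodeGen/SelectionDAG.h"

    // Wrap a target memory intrinsic in a MemIntrinsicNode whose MMO records
    // only the access kind and width (the pointer itself is unknown here).
    static SDValue buildMemIntrinsic(SelectionDAG &DAG, const SDLoc &DL,
                                     unsigned Opc, SDVTList VTs,
                                     ArrayRef<SDValue> Ops, EVT MemVT,
                                     MachineFunction &MF) {
      MachineMemOperand *MMO = MF.getMachineMemOperand(
          MachinePointerInfo(), MachineMemOperand::MOLoad,
          MemVT.getStoreSize(), MemVT.getStoreSize());
      return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
    }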
case Intrinsic::amdgcn_image_sample: case Intrinsic::amdgcn_image_sample_cl: @@ -3393,10 +3428,10 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const { - MachineFunction &MF = DAG.getMachineFunction(); SDLoc DL(Op); SDValue Chain = Op.getOperand(0); unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + MachineFunction &MF = DAG.getMachineFunction(); switch (IntrinsicID) { case Intrinsic::amdgcn_exp: { @@ -3463,33 +3498,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain, Op.getOperand(2), Op.getOperand(3)); } - case AMDGPUIntrinsic::SI_tbuffer_store: { - SDValue Ops[] = { - Chain, - Op.getOperand(2), - Op.getOperand(3), - Op.getOperand(4), - Op.getOperand(5), - Op.getOperand(6), - Op.getOperand(7), - Op.getOperand(8), - Op.getOperand(9), - Op.getOperand(10), - Op.getOperand(11), - Op.getOperand(12), - Op.getOperand(13), - Op.getOperand(14) - }; - - EVT VT = Op.getOperand(3).getValueType(); - - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOStore, - VT.getStoreSize(), 4); - return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, - Op->getVTList(), Ops, VT, MMO); - } case AMDGPUIntrinsic::AMDGPU_kill: { SDValue Src = Op.getOperand(2); if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Src)) { @@ -3505,7 +3513,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, } case Intrinsic::amdgcn_s_barrier: { if (getTargetMachine().getOptLevel() > CodeGenOpt::None) { - const MachineFunction &MF = DAG.getMachineFunction(); const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); unsigned WGSize = ST.getFlatWorkGroupSizes(*MF.getFunction()).second; if (WGSize <= ST.getWavefrontSize()) @@ -3514,6 +3521,75 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, } return SDValue(); }; + case AMDGPUIntrinsic::SI_tbuffer_store: { + + // Extract vindex and voffset from vaddr as appropriate + const ConstantSDNode *OffEn = cast<ConstantSDNode>(Op.getOperand(10)); + const ConstantSDNode *IdxEn = cast<ConstantSDNode>(Op.getOperand(11)); + SDValue VAddr = Op.getOperand(5); + + SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32); + + assert(!(OffEn->isOne() && IdxEn->isOne()) && + "Legacy intrinsic doesn't support both offset and index - use new version"); + + SDValue VIndex = IdxEn->isOne() ? VAddr : Zero; + SDValue VOffset = OffEn->isOne() ? VAddr : Zero; + + // Deal with the vec-3 case + const ConstantSDNode *NumChannels = cast<ConstantSDNode>(Op.getOperand(4)); + auto Opcode = NumChannels->getZExtValue() == 3 ? 
+ AMDGPUISD::TBUFFER_STORE_FORMAT_X3 : AMDGPUISD::TBUFFER_STORE_FORMAT; + + SDValue Ops[] = { + Chain, + Op.getOperand(3), // vdata + Op.getOperand(2), // rsrc + VIndex, + VOffset, + Op.getOperand(6), // soffset + Op.getOperand(7), // inst_offset + Op.getOperand(8), // dfmt + Op.getOperand(9), // nfmt + Op.getOperand(12), // glc + Op.getOperand(13), // slc + }; + + assert((cast<ConstantSDNode>(Op.getOperand(14)))->getZExtValue() == 0 && + "Value of tfe other than zero is unsupported"); + + EVT VT = Op.getOperand(3).getValueType(); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOStore, + VT.getStoreSize(), 4); + return DAG.getMemIntrinsicNode(Opcode, DL, + Op->getVTList(), Ops, VT, MMO); + } + + case Intrinsic::amdgcn_tbuffer_store: { + SDValue Ops[] = { + Chain, + Op.getOperand(2), // vdata + Op.getOperand(3), // rsrc + Op.getOperand(4), // vindex + Op.getOperand(5), // voffset + Op.getOperand(6), // soffset + Op.getOperand(7), // offset + Op.getOperand(8), // dfmt + Op.getOperand(9), // nfmt + Op.getOperand(10), // glc + Op.getOperand(11) // slc + }; + EVT VT = Op.getOperand(3).getValueType(); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOStore, + VT.getStoreSize(), 4); + return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, + Op->getVTList(), Ops, VT, MMO); + } + default: return Op; } @@ -4839,6 +4915,103 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, return 0; } +SDValue SITargetLowering::performAddCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + + if (VT != MVT::i32) + return SDValue(); + + SDLoc SL(N); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + // add x, zext (setcc) => addcarry x, 0, setcc + // add x, sext (setcc) => subcarry x, 0, setcc + unsigned Opc = LHS.getOpcode(); + if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND || + Opc == ISD::ANY_EXTEND || Opc == ISD::ADDCARRY) + std::swap(RHS, LHS); + + Opc = RHS.getOpcode(); + switch (Opc) { + default: break; + case ISD::ZERO_EXTEND: + case ISD::SIGN_EXTEND: + case ISD::ANY_EXTEND: { + auto Cond = RHS.getOperand(0); + if (Cond.getOpcode() != ISD::SETCC && + Cond.getOpcode() != AMDGPUISD::FP_CLASS) + break; + SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1); + SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond }; + Opc = (Opc == ISD::SIGN_EXTEND) ? 
ISD::SUBCARRY : ISD::ADDCARRY; + return DAG.getNode(Opc, SL, VTList, Args); + } + case ISD::ADDCARRY: { + // add x, (addcarry y, 0, cc) => addcarry x, y, cc + auto C = dyn_cast<ConstantSDNode>(RHS.getOperand(1)); + if (!C || C->getZExtValue() != 0) break; + SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) }; + return DAG.getNode(ISD::ADDCARRY, SDLoc(N), RHS->getVTList(), Args); + } + } + return SDValue(); +} + +SDValue SITargetLowering::performSubCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + + if (VT != MVT::i32) + return SDValue(); + + SDLoc SL(N); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + unsigned Opc = LHS.getOpcode(); + if (Opc != ISD::SUBCARRY) + std::swap(RHS, LHS); + + if (LHS.getOpcode() == ISD::SUBCARRY) { + // sub (subcarry x, 0, cc), y => subcarry x, y, cc + auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1)); + if (!C || C->getZExtValue() != 0) + return SDValue(); + SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) }; + return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args); + } + return SDValue(); +} + +SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + + if (N->getValueType(0) != MVT::i32) + return SDValue(); + + auto C = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (!C || C->getZExtValue() != 0) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDValue LHS = N->getOperand(0); + + // addcarry (add x, y), 0, cc => addcarry x, y, cc + // subcarry (sub x, y), 0, cc => subcarry x, y, cc + unsigned LHSOpc = LHS.getOpcode(); + unsigned Opc = N->getOpcode(); + if ((LHSOpc == ISD::ADD && Opc == ISD::ADDCARRY) || + (LHSOpc == ISD::SUB && Opc == ISD::SUBCARRY)) { + SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) }; + return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args); + } + return SDValue(); +} + SDValue SITargetLowering::performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const { if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) @@ -5009,6 +5182,13 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, switch (N->getOpcode()) { default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); + case ISD::ADD: + return performAddCombine(N, DCI); + case ISD::SUB: + return performSubCombine(N, DCI); + case ISD::ADDCARRY: + case ISD::SUBCARRY: + return performAddCarrySubCarryCombine(N, DCI); case ISD::FADD: return performFAddCombine(N, DCI); case ISD::FSUB: @@ -5425,15 +5605,6 @@ MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL, return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); } -SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG, - const TargetRegisterClass *RC, - unsigned Reg, EVT VT) const { - SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT); - - return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()), - cast<RegisterSDNode>(VReg)->getReg(), VT); -} - //===----------------------------------------------------------------------===// // SI Inline Assembly Support //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index 8e2ec40b224cd..24f88e632d38e 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -108,6 +108,9 @@ class SITargetLowering final : public AMDGPUTargetLowering { unsigned getFusedOpcode(const 
SelectionDAG &DAG, const SDNode *N0, const SDNode *N1) const; + SDValue performAddCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performAddCarrySubCarryCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performSubCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFSubCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const; @@ -216,8 +219,6 @@ public: void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override; - SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, - unsigned Reg, EVT VT) const override; SDNode *legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const; MachineSDNode *wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index 1097814e99ce2..c9b48fea7225e 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2108,7 +2108,9 @@ bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const { - if (!MO.isImm() || OperandType < MCOI::OPERAND_FIRST_TARGET) + if (!MO.isImm() || + OperandType < AMDGPU::OPERAND_SRC_FIRST || + OperandType > AMDGPU::OPERAND_SRC_LAST) return false; // MachineOperand provides no way to tell the true operand size, since it only @@ -2433,8 +2435,73 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } + // Verify SDWA + if (isSDWA(MI)) { + + if (!ST.hasSDWA()) { + ErrInfo = "SDWA is not supported on this target"; + return false; + } + + int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); + if (DstIdx == -1) + DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::sdst); + + const int OpIndices[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx }; + + for (int OpIdx : OpIndices) { + if (OpIdx == -1) + continue; + const MachineOperand &MO = MI.getOperand(OpIdx); + + if (!ST.hasSDWAScalar()) { + // Only VGPRs on VI + if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) { + ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI"; + return false; + } + } else { + // No immediates on GFX9 + if (!MO.isReg()) { + ErrInfo = "Only reg allowed as operands in SDWA instructions on GFX9"; + return false; + } + } + } + + if (!ST.hasSDWAOmod()) { + // No omod allowed on VI + const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); + if (OMod != nullptr && + (!OMod->isImm() || OMod->getImm() != 0)) { + ErrInfo = "OMod not allowed in SDWA instructions on VI"; + return false; + } + } + + uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode); + if (isVOPC(BasicOpcode)) { + if (!ST.hasSDWASdst() && DstIdx != -1) { + // Only vcc allowed as dst on VI for VOPC + const MachineOperand &Dst = MI.getOperand(DstIdx); + if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) { + ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI"; + return false; + } + } else if (!ST.hasSDWAClampVOPC()) { + // No clamp allowed on GFX9 for VOPC + const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); + if (Clamp != nullptr && + (!Clamp->isImm() || Clamp->getImm() != 0)) { + ErrInfo = "Clamp not allowed in VOPC SDWA instructions on GFX9"; + return false; + } + } + } + } + // Verify VOP* - if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI)) { + if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) 
|| isVOPC(MI) || isSDWA(MI)) { // Only look at the true operands. Only a real operand can use the constant // bus, and we don't want to check pseudo-operands like the source modifier // flags. diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h index f6e5e8883f63c..74b48c7618087 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.h +++ b/lib/Target/AMDGPU/SIInstrInfo.h @@ -814,6 +814,9 @@ namespace AMDGPU { int getSDWAOp(uint16_t Opcode); LLVM_READONLY + int getBasicFromSDWAOp(uint16_t Opcode); + + LLVM_READONLY int getCommuteRev(uint16_t Opcode); LLVM_READONLY diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index 470a47b024433..3b4a8b5d1e817 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -20,6 +20,8 @@ def SIEncodingFamily { int NONE = -1; int SI = 0; int VI = 1; + int SDWA = 2; + int SDWA9 = 3; } //===----------------------------------------------------------------------===// @@ -39,25 +41,41 @@ def SIatomic_dec : SDNode<"AMDGPUISD::ATOMIC_DEC", SDTAtomic2, [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain] >; -def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", - SDTypeProfile<0, 13, - [SDTCisVT<0, v4i32>, // rsrc(SGPR) - SDTCisVT<1, iAny>, // vdata(VGPR) - SDTCisVT<2, i32>, // num_channels(imm) - SDTCisVT<3, i32>, // vaddr(VGPR) +def SItbuffer_load : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT", + SDTypeProfile<1, 9, + [ // vdata + SDTCisVT<1, v4i32>, // rsrc + SDTCisVT<2, i32>, // vindex(VGPR) + SDTCisVT<3, i32>, // voffset(VGPR) SDTCisVT<4, i32>, // soffset(SGPR) - SDTCisVT<5, i32>, // inst_offset(imm) + SDTCisVT<5, i32>, // offset(imm) SDTCisVT<6, i32>, // dfmt(imm) SDTCisVT<7, i32>, // nfmt(imm) - SDTCisVT<8, i32>, // offen(imm) - SDTCisVT<9, i32>, // idxen(imm) - SDTCisVT<10, i32>, // glc(imm) - SDTCisVT<11, i32>, // slc(imm) - SDTCisVT<12, i32> // tfe(imm) + SDTCisVT<8, i32>, // glc(imm) + SDTCisVT<9, i32> // slc(imm) ]>, - [SDNPMayStore, SDNPMemOperand, SDNPHasChain] + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain] >; +def SDTtbuffer_store : SDTypeProfile<0, 10, + [ // vdata + SDTCisVT<1, v4i32>, // rsrc + SDTCisVT<2, i32>, // vindex(VGPR) + SDTCisVT<3, i32>, // voffset(VGPR) + SDTCisVT<4, i32>, // soffset(SGPR) + SDTCisVT<5, i32>, // offset(imm) + SDTCisVT<6, i32>, // dfmt(imm) + SDTCisVT<7, i32>, // nfmt(imm) + SDTCisVT<8, i32>, // glc(imm) + SDTCisVT<9, i32> // slc(imm) + ]>; + +def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", SDTtbuffer_store, + [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; +def SItbuffer_store_x3 : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT_X3", + SDTtbuffer_store, + [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; + def SDTBufferLoad : SDTypeProfile<1, 5, [ // vdata SDTCisVT<1, v4i32>, // rsrc @@ -452,25 +470,25 @@ def ExpSrc3 : RegisterOperand<VGPR_32> { let ParserMatchClass = VReg32OrOffClass; } -class SDWA9Src : RegisterOperand<VS_32> { +class SDWASrc : RegisterOperand<VS_32> { let OperandNamespace = "AMDGPU"; - let OperandType = "OPERAND_SDWA9_SRC"; - let EncoderMethod = "getSDWA9SrcEncoding"; + let OperandType = "OPERAND_SDWA_SRC"; + let EncoderMethod = "getSDWASrcEncoding"; } -def SDWA9Src32 : SDWA9Src { - let DecoderMethod = "decodeSDWA9Src32"; +def SDWASrc32 : SDWASrc { + let DecoderMethod = "decodeSDWASrc32"; } -def SDWA9Src16 : SDWA9Src { - let DecoderMethod = "decodeSDWA9Src16"; +def SDWASrc16 : SDWASrc { + let DecoderMethod = "decodeSDWASrc16"; } -def SDWA9VopcDst : VOPDstOperand<SReg_64> { +def SDWAVopcDst : VOPDstOperand<SReg_64> 
{ let OperandNamespace = "AMDGPU"; - let OperandType = "OPERAND_SDWA9_VOPC_DST"; - let EncoderMethod = "getSDWA9VopcDstEncoding"; - let DecoderMethod = "decodeSDWA9VopcDst"; + let OperandType = "OPERAND_SDWA_VOPC_DST"; + let EncoderMethod = "getSDWAVopcDstEncoding"; + let DecoderMethod = "decodeSDWAVopcDst"; } class NamedMatchClass<string CName, bit Optional = 1> : AsmOperandClass { @@ -525,7 +543,7 @@ def idxen : NamedOperandBit<"Idxen", NamedMatchClass<"Idxen">>; def addr64 : NamedOperandBit<"Addr64", NamedMatchClass<"Addr64">>; def offset_u12 : NamedOperandU12<"Offset", NamedMatchClass<"OffsetU12">>; -def offset_s13 : NamedOperandS13<"Offset", NamedMatchClass<"OffsetS13">>; +def offset_s13 : NamedOperandS13<"OffsetS13", NamedMatchClass<"OffsetS13">>; def offset : NamedOperandU16<"Offset", NamedMatchClass<"Offset">>; def offset0 : NamedOperandU8<"Offset0", NamedMatchClass<"Offset0">>; def offset1 : NamedOperandU8<"Offset1", NamedMatchClass<"Offset1">>; @@ -545,6 +563,9 @@ def lwe : NamedOperandBit<"LWE", NamedMatchClass<"LWE">>; def exp_compr : NamedOperandBit<"ExpCompr", NamedMatchClass<"ExpCompr">>; def exp_vm : NamedOperandBit<"ExpVM", NamedMatchClass<"ExpVM">>; +def DFMT : NamedOperandU8<"DFMT", NamedMatchClass<"DFMT">>; +def NFMT : NamedOperandU8<"NFMT", NamedMatchClass<"NFMT">>; + def dmask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>; def dpp_ctrl : NamedOperandU32<"DPPCtrl", NamedMatchClass<"DPPCtrl", 0>>; @@ -634,13 +655,13 @@ class IntInputMods <IntInputModsMatchClass matchClass> : InputMods <matchClass> def Int32InputMods : IntInputMods<Int32InputModsMatchClass>; def Int64InputMods : IntInputMods<Int64InputModsMatchClass>; -def FPRegInputModsMatchClass : AsmOperandClass { - let Name = "RegWithFPInputMods"; +def FPRegSDWAInputModsMatchClass : AsmOperandClass { + let Name = "SDWARegWithFPInputMods"; let ParserMethod = "parseRegWithFPInputMods"; - let PredicateMethod = "isRegKind"; + let PredicateMethod = "isSDWARegKind"; } -def FPRegInputMods : InputMods <FPRegInputModsMatchClass> { +def FPRegSDWAInputMods : InputMods <FPRegSDWAInputModsMatchClass> { let PrintMethod = "printOperandAndFPInputMods"; } @@ -655,13 +676,13 @@ def FPVRegInputMods : InputMods <FPVRegInputModsMatchClass> { } -def IntRegInputModsMatchClass : AsmOperandClass { - let Name = "RegWithIntInputMods"; +def IntRegSDWAInputModsMatchClass : AsmOperandClass { + let Name = "SDWARegWithIntInputMods"; let ParserMethod = "parseRegWithIntInputMods"; - let PredicateMethod = "isRegKind"; + let PredicateMethod = "isSDWARegKind"; } -def IntRegInputMods : InputMods <IntRegInputModsMatchClass> { +def IntRegSDWAInputMods : InputMods <IntRegSDWAInputModsMatchClass> { let PrintMethod = "printOperandAndIntInputMods"; } @@ -851,10 +872,10 @@ class getVALUDstForVT<ValueType VT> { } // Returns the register class to use for the destination of VOP[12C] -// instructions with GFX9 SDWA extension -class getSDWA9DstForVT<ValueType VT> { +// instructions with SDWA extension +class getSDWADstForVT<ValueType VT> { RegisterOperand ret = !if(!eq(VT.Size, 1), - SDWA9VopcDst, // VOPC + SDWAVopcDst, // VOPC VOPDstOperand<VGPR_32>); // VOP1/2 32-bit dst } @@ -898,8 +919,8 @@ class getVregSrcForVT<ValueType VT> { !if(!eq(VT.Size, 64), VReg_64, VGPR_32)); } -class getSDWA9SrcForVT <ValueType VT> { - RegisterOperand ret = !if(!eq(VT.Size, 16), SDWA9Src16, SDWA9Src32); +class getSDWASrcForVT <ValueType VT> { + RegisterOperand ret = !if(!eq(VT.Size, 16), SDWASrc16, SDWASrc32); } // Returns the register class to use for sources of VOP3 
instructions for the @@ -995,7 +1016,7 @@ class getSrcMod <ValueType VT> { ); } -// Return type of input modifiers operand specified input operand for SDWA/DPP +// Return type of input modifiers operand specified input operand for DPP class getSrcModExt <ValueType VT> { bit isFP = !if(!eq(VT.Value, f16.Value), 1, !if(!eq(VT.Value, f32.Value), 1, @@ -1004,13 +1025,13 @@ class getSrcModExt <ValueType VT> { Operand ret = !if(isFP, FPVRegInputMods, IntVRegInputMods); } -// Return type of input modifiers operand specified input operand for SDWA 9 -class getSrcModSDWA9 <ValueType VT> { +// Return type of input modifiers operand specified input operand for SDWA +class getSrcModSDWA <ValueType VT> { bit isFP = !if(!eq(VT.Value, f16.Value), 1, !if(!eq(VT.Value, f32.Value), 1, !if(!eq(VT.Value, f64.Value), 1, 0))); - Operand ret = !if(isFP, FPRegInputMods, IntRegInputMods); + Operand ret = !if(isFP, FPRegSDWAInputMods, IntRegSDWAInputMods); } // Returns the input arguments for VOP[12C] instructions for the given SrcVT. @@ -1141,36 +1162,12 @@ class getInsDPP <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs, /* endif */))); } -class getInsSDWA <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs, - bit HasFloatModifiers, Operand Src0Mod, Operand Src1Mod, - ValueType DstVT> { - dag ret = !if(!eq(NumSrcArgs, 0), - // VOP1 without input operands (V_NOP) - (ins), - !if(!eq(NumSrcArgs, 1), - // VOP1_SDWA - (ins Src0Mod:$src0_modifiers, Src0RC:$src0, - clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, - src0_sel:$src0_sel), - !if(!eq(NumSrcArgs, 2), - !if(!eq(DstVT.Size, 1), - // VOPC_SDWA with modifiers - (ins Src0Mod:$src0_modifiers, Src0RC:$src0, - Src1Mod:$src1_modifiers, Src1RC:$src1, - clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel), - // VOP2_SDWA with modifiers - (ins Src0Mod:$src0_modifiers, Src0RC:$src0, - Src1Mod:$src1_modifiers, Src1RC:$src1, - clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, - src0_sel:$src0_sel, src1_sel:$src1_sel)), - (ins)/* endif */))); -} -// Ins for GFX9 SDWA -class getInsSDWA9 <RegisterOperand Src0RC, RegisterOperand Src1RC, int NumSrcArgs, - bit HasSDWAOMod, Operand Src0Mod, Operand Src1Mod, - ValueType DstVT> { +// Ins for SDWA +class getInsSDWA <RegisterOperand Src0RC, RegisterOperand Src1RC, int NumSrcArgs, + bit HasSDWAOMod, Operand Src0Mod, Operand Src1Mod, + ValueType DstVT> { dag ret = !if(!eq(NumSrcArgs, 0), // VOP1 without input operands (V_NOP) @@ -1178,31 +1175,31 @@ class getInsSDWA9 <RegisterOperand Src0RC, RegisterOperand Src1RC, int NumSrcArg !if(!eq(NumSrcArgs, 1), // VOP1 !if(!eq(HasSDWAOMod, 0), - // VOP1_SDWA9 without omod + // VOP1_SDWA without omod (ins Src0Mod:$src0_modifiers, Src0RC:$src0, clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel), - // VOP1_SDWA9 with omod + // VOP1_SDWA with omod (ins Src0Mod:$src0_modifiers, Src0RC:$src0, clampmod:$clamp, omod:$omod, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel)), !if(!eq(NumSrcArgs, 2), !if(!eq(DstVT.Size, 1), - // VOPC_SDWA9 + // VOPC_SDWA (ins Src0Mod:$src0_modifiers, Src0RC:$src0, Src1Mod:$src1_modifiers, Src1RC:$src1, - src0_sel:$src0_sel, src1_sel:$src1_sel), - // VOP2_SDWA9 + clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel), + // VOP2_SDWA !if(!eq(HasSDWAOMod, 0), - // VOP2_SDWA9 without omod + // VOP2_SDWA without omod (ins Src0Mod:$src0_modifiers, Src0RC:$src0, Src1Mod:$src1_modifiers, Src1RC:$src1, clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel, 
src1_sel:$src1_sel), - // VOP1_SDWA9 with omod + // VOP2_SDWA with omod (ins Src0Mod:$src0_modifiers, Src0RC:$src0, Src1Mod:$src1_modifiers, Src1RC:$src1, clampmod:$clamp, omod:$omod, @@ -1220,12 +1217,12 @@ class getOutsExt <bit HasDst, ValueType DstVT, RegisterOperand DstRCExt> { (outs)); // V_NOP } -// Outs for GFX9 SDWA -class getOutsSDWA9 <bit HasDst, ValueType DstVT, RegisterOperand DstRCSDWA9> { +// Outs for SDWA +class getOutsSDWA <bit HasDst, ValueType DstVT, RegisterOperand DstRCSDWA> { dag ret = !if(HasDst, !if(!eq(DstVT.Size, 1), - (outs DstRCSDWA9:$sdst), - (outs DstRCSDWA9:$vdst)), + (outs DstRCSDWA:$sdst), + (outs DstRCSDWA:$vdst)), (outs)); // V_NOP } @@ -1387,8 +1384,7 @@ class VOPProfile <list<ValueType> _ArgVT> { field ValueType Src2VT = ArgVT[3]; field RegisterOperand DstRC = getVALUDstForVT<DstVT>.ret; field RegisterOperand DstRCDPP = getVALUDstForVT<DstVT>.ret; - field RegisterOperand DstRCSDWA = getVALUDstForVT<DstVT>.ret; - field RegisterOperand DstRCSDWA9 = getSDWA9DstForVT<DstVT>.ret; + field RegisterOperand DstRCSDWA = getSDWADstForVT<DstVT>.ret; field RegisterOperand Src0RC32 = getVOPSrc0ForVT<Src0VT>.ret; field RegisterClass Src1RC32 = getVregSrcForVT<Src1VT>.ret; field RegisterOperand Src0RC64 = getVOP3SrcForVT<Src0VT>.ret; @@ -1396,19 +1392,15 @@ class VOPProfile <list<ValueType> _ArgVT> { field RegisterOperand Src2RC64 = getVOP3SrcForVT<Src2VT>.ret; field RegisterClass Src0DPP = getVregSrcForVT<Src0VT>.ret; field RegisterClass Src1DPP = getVregSrcForVT<Src1VT>.ret; - field RegisterClass Src0SDWA = getVregSrcForVT<Src0VT>.ret; - field RegisterClass Src1SDWA = getVregSrcForVT<Src1VT>.ret; - field RegisterOperand Src0SDWA9 = getSDWA9SrcForVT<Src0VT>.ret; - field RegisterOperand Src1SDWA9 = getSDWA9SrcForVT<Src0VT>.ret; + field RegisterOperand Src0SDWA = getSDWASrcForVT<Src0VT>.ret; + field RegisterOperand Src1SDWA = getSDWASrcForVT<Src0VT>.ret; field Operand Src0Mod = getSrcMod<Src0VT>.ret; field Operand Src1Mod = getSrcMod<Src1VT>.ret; field Operand Src2Mod = getSrcMod<Src2VT>.ret; field Operand Src0ModDPP = getSrcModExt<Src0VT>.ret; field Operand Src1ModDPP = getSrcModExt<Src1VT>.ret; - field Operand Src0ModSDWA = getSrcModExt<Src0VT>.ret; - field Operand Src1ModSDWA = getSrcModExt<Src1VT>.ret; - field Operand Src0ModSDWA9 = getSrcModSDWA9<Src0VT>.ret; - field Operand Src1ModSDWA9 = getSrcModSDWA9<Src1VT>.ret; + field Operand Src0ModSDWA = getSrcModSDWA<Src0VT>.ret; + field Operand Src1ModSDWA = getSrcModSDWA<Src1VT>.ret; field bit HasDst = !if(!eq(DstVT.Value, untyped.Value), 0, 1); @@ -1457,8 +1449,7 @@ class VOPProfile <list<ValueType> _ArgVT> { field dag Outs32 = Outs; field dag Outs64 = Outs; field dag OutsDPP = getOutsExt<HasDst, DstVT, DstRCDPP>.ret; - field dag OutsSDWA = getOutsExt<HasDst, DstVT, DstRCSDWA>.ret; - field dag OutsSDWA9 = getOutsSDWA9<HasDst, DstVT, DstRCSDWA9>.ret; + field dag OutsSDWA = getOutsSDWA<HasDst, DstVT, DstRCSDWA>.ret; field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret; field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs, @@ -1471,11 +1462,9 @@ class VOPProfile <list<ValueType> _ArgVT> { field dag InsDPP = getInsDPP<Src0DPP, Src1DPP, NumSrcArgs, HasModifiers, Src0ModDPP, Src1ModDPP>.ret; field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs, - HasModifiers, Src0ModSDWA, Src1ModSDWA, + HasSDWAOMod, Src0ModSDWA, Src1ModSDWA, DstVT>.ret; - field dag InsSDWA9 = getInsSDWA9<Src0SDWA9, Src1SDWA9, NumSrcArgs, - HasSDWAOMod, Src0ModSDWA9, Src1ModSDWA9, - DstVT>.ret; + field string Asm32 = 
getAsm32<HasDst, NumSrcArgs, DstVT>.ret; field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers, HasOMod, DstVT>.ret; @@ -1628,13 +1617,13 @@ def getSDWAOp : InstrMapping { let ValueCols = [["SDWA"]]; } -// Maps ordinary instructions to their SDWA GFX9 counterparts -def getSDWA9Op : InstrMapping { +// Maps SDWA instructions to their ordinary counterparts +def getBasicFromSDWAOp : InstrMapping { let FilterClass = "VOP"; let RowFields = ["OpName"]; let ColFields = ["AsmVariantName"]; - let KeyCol = ["Default"]; - let ValueCols = [["SDWA9"]]; + let KeyCol = ["SDWA"]; + let ValueCols = [["Default"]]; } def getMaskedMIMGOp : InstrMapping { @@ -1669,7 +1658,9 @@ def getMCOpcodeGen : InstrMapping { let ColFields = ["Subtarget"]; let KeyCol = [!cast<string>(SIEncodingFamily.NONE)]; let ValueCols = [[!cast<string>(SIEncodingFamily.SI)], - [!cast<string>(SIEncodingFamily.VI)]]; + [!cast<string>(SIEncodingFamily.VI)], + [!cast<string>(SIEncodingFamily.SDWA)], + [!cast<string>(SIEncodingFamily.SDWA9)]]; } // Get equivalent SOPK instruction. diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 18b197ddb7ae7..3203c38dae344 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -74,7 +74,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) WorkItemIDX(false), WorkItemIDY(false), WorkItemIDZ(false), - PrivateMemoryInputPtr(false) { + ImplicitBufferPtr(false) { const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); const Function *F = MF.getFunction(); FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(*F); @@ -86,6 +86,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3; ScratchWaveOffsetReg = AMDGPU::SGPR4; FrameOffsetReg = AMDGPU::SGPR5; + StackPtrOffsetReg = AMDGPU::SGPR32; return; } @@ -150,7 +151,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) DispatchID = true; } else if (ST.isMesaGfxShader(MF)) { if (HasStackObjects || MaySpill) - PrivateMemoryInputPtr = true; + ImplicitBufferPtr = true; } // We don't need to worry about accessing spills with flat instructions. @@ -203,11 +204,11 @@ unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) { return FlatScratchInitUserSGPR; } -unsigned SIMachineFunctionInfo::addPrivateMemoryPtr(const SIRegisterInfo &TRI) { - PrivateMemoryPtrUserSGPR = TRI.getMatchingSuperReg( +unsigned SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) { + ImplicitBufferPtrUserSGPR = TRI.getMatchingSuperReg( getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); NumUserSGPRs += 2; - return PrivateMemoryPtrUserSGPR; + return ImplicitBufferPtrUserSGPR; } /// Reserve a slice of a VGPR to support spilling for FrameIndex \p FI. diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 9fdb8caac6f21..05aa249584bf1 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -97,7 +97,7 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { unsigned StackPtrOffsetReg; // Input registers for non-HSA ABI - unsigned PrivateMemoryPtrUserSGPR; + unsigned ImplicitBufferPtrUserSGPR; // Input registers setup for the HSA ABI. // User SGPRs in allocation order. 
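The inverted getBasicFromSDWAOp InstrMapping above is the table behind the declaration added to SIInstrInfo.h earlier in this patch: TableGen emits the lookup, and callers use it to reason about an SDWA instruction through its plain VOP opcode, as the new SDWA verifier does. A hedged usage sketch (the helper name is illustrative; assumes AMDGPU's target-internal SIInstrInfo.h):

    // True if an SDWA opcode wraps a VOPC compare; getBasicFromSDWAOp returns
    // -1 when the opcode has no basic (non-SDWA) counterpart in the mapping.
    static bool isSDWACompare(const SIInstrInfo &TII, uint16_t SDWAOpc) {
      int Basic = AMDGPU::getBasicFromSDWAOp(SDWAOpc);
      return Basic != -1 && TII.isVOPC(static_cast<uint16_t>(Basic));
    }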
@@ -179,7 +179,7 @@ private: // Private memory buffer // Compute directly in sgpr[0:1] // Other shaders indirect 64-bits at sgpr[0:1] - bool PrivateMemoryInputPtr : 1; + bool ImplicitBufferPtr : 1; MCPhysReg getNextUserSGPR() const { assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs"); @@ -236,7 +236,7 @@ public: unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI); unsigned addDispatchID(const SIRegisterInfo &TRI); unsigned addFlatScratchInit(const SIRegisterInfo &TRI); - unsigned addPrivateMemoryPtr(const SIRegisterInfo &TRI); + unsigned addImplicitBufferPtr(const SIRegisterInfo &TRI); // Add system SGPRs. unsigned addWorkGroupIDX() { @@ -341,8 +341,8 @@ public: return WorkItemIDZ; } - bool hasPrivateMemoryInputPtr() const { - return PrivateMemoryInputPtr; + bool hasImplicitBufferPtr() const { + return ImplicitBufferPtr; } unsigned getNumUserSGPRs() const { @@ -396,8 +396,8 @@ public: return QueuePtrUserSGPR; } - unsigned getPrivateMemoryPtrUserSGPR() const { - return PrivateMemoryPtrUserSGPR; + unsigned getImplicitBufferPtrUserSGPR() const { + return ImplicitBufferPtrUserSGPR; } bool hasSpilledSGPRs() const { diff --git a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index f4ddf1891683b..4ac23ef03cb32 100644 --- a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -67,9 +67,9 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; void matchSDWAOperands(MachineFunction &MF); - bool isConvertibleToSDWA(const MachineInstr &MI) const; + bool isConvertibleToSDWA(const MachineInstr &MI, const SISubtarget &ST) const; bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands); - void legalizeScalarOperands(MachineInstr &MI) const; + void legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const; StringRef getPassName() const override { return "SI Peephole SDWA"; } @@ -224,7 +224,7 @@ static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) { static bool isSubregOf(const MachineOperand &SubReg, const MachineOperand &SuperReg, const TargetRegisterInfo *TRI) { - + if (!SuperReg.isReg() || !SubReg.isReg()) return false; @@ -557,7 +557,7 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) { MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); - + if (TRI->isPhysicalRegister(Src0->getReg()) || TRI->isPhysicalRegister(Dst->getReg())) break; @@ -590,7 +590,7 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) { break; MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); - + if (TRI->isPhysicalRegister(Src1->getReg()) || TRI->isPhysicalRegister(Dst->getReg())) break; @@ -607,16 +607,38 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) { } } -bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI) const { +bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI, + const SISubtarget &ST) const { // Check if this instruction has opcode that supports SDWA - unsigned Opc = MI.getOpcode(); - if (AMDGPU::getSDWAOp(Opc) != -1) - return true; - int Opc32 = AMDGPU::getVOPe32(Opc); - if (Opc32 != -1 && AMDGPU::getSDWAOp(Opc32) != -1) - return !TII->hasModifiersSet(MI, AMDGPU::OpName::omod) && - !TII->getNamedOperand(MI, AMDGPU::OpName::sdst); - return false; + int Opc = MI.getOpcode(); + if (AMDGPU::getSDWAOp(Opc) == -1) + Opc = AMDGPU::getVOPe32(Opc); + + if (Opc == -1 || AMDGPU::getSDWAOp(Opc) == -1) + 
return false; + + if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod)) + return false; + + if (TII->isVOPC(Opc)) { + if (!ST.hasSDWASdst()) { + const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst); + if (SDst && SDst->getReg() != AMDGPU::VCC) + return false; + } + + if (!ST.hasSDWAClampVOPC() && TII->hasModifiersSet(MI, AMDGPU::OpName::clamp)) + return false; + + } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) { + return false; + } + + if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_MAC_F16_e32 || + Opc == AMDGPU::V_MAC_F32_e32)) + return false; + + return true; } bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, @@ -641,6 +663,11 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, if (Dst) { assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1); SDWAInst.add(*Dst); + } else { + Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst); + assert(Dst && + AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1); + SDWAInst.add(*Dst); + } // Copy src0, initialize src0_modifiers. All sdwa instructions has src0 and @@ -677,9 +704,23 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, SDWAInst.add(*Src2); } - // Initialize clamp. + // Copy clamp if present, initialize it otherwise. assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1); - SDWAInst.addImm(0); + MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp); + if (Clamp) { + SDWAInst.add(*Clamp); + } else { + SDWAInst.addImm(0); + } + + // Copy omod if present, initialize it otherwise if needed. + MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod); + if (OMod) { + assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1); + SDWAInst.add(*OMod); + } else if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1) { + SDWAInst.addImm(0); + } // Initialize dst_sel and dst_unused if present if (Dst) { @@ -733,16 +774,25 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, } // If an instruction was converted to SDWA it should not have immediates or SGPR -// operands. Copy its scalar operands into VGPRs. +// operands (one SGPR is allowed on GFX9). Copy its scalar operands into VGPRs. 
+void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const { const MCInstrDesc &Desc = TII->get(MI.getOpcode()); - for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) { - MachineOperand &Op = MI.getOperand(I); + unsigned ConstantBusCount = 0; + for (MachineOperand &Op: MI.explicit_uses()) { if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg()))) continue; + + unsigned I = MI.getOperandNo(&Op); if (Desc.OpInfo[I].RegClass == -1 || !TRI->hasVGPRs(TRI->getRegClass(Desc.OpInfo[I].RegClass))) continue; + + if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() && + TRI->isSGPRReg(*MRI, Op.getReg())) { + ++ConstantBusCount; + continue; + } + unsigned VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), VGPR); @@ -758,22 +808,20 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI) const { bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) { const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); - if (!ST.hasSDWA() || - !AMDGPU::isVI(ST)) { // TODO: Add support for SDWA on gfx9 + if (!ST.hasSDWA()) return false; - } MRI = &MF.getRegInfo(); TRI = ST.getRegisterInfo(); TII = ST.getInstrInfo(); - + // Find all SDWA operands in MF. matchSDWAOperands(MF); for (const auto &OperandPair : SDWAOperands) { const auto &Operand = OperandPair.second; MachineInstr *PotentialMI = Operand->potentialToConvert(TII); - if (PotentialMI && isConvertibleToSDWA(*PotentialMI)) { + if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) { PotentialMatches[PotentialMI].push_back(Operand.get()); } } @@ -788,7 +836,7 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) { bool Ret = !ConvertedInstructions.empty(); while (!ConvertedInstructions.empty()) - legalizeScalarOperands(*ConvertedInstructions.pop_back_val()); + legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST); return Ret; } diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index b611f28fcabdf..ef6ad4ad0c8f3 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1044,18 +1044,29 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, unsigned CarryOut = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); unsigned ScaledReg - = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - // XXX - Should this use a vector shift? - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg) - .addReg(DiffReg, RegState::Kill) - .addImm(Log2_32(ST.getWavefrontSize())); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ScaledReg) + .addImm(Log2_32(ST.getWavefrontSize())) + .addReg(DiffReg, RegState::Kill); // TODO: Fold if use instruction is another add of a constant. 
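In the eliminateFrameIndex hunk above, the shift becomes a V_LSHRREV_B32 into a VGPR instead of an S_LSHR_B32 into an SGPR, most likely so the V_ADD_I32 that follows stays legal once its other source is a literal or an S_MOV_B32-materialized constant (VALU instructions may read at most one scalar source). The address math itself is unchanged; a small sketch of it (illustrative, using LLVM's Log2_32):

    #include "llvm/Support/MathExtras.h" // Log2_32
    #include <cstdint>

    // The wave-relative byte delta is divided by the wavefront size (a right
    // shift, since the size is a power of two), then the frame object's
    // constant offset is added on top.
    uint32_t frameIndexOffset(uint32_t WaveByteDelta, uint32_t WavefrontSize,
                              uint32_t ConstOffset) {
      return (WaveByteDelta >> llvm::Log2_32(WavefrontSize)) + ConstOffset;
    }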
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_I32_e64), ResultReg) - .addReg(CarryOut, RegState::Define | RegState::Dead) - .addImm(Offset) - .addReg(ScaledReg, RegState::Kill); + if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) { + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_I32_e64), ResultReg) + .addReg(CarryOut, RegState::Define | RegState::Dead) + .addImm(Offset) + .addReg(ScaledReg, RegState::Kill); + } else { + unsigned ConstOffsetReg + = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg) + .addImm(Offset); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_I32_e64), ResultReg) + .addReg(CarryOut, RegState::Define | RegState::Dead) + .addReg(ConstOffsetReg, RegState::Kill) + .addReg(ScaledReg, RegState::Kill); + } MRI.setRegAllocationHint(CarryOut, 0, AMDGPU::VCC); } @@ -1341,12 +1352,11 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET: return MFI->PrivateSegmentWaveByteOffsetSystemSGPR; case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER: - if (ST.isAmdCodeObjectV2(MF)) { - assert(MFI->hasPrivateSegmentBuffer()); - return MFI->PrivateSegmentBufferUserSGPR; - } - assert(MFI->hasPrivateMemoryInputPtr()); - return MFI->PrivateMemoryPtrUserSGPR; + assert(MFI->hasPrivateSegmentBuffer()); + return MFI->PrivateSegmentBufferUserSGPR; + case SIRegisterInfo::IMPLICIT_BUFFER_PTR: + assert(MFI->hasImplicitBufferPtr()); + return MFI->ImplicitBufferPtrUserSGPR; case SIRegisterInfo::KERNARG_SEGMENT_PTR: assert(MFI->hasKernargSegmentPtr()); return MFI->KernargSegmentPtrUserSGPR; diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h index 8fed6d5f9710f..600cc886cb595 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/lib/Target/AMDGPU/SIRegisterInfo.h @@ -197,12 +197,13 @@ public: WORKGROUP_ID_Y = 11, WORKGROUP_ID_Z = 12, PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14, + IMPLICIT_BUFFER_PTR = 15, // VGPRS: - FIRST_VGPR_VALUE = 15, + FIRST_VGPR_VALUE = 16, WORKITEM_ID_X = FIRST_VGPR_VALUE, - WORKITEM_ID_Y = 16, - WORKITEM_ID_Z = 17 + WORKITEM_ID_Y = 17, + WORKITEM_ID_Z = 18 }; /// \brief Returns the physical register that \p Value is stored in. diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp index c5f121757e623..96a18544f02ac 100644 --- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -92,6 +92,8 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII, case AMDGPU::V_ADDC_U32_e64: case AMDGPU::V_SUBB_U32_e64: + if (TII->getNamedOperand(MI, AMDGPU::OpName::src1)->isImm()) + return false; // Additional verification is needed for sdst/src2. 
return true; diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index f581e69980c79..26515b27bb77d 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -538,6 +538,27 @@ bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) { Reg == AMDGPU::SCC; } +bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI) { + + if (Reg0 == Reg1) { + return true; + } + + unsigned SubReg0 = TRI->getSubReg(Reg0, 1); + if (SubReg0 == 0) { + return TRI->getSubRegIndex(Reg1, Reg0) > 0; + } + + for (unsigned Idx = 2; SubReg0 > 0; ++Idx) { + if (isRegIntersect(Reg1, SubReg0, TRI)) { + return true; + } + SubReg0 = TRI->getSubReg(Reg0, Idx); + } + + return false; +} + unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) { switch(Reg) { diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index eff0230d21f57..936e4921a7097 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -271,6 +271,9 @@ bool isGFX9(const MCSubtargetInfo &STI); /// \brief Is Reg - scalar register bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI); +/// \brief Is there any intersection between registers +bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI); + /// If \p Reg is a pseudo reg, return the correct hardware register given /// \p STI otherwise return \p Reg. unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI); diff --git a/lib/Target/AMDGPU/VOP1Instructions.td b/lib/Target/AMDGPU/VOP1Instructions.td index 95b5ef0a49dba..96b33c373f052 100644 --- a/lib/Target/AMDGPU/VOP1Instructions.td +++ b/lib/Target/AMDGPU/VOP1Instructions.td @@ -93,11 +93,6 @@ class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : let AsmMatchConverter = "cvtSdwaVOP1"; } -class VOP1_SDWA9_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : - VOP_SDWA9_Pseudo <OpName, P, pattern> { - let AsmMatchConverter = "cvtSdwaVOP1"; -} - class getVOP1Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies { list<dag> ret = !if(P.HasModifiers, @@ -117,7 +112,6 @@ multiclass VOP1Inst <string opName, VOPProfile P, def _e32 : VOP1_Pseudo <opName, P>; def _e64 : VOP3_Pseudo <opName, P, getVOP1Pat64<node, P>.ret>; def _sdwa : VOP1_SDWA_Pseudo <opName, P>; - def _sdwa9 : VOP1_SDWA9_Pseudo <opName, P>; } // Special profile for instructions which have clamp @@ -274,12 +268,10 @@ def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> { let Ins64 = (ins Src0RC64:$vdst, VSrc_b32:$src0); let InsDPP = (ins Src0RC32:$vdst, Src0RC32:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); + let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, - clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, + clampmod:$clamp, omod:$omod, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel); - let InsSDWA9 = (ins Src0RC32:$vdst, Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0, - clampmod:$clamp, omod:$omod, dst_sel:$dst_sel, dst_unused:$dst_unused, - src0_sel:$src0_sel); let Asm32 = getAsm32<1, 1>.ret; let Asm64 = getAsm64<1, 1, 0, 1>.ret; @@ -545,8 +537,8 @@ multiclass VOP1_Real_vi <bits<10> op> { VOP1_SDWAe <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; def _sdwa_gfx9 : - VOP_SDWA9_Real <!cast<VOP1_SDWA9_Pseudo>(NAME#"_sdwa9")>, - VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA9_Pseudo>(NAME#"_sdwa9").Pfl>; + VOP_SDWA9_Real 
<!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>, + VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; // For now left dpp only for asm/dasm // TODO: add corresponding pseudo diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td index 657cacaa792ca..7b9bc71ad4c77 100644 --- a/lib/Target/AMDGPU/VOP2Instructions.td +++ b/lib/Target/AMDGPU/VOP2Instructions.td @@ -114,11 +114,6 @@ class VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : let AsmMatchConverter = "cvtSdwaVOP2"; } -class VOP2_SDWA9_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : - VOP_SDWA9_Pseudo <OpName, P, pattern> { - let AsmMatchConverter = "cvtSdwaVOP2"; -} - class getVOP2Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies { list<dag> ret = !if(P.HasModifiers, [(set P.DstVT:$vdst, @@ -139,7 +134,6 @@ multiclass VOP2Inst <string opName, Commutable_REV<revOp#"_e64", !eq(revOp, opName)>; def _sdwa : VOP2_SDWA_Pseudo <opName, P>; - def _sdwa9 : VOP2_SDWA9_Pseudo <opName, P>; } multiclass VOP2bInst <string opName, @@ -156,10 +150,6 @@ multiclass VOP2bInst <string opName, def _sdwa : VOP2_SDWA_Pseudo <opName, P> { let AsmMatchConverter = "cvtSdwaVOP2b"; } - - def _sdwa9 : VOP2_SDWA9_Pseudo <opName, P> { - let AsmMatchConverter = "cvtSdwaVOP2b"; - } } def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>, @@ -221,17 +211,13 @@ class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> { VGPR_32:$src2, // stub argument dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); + let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, VGPR_32:$src2, // stub argument - clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, + clampmod:$clamp, omod:$omod, + dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel, src1_sel:$src1_sel); - let InsSDWA9 = (ins Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0, - Src1ModSDWA9:$src1_modifiers, Src1SDWA9:$src1, - VGPR_32:$src2, // stub argument - clampmod:$clamp, omod:$omod, - dst_sel:$dst_sel, dst_unused:$dst_unused, - src0_sel:$src0_sel, src1_sel:$src1_sel); let Asm32 = getAsm32<1, 2, vt>.ret; let Asm64 = getAsm64<1, 2, HasModifiers, HasOMod, vt>.ret; let AsmDPP = getAsmDPP<1, 2, HasModifiers, vt>.ret; @@ -289,15 +275,10 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> { let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, - clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, + clampmod:$clamp, omod:$omod, + dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel, src1_sel:$src1_sel); - let InsSDWA9 = (ins Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0, - Src1ModSDWA9:$src1_modifiers, Src1SDWA9:$src1, - clampmod:$clamp, omod:$omod, - dst_sel:$dst_sel, dst_unused:$dst_unused, - src0_sel:$src0_sel, src1_sel:$src1_sel); - let InsDPP = (ins Src0Mod:$src0_modifiers, Src0DPP:$src0, Src1Mod:$src1_modifiers, Src1DPP:$src1, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, @@ -326,6 +307,8 @@ def VOP_READLANE : VOPProfile<[i32, i32, i32]> { let Ins64 = Ins32; let Asm32 = " $vdst, $src0, $src1"; let Asm64 = Asm32; + let HasExt = 0; + let HasSDWA9 = 0; } def VOP_WRITELANE : VOPProfile<[i32, i32, i32]> { @@ -335,6 +318,8 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32]> { let Ins64 = Ins32; let Asm32 = " $vdst, $src0, $src1"; let Asm64 = Asm32; + let HasExt = 0; + let HasSDWA9 = 0; } 
//===----------------------------------------------------------------------===// @@ -397,20 +382,29 @@ def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE, def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, [], "">; } // End isConvergent = 1 -defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_I32_I32_I32>; -defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_I32_I32_I32>; -defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_I32_I32_I32, int_amdgcn_mbcnt_lo>; -defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_I32_I32_I32, int_amdgcn_mbcnt_hi>; -defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_F32_F32_I32, AMDGPUldexp>; -defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_I32_F32_I32>; // TODO: set "Uses = dst" -defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_I32_F32_F32>; -defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_I32_F32_F32>; -defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_I32_F32_F32, AMDGPUpkrtz_f16_f32>; -defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_I32_I32_I32>; -defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_I32_I32_I32>; +defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_NO_EXT<VOP_I32_I32_I32>>; +defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>>; +defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_lo>; +defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_hi>; +defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_NO_EXT<VOP_F32_F32_I32>, AMDGPUldexp>; +defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_NO_EXT<VOP_I32_F32_I32>>; // TODO: set "Uses = dst" +defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT<VOP_I32_F32_F32>>; +defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT<VOP_I32_F32_F32>>; +defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpkrtz_f16_f32>; +defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_I32_I32_I32>>; +defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_I32_I32_I32>>; } // End SubtargetPredicate = isGCN +def : Pat< + (AMDGPUadde i32:$src0, i32:$src1, i1:$src2), + (V_ADDC_U32_e64 $src0, $src1, $src2) +>; + +def : Pat< + (AMDGPUsube i32:$src0, i32:$src1, i1:$src2), + (V_SUBB_U32_e64 $src0, $src1, $src2) +>; // These instructions only exist on SI and CI let SubtargetPredicate = isSICI in { @@ -728,8 +722,8 @@ multiclass VOP2_SDWA_Real <bits<6> op> { multiclass VOP2_SDWA9_Real <bits<6> op> { def _sdwa_gfx9 : - VOP_SDWA9_Real <!cast<VOP2_SDWA9_Pseudo>(NAME#"_sdwa9")>, - VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA9_Pseudo>(NAME#"_sdwa9").Pfl>; + VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>, + VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; } multiclass VOP2be_Real_e32e64_vi <bits<6> op> : diff --git a/lib/Target/AMDGPU/VOPCInstructions.td b/lib/Target/AMDGPU/VOPCInstructions.td index cd347b86d3050..f3482a22d5dcd 100644 --- a/lib/Target/AMDGPU/VOPCInstructions.td +++ b/lib/Target/AMDGPU/VOPCInstructions.td @@ -113,11 +113,6 @@ class VOPC_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : let AsmMatchConverter = "cvtSdwaVOPC"; } -class VOPC_SDWA9_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : - VOP_SDWA9_Pseudo <OpName, P, pattern> { - let AsmMatchConverter = "cvtSdwaVOPC"; -} - // This class is used only with VOPC 
instructions. Use $sdst for out operand class VOPCInstAlias <VOP3_Pseudo ps, Instruction inst, VOPProfile p = ps.Pfl> : InstAlias <ps.OpName#" "#p.Asm32, (inst)>, PredicateControl { @@ -189,13 +184,6 @@ multiclass VOPC_Pseudos <string opName, let isConvergent = DefExec; let isCompare = 1; } - - def _sdwa9 : VOPC_SDWA9_Pseudo <opName, P> { - let Defs = !if(DefExec, [VCC, EXEC], [VCC]); - let SchedRW = P.Schedule; - let isConvergent = DefExec; - let isCompare = 1; - } } def VOPC_I1_F16_F16 : VOPC_Profile<[Write32Bit], f16>; @@ -540,14 +528,12 @@ class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType vt> : VOPC_Profile<sched, vt, i32> { let Ins64 = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); let Asm64 = "$sdst, $src0_modifiers, $src1"; + let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel); - let InsSDWA9 = (ins Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0, - Src1ModSDWA9:$src1_modifiers, Src1SDWA9:$src1, - src0_sel:$src0_sel, src1_sel:$src1_sel); + let AsmSDWA = " vcc, $src0_modifiers, $src1_modifiers$clamp $src0_sel $src1_sel"; - //let AsmSDWA9 = " $sdst, $src0_modifiers, $src1_modifiers $src0_sel $src1_sel"; let HasSrc1Mods = 0; let HasClamp = 0; let HasOMod = 0; @@ -580,12 +566,6 @@ multiclass VOPC_Class_Pseudos <string opName, VOPC_Profile p, bit DefExec> { let SchedRW = p.Schedule; let isConvergent = DefExec; } - - def _sdwa9 : VOPC_SDWA9_Pseudo <opName, p> { - let Defs = !if(DefExec, [VCC, EXEC], [VCC]); - let SchedRW = p.Schedule; - let isConvergent = DefExec; - } } def VOPC_I1_F16_I32 : VOPC_Class_Profile<[Write32Bit], f16>; @@ -954,8 +934,8 @@ multiclass VOPC_Real_vi <bits<10> op> { VOPC_SDWAe <op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; def _sdwa_gfx9 : - VOP_SDWA9_Real <!cast<VOPC_SDWA9_Pseudo>(NAME#"_sdwa9")>, - VOPC_SDWA9e <op{7-0}, !cast<VOPC_SDWA9_Pseudo>(NAME#"_sdwa9").Pfl>; + VOP_SDWA9_Real <!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>, + VOPC_SDWA9e <op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; def : VOPCInstAlias <!cast<VOP3_Pseudo>(NAME#"_e64"), !cast<Instruction>(NAME#"_e32_vi")> { diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td index 4da654f84f9d1..e386f21c2ba49 100644 --- a/lib/Target/AMDGPU/VOPInstructions.td +++ b/lib/Target/AMDGPU/VOPInstructions.td @@ -232,11 +232,11 @@ class VOP3Pe <bits<10> op, VOPProfile P> : Enc64 { let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); // neg_hi src1 let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0); // neg_hi src2 - let Inst{11} = !if(P.HasOpSel, src0_modifiers{2}, 0); // op_sel(0) - let Inst{12} = !if(P.HasOpSel, src1_modifiers{2}, 0); // op_sel(1) - let Inst{13} = !if(P.HasOpSel, src2_modifiers{2}, 0); // op_sel(2) + let Inst{11} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{2}, 0); // op_sel(0) + let Inst{12} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{2}, 0); // op_sel(1) + let Inst{13} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{2}, 0); // op_sel(2) - let Inst{14} = !if(P.HasOpSel, src2_modifiers{3}, 0); // op_sel_hi(2) + let Inst{14} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{3}, 0); // op_sel_hi(2) let Inst{15} = !if(P.HasClamp, clamp{0}, 0); @@ -245,8 +245,8 @@ class VOP3Pe <bits<10> op, VOPProfile P> : Enc64 { let Inst{40-32} = !if(P.HasSrc0, src0, 0); let Inst{49-41} = !if(P.HasSrc1, src1, 0); let Inst{58-50} = !if(P.HasSrc2, src2, 0); - let Inst{59} = !if(P.HasOpSel, src0_modifiers{3}, 0); // 
op_sel_hi(0) - let Inst{60} = !if(P.HasOpSel, src1_modifiers{3}, 0); // op_sel_hi(1) + let Inst{59} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{3}, 0); // op_sel_hi(0) + let Inst{60} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3}, 0); // op_sel_hi(1) let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); // neg (lo) let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); // neg (lo) let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); // neg (lo) @@ -300,6 +300,19 @@ class VOP_SDWAe<VOPProfile P> : Enc64 { let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0); } +// GFX9 adds two features to SDWA: +// 1. Add 3 fields to the SDWA microcode word: S0, S1 and OMOD. +// a. S0 and S1 indicate that source 0 and 1 respectively are SGPRs rather +// than VGPRs (at most 1 can be an SGPR); +// b. OMOD is the standard output modifier (result *2, *4, /2) +// 2. Add a new version of the SDWA microcode word for VOPC: SDWAB. This +// replaces OMOD and the dest fields with SD and SDST (SGPR destination) +// field. +// a. When SD=1, the SDST is used as the destination for the compare result; +// b. When SD=0, VCC is used. +// +// In GFX9, V_MAC_F16, V_MAC_F32 opcodes cannot be used with SDWA + // gfx9 SDWA basic encoding class VOP_SDWA9e<VOPProfile P> : Enc64 { bits<9> src0; // {src0_sgpr{0}, src0{7-0}} @@ -353,6 +366,7 @@ class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> : string Mnemonic = opName; string AsmOperands = P.AsmSDWA; + string AsmOperands9 = P.AsmSDWA9; let Size = 8; let mayLoad = 0; @@ -372,53 +386,9 @@ class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> : VOPProfile Pfl = P; } -// GFX9 adds two features to SDWA: -// 1. Add 3 fields to the SDWA microcode word: S0, S1 and OMOD. -// a. S0 and S1 indicate that source 0 and 1 respectively are SGPRs rather -// than VGPRs (at most 1 can be an SGPR); -// b. OMOD is the standard output modifier (result *2, *4, /2) -// 2. Add a new version of the SDWA microcode word for VOPC: SDWAB. This -// replaces OMOD and the dest fields with SD and SDST (SGPR destination) -// field. -// a. When SD=1, the SDST is used as the destination for the compare result; -// b.when SD=0, VCC is used. 
-// -// In GFX9, V_MAC_F16, V_MAC_F32 opcodes cannot be used with SDWA - -class VOP_SDWA9_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> : - InstSI <P.OutsSDWA9, P.InsSDWA9, "", pattern>, - VOP <opName>, - SIMCInstr <opName#"_sdwa9", SIEncodingFamily.NONE>, - MnemonicAlias <opName#"_sdwa9", opName> { - - let isPseudo = 1; - let isCodeGenOnly = 1; - let UseNamedOperandTable = 1; - - string Mnemonic = opName; - string AsmOperands = P.AsmSDWA9; - - let Size = 8; - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - - let VALU = 1; - let SDWA = 1; - let Uses = [EXEC]; - - let SubtargetPredicate = !if(P.HasSDWA9, HasSDWA9, DisableInst); - let AssemblerPredicate = !if(P.HasSDWA9, HasSDWA9, DisableInst); - let AsmVariantName = !if(P.HasSDWA9, AMDGPUAsmVariants.SDWA9, - AMDGPUAsmVariants.Disable); - let DecoderNamespace = "SDWA9"; - - VOPProfile Pfl = P; -} - class VOP_SDWA_Real <VOP_SDWA_Pseudo ps> : InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>, - SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> { + SIMCInstr <ps.PseudoInstr, SIEncodingFamily.SDWA> { let isPseudo = 0; let isCodeGenOnly = 0; @@ -443,9 +413,9 @@ class VOP_SDWA_Real <VOP_SDWA_Pseudo ps> : let TSFlags = ps.TSFlags; } -class VOP_SDWA9_Real <VOP_SDWA9_Pseudo ps> : - InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>, - SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> { +class VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> : + InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands9, []>, + SIMCInstr <ps.PseudoInstr, SIEncodingFamily.SDWA9> { let isPseudo = 0; let isCodeGenOnly = 0; @@ -458,13 +428,15 @@ class VOP_SDWA9_Real <VOP_SDWA9_Pseudo ps> : let Constraints = ps.Constraints; let DisableEncoding = ps.DisableEncoding; + let SubtargetPredicate = !if(ps.Pfl.HasSDWA9, HasSDWA9, DisableInst); + let AssemblerPredicate = !if(ps.Pfl.HasSDWA9, HasSDWA9, DisableInst); + let AsmVariantName = !if(ps.Pfl.HasSDWA9, AMDGPUAsmVariants.SDWA9, + AMDGPUAsmVariants.Disable); + let DecoderNamespace = "SDWA9"; + // Copy relevant pseudo op flags - let SubtargetPredicate = ps.SubtargetPredicate; - let AssemblerPredicate = ps.AssemblerPredicate; let AsmMatchConverter = ps.AsmMatchConverter; - let AsmVariantName = ps.AsmVariantName; let UseNamedOperandTable = ps.UseNamedOperandTable; - let DecoderNamespace = ps.DecoderNamespace; let Constraints = ps.Constraints; let DisableEncoding = ps.DisableEncoding; let TSFlags = ps.TSFlags; diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index ca68f5d42c32c..6f67183df6a18 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -100,7 +100,8 @@ def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true", "Enable Reliability, Availability and Serviceability extensions">; def FeatureFPAO : SubtargetFeature<"fpao", "HasFPAO", "true", "Enable fast computation of positive address offsets">; - +def FeatureFuseAES : SubtargetFeature<"fuse-aes", "HasFuseAES", "true", + "CPU fuses AES crypto operations">; // Cyclone has preferred instructions for zeroing VFP registers, which can // execute in 0 cycles. 
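FeatureFuseAES follows the usual SubtargetFeature pattern: the TableGen record names the backing boolean ("HasFuseAES", declared in the ARMSubtarget.h hunk further down), and the generated feature-string parser flips that bit when "+fuse-aes" appears. A stripped-down model of that plumbing — hypothetical names, standing in for the generated ParseSubtargetFeatures:

#include <cstddef>
#include <string>

struct SubtargetModel {
  bool HasFuseAES = false;

  // Mimics what the generated parser does with a comma-separated
  // feature string such as "+fuse-aes,-hwdiv".
  void parseFeatures(const std::string &FS) {
    for (std::size_t Pos = 0; Pos < FS.size();) {
      std::size_t End = FS.find(',', Pos);
      if (End == std::string::npos)
        End = FS.size();
      const std::string F = FS.substr(Pos, End - Pos);
      if (F == "+fuse-aes")
        HasFuseAES = true;
      else if (F == "-fuse-aes")
        HasFuseAES = false;
      Pos = End + 1;
    }
  }
};

In practice the bit is toggled with the standard attribute syntax, e.g. llc -mattr=+fuse-aes, or by a CPU definition that lists the feature; the scheduler consults it through the hasFuseAES()/hasFusion() accessors added later in this patch.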
diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp index f9da036c7e468..90f635c812542 100644 --- a/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/lib/Target/ARM/ARMAsmPrinter.cpp @@ -1504,6 +1504,9 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } case ARM::CONSTPOOL_ENTRY: { + if (Subtarget->genExecuteOnly()) + llvm_unreachable("execute-only should not generate constant pools"); + /// CONSTPOOL_ENTRY - This instruction represents a floating constant pool /// in the function. The first operand is the ID# for this instruction, the /// second is the index into the MachineConstantPool that this is, the third diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index 8715657ad5e25..e0810c358f2da 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -665,12 +665,14 @@ bool ARMBaseInstrInfo::isPredicable(const MachineInstr &MI) const { const ARMFunctionInfo *AFI = MI.getParent()->getParent()->getInfo<ARMFunctionInfo>(); + // Neon instructions in Thumb2 IT blocks are deprecated, see ARMARM. + // In their ARM encoding, they can't be encoded in a conditional form. + if ((MI.getDesc().TSFlags & ARMII::DomainMask) == ARMII::DomainNEON) + return false; + if (AFI->isThumb2Function()) { if (getSubtarget().restrictIT()) return isV8EligibleForIT(&MI); - } else { // non-Thumb - if ((MI.getDesc().TSFlags & ARMII::DomainMask) == ARMII::DomainNEON) - return false; } return true; diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 5b2d093e8f0da..2bcc707e9fc3c 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -2669,12 +2669,35 @@ static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) { // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only // be used to form addressing mode. These wrapped nodes will be selected // into MOVi. -static SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) { +SDValue ARMTargetLowering::LowerConstantPool(SDValue Op, + SelectionDAG &DAG) const { EVT PtrVT = Op.getValueType(); // FIXME there is no actual debug info here SDLoc dl(Op); ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); SDValue Res; + + // When generating execute-only code Constant Pools must be promoted to the + // global data section. It's a bit ugly that we can't share them across basic + // blocks, but this way we guarantee that execute-only behaves correct with + // position-independent addressing modes. + if (Subtarget->genExecuteOnly()) { + auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>(); + auto T = const_cast<Type*>(CP->getType()); + auto C = const_cast<Constant*>(CP->getConstVal()); + auto M = const_cast<Module*>(DAG.getMachineFunction(). 
+ getFunction()->getParent()); + auto GV = new GlobalVariable( + *M, T, /*isConst=*/true, GlobalVariable::InternalLinkage, C, + Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" + + Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" + + Twine(AFI->createPICLabelUId()) + ); + SDValue GA = DAG.getTargetGlobalAddress(dyn_cast<GlobalValue>(GV), + dl, PtrVT); + return LowerGlobalAddress(GA, DAG); + } + if (CP->isMachineConstantPoolEntry()) Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CP->getAlignment()); @@ -3118,6 +3141,19 @@ static bool isReadOnly(const GlobalValue *GV) { isa<Function>(GV); } +SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op, + SelectionDAG &DAG) const { + switch (Subtarget->getTargetTriple().getObjectFormat()) { + default: llvm_unreachable("unknown object format"); + case Triple::COFF: + return LowerGlobalAddressWindows(Op, DAG); + case Triple::ELF: + return LowerGlobalAddressELF(Op, DAG); + case Triple::MachO: + return LowerGlobalAddressDarwin(Op, DAG); + } +} + SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = getPointerTy(DAG.getDataLayout()); @@ -7634,21 +7670,9 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("Don't know how to custom lower this!"); case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG); - case ISD::ConstantPool: - if (Subtarget->genExecuteOnly()) - llvm_unreachable("execute-only should not generate constant pools"); - return LowerConstantPool(Op, DAG); + case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); - case ISD::GlobalAddress: - switch (Subtarget->getTargetTriple().getObjectFormat()) { - default: llvm_unreachable("unknown object format"); - case Triple::COFF: - return LowerGlobalAddressWindows(Op, DAG); - case Triple::ELF: - return LowerGlobalAddressELF(Op, DAG); - case Triple::MachO: - return LowerGlobalAddressDarwin(Op, DAG); - } + case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 26da528c19e6d..5044134f5b1e2 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -601,6 +601,8 @@ class InstrItineraryData; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const; SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddressDarwin(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddressWindows(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index 45471a4e95b39..53db5acbe805c 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -4756,6 +4756,16 @@ def t2MOVsr: t2AsmPseudo<"mov${p} $Rd, $shift", def t2MOVSsr: t2AsmPseudo<"movs${p} $Rd, $shift", (ins rGPR:$Rd, so_reg_reg:$shift, pred:$p)>; +// Aliases for the above with the .w qualifier +def : t2InstAlias<"mov${p}.w $Rd, $shift", + (t2MOVsi rGPR:$Rd, t2_so_reg:$shift, pred:$p)>; 
+def : t2InstAlias<"movs${p}.w $Rd, $shift", + (t2MOVSsi rGPR:$Rd, t2_so_reg:$shift, pred:$p)>; +def : t2InstAlias<"mov${p}.w $Rd, $shift", + (t2MOVsr rGPR:$Rd, so_reg_reg:$shift, pred:$p)>; +def : t2InstAlias<"movs${p}.w $Rd, $shift", + (t2MOVSsr rGPR:$Rd, so_reg_reg:$shift, pred:$p)>; + // ADR w/o the .w suffix def : t2InstAlias<"adr${p} $Rd, $addr", (t2ADR rGPR:$Rd, t2adrlabel:$addr, pred:$p)>; diff --git a/lib/Target/ARM/ARMInstructionSelector.cpp b/lib/Target/ARM/ARMInstructionSelector.cpp index 2ae3bad4076b0..4cb0eca5ee5f8 100644 --- a/lib/Target/ARM/ARMInstructionSelector.cpp +++ b/lib/Target/ARM/ARMInstructionSelector.cpp @@ -42,6 +42,10 @@ public: private: bool selectImpl(MachineInstr &I) const; + bool selectICmp(MachineInstrBuilder &MIB, const ARMBaseInstrInfo &TII, + MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, + const RegisterBankInfo &RBI) const; + const ARMBaseInstrInfo &TII; const ARMBaseRegisterInfo &TRI; const ARMBaseTargetMachine &TM; @@ -243,6 +247,105 @@ static unsigned selectLoadStoreOpCode(unsigned Opc, unsigned RegBank, return Opc; } +static ARMCC::CondCodes getComparePred(CmpInst::Predicate Pred) { + switch (Pred) { + // Needs two compares... + case CmpInst::FCMP_ONE: + case CmpInst::FCMP_UEQ: + default: + // AL is our "false" for now. The other two need more compares. + return ARMCC::AL; + case CmpInst::ICMP_EQ: + case CmpInst::FCMP_OEQ: + return ARMCC::EQ; + case CmpInst::ICMP_SGT: + case CmpInst::FCMP_OGT: + return ARMCC::GT; + case CmpInst::ICMP_SGE: + case CmpInst::FCMP_OGE: + return ARMCC::GE; + case CmpInst::ICMP_UGT: + case CmpInst::FCMP_UGT: + return ARMCC::HI; + case CmpInst::FCMP_OLT: + return ARMCC::MI; + case CmpInst::ICMP_ULE: + case CmpInst::FCMP_OLE: + return ARMCC::LS; + case CmpInst::FCMP_ORD: + return ARMCC::VC; + case CmpInst::FCMP_UNO: + return ARMCC::VS; + case CmpInst::FCMP_UGE: + return ARMCC::PL; + case CmpInst::ICMP_SLT: + case CmpInst::FCMP_ULT: + return ARMCC::LT; + case CmpInst::ICMP_SLE: + case CmpInst::FCMP_ULE: + return ARMCC::LE; + case CmpInst::FCMP_UNE: + case CmpInst::ICMP_NE: + return ARMCC::NE; + case CmpInst::ICMP_UGE: + return ARMCC::HS; + case CmpInst::ICMP_ULT: + return ARMCC::LO; + } +} + +bool ARMInstructionSelector::selectICmp(MachineInstrBuilder &MIB, + const ARMBaseInstrInfo &TII, + MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI, + const RegisterBankInfo &RBI) const { + auto &MBB = *MIB->getParent(); + auto InsertBefore = std::next(MIB->getIterator()); + auto &DebugLoc = MIB->getDebugLoc(); + + // Move 0 into the result register. + auto Mov0I = BuildMI(MBB, InsertBefore, DebugLoc, TII.get(ARM::MOVi)) + .addDef(MRI.createVirtualRegister(&ARM::GPRRegClass)) + .addImm(0) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()); + if (!constrainSelectedInstRegOperands(*Mov0I, TII, TRI, RBI)) + return false; + + // Perform the comparison. + auto LHSReg = MIB->getOperand(2).getReg(); + auto RHSReg = MIB->getOperand(3).getReg(); + assert(MRI.getType(LHSReg) == MRI.getType(RHSReg) && + MRI.getType(LHSReg).getSizeInBits() == 32 && + MRI.getType(RHSReg).getSizeInBits() == 32 && + "Unsupported types for comparison operation"); + auto CmpI = BuildMI(MBB, InsertBefore, DebugLoc, TII.get(ARM::CMPrr)) + .addUse(LHSReg) + .addUse(RHSReg) + .add(predOps(ARMCC::AL)); + if (!constrainSelectedInstRegOperands(*CmpI, TII, TRI, RBI)) + return false; + + // Move 1 into the result register if the flags say so. 
+ auto ResReg = MIB->getOperand(0).getReg(); + auto Cond = + static_cast<CmpInst::Predicate>(MIB->getOperand(1).getPredicate()); + auto ARMCond = getComparePred(Cond); + if (ARMCond == ARMCC::AL) + return false; + + auto Mov1I = BuildMI(MBB, InsertBefore, DebugLoc, TII.get(ARM::MOVCCi)) + .addDef(ResReg) + .addUse(Mov0I->getOperand(0).getReg()) + .addImm(1) + .add(predOps(ARMCond, ARM::CPSR)); + if (!constrainSelectedInstRegOperands(*Mov1I, TII, TRI, RBI)) + return false; + + MIB->eraseFromParent(); + return true; +} + bool ARMInstructionSelector::select(MachineInstr &I) const { assert(I.getParent() && "Instruction should be in a basic block!"); assert(I.getParent()->getParent() && "Instruction should be in a function!"); @@ -343,6 +446,8 @@ bool ARMInstructionSelector::select(MachineInstr &I) const { I.setDesc(TII.get(COPY)); return selectCopy(I, TII, MRI, TRI, RBI); } + case G_ICMP: + return selectICmp(MIB, TII, MRI, TRI, RBI); case G_GEP: I.setDesc(TII.get(ARM::ADDrr)); MIB.add(predOps(ARMCC::AL)).add(condCodeOp()); diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp index a706079d98662..5873c7fb38729 100644 --- a/lib/Target/ARM/ARMLegalizerInfo.cpp +++ b/lib/Target/ARM/ARMLegalizerInfo.cpp @@ -86,6 +86,12 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { setAction({G_CONSTANT, s32}, Legal); + setAction({G_ICMP, s1}, Legal); + for (auto Ty : {s8, s16}) + setAction({G_ICMP, 1, Ty}, WidenScalar); + for (auto Ty : {s32, p0}) + setAction({G_ICMP, 1, Ty}, Legal); + if (!ST.useSoftFloat() && ST.hasVFP2()) { setAction({G_FADD, s32}, Legal); setAction({G_FADD, s64}, Legal); diff --git a/lib/Target/ARM/ARMMacroFusion.cpp b/lib/Target/ARM/ARMMacroFusion.cpp new file mode 100644 index 0000000000000..1b6e97c28d453 --- /dev/null +++ b/lib/Target/ARM/ARMMacroFusion.cpp @@ -0,0 +1,57 @@ +//===- ARMMacroFusion.cpp - ARM Macro Fusion ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file This file contains the ARM implementation of the DAG scheduling +/// mutation to pair instructions back to back. +// +//===----------------------------------------------------------------------===// + +#include "ARMMacroFusion.h" +#include "ARMSubtarget.h" +#include "llvm/CodeGen/MacroFusion.h" +#include "llvm/Target/TargetInstrInfo.h" + +namespace llvm { + +/// \brief Check if the instr pair, FirstMI and SecondMI, should be fused +/// together. Given SecondMI, when FirstMI is unspecified, then check if +/// SecondMI may be part of a fused pair at all. +static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, + const TargetSubtargetInfo &TSI, + const MachineInstr *FirstMI, + const MachineInstr &SecondMI) { + const ARMSubtarget &ST = static_cast<const ARMSubtarget&>(TSI); + + // Assume wildcards for unspecified instrs. + unsigned FirstOpcode = + FirstMI ? FirstMI->getOpcode() + : static_cast<unsigned>(ARM::INSTRUCTION_LIST_END); + unsigned SecondOpcode = SecondMI.getOpcode(); + + if (ST.hasFuseAES()) + // Fuse AES crypto operations. + switch(SecondOpcode) { + // AES encode. + case ARM::AESMC : + return FirstOpcode == ARM::AESE || + FirstOpcode == ARM::INSTRUCTION_LIST_END; + // AES decode. 
+ case ARM::AESIMC: + return FirstOpcode == ARM::AESD || + FirstOpcode == ARM::INSTRUCTION_LIST_END; + } + + return false; +} + +std::unique_ptr<ScheduleDAGMutation> createARMMacroFusionDAGMutation () { + return createMacroFusionDAGMutation(shouldScheduleAdjacent); +} + +} // end namespace llvm diff --git a/lib/Target/ARM/ARMMacroFusion.h b/lib/Target/ARM/ARMMacroFusion.h new file mode 100644 index 0000000000000..1e4fc6687eae8 --- /dev/null +++ b/lib/Target/ARM/ARMMacroFusion.h @@ -0,0 +1,24 @@ +//===- ARMMacroFusion.h - ARM Macro Fusion ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file This file contains the ARM definition of the DAG scheduling mutation +/// to pair instructions back to back. +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/MachineScheduler.h" + +namespace llvm { + +/// Note that you have to add: +/// DAG.addMutation(createARMMacroFusionDAGMutation()); +/// to ARMPassConfig::createMachineScheduler() to have an effect. +std::unique_ptr<ScheduleDAGMutation> createARMMacroFusionDAGMutation(); + +} // llvm diff --git a/lib/Target/ARM/ARMRegisterBankInfo.cpp b/lib/Target/ARM/ARMRegisterBankInfo.cpp index f59b075e6dd9a..2350d0c6ef69e 100644 --- a/lib/Target/ARM/ARMRegisterBankInfo.cpp +++ b/lib/Target/ARM/ARMRegisterBankInfo.cpp @@ -255,6 +255,16 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OperandsMapping = getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx], nullptr}); break; + case G_ICMP: { + LLT Ty2 = MRI.getType(MI.getOperand(2).getReg()); + (void)Ty2; + assert(Ty2.getSizeInBits() == 32 && "Unsupported size for G_ICMP"); + OperandsMapping = + getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx], nullptr, + &ARM::ValueMappings[ARM::GPR3OpsIdx], + &ARM::ValueMappings[ARM::GPR3OpsIdx]}); + break; + } case G_MERGE_VALUES: { // We only support G_MERGE_VALUES for creating a double precision floating // point value out of two GPRs. diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index af682dd8321cf..d890d0fa777e8 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -285,6 +285,10 @@ protected: /// HasFPAO - if true, processor does positive address offset computation faster bool HasFPAO = false; + /// HasFuseAES - if true, processor executes back to back AES instruction + /// pairs faster. + bool HasFuseAES = false; + /// If true, if conversion may decide to leave some instructions unpredicated. bool IsProfitableToUnpredicate = false; @@ -561,6 +565,10 @@ public: bool hasD16() const { return HasD16; } bool hasFullFP16() const { return HasFullFP16; } + bool hasFuseAES() const { return HasFuseAES; } + /// \brief Return true if the CPU supports any kind of instruction fusion. 
+ bool hasFusion() const { return hasFuseAES(); } + const Triple &getTargetTriple() const { return TargetTriple; } bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index c0506cfda6129..eb71e557ec917 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -17,6 +17,7 @@ #include "ARMRegisterBankInfo.h" #endif #include "ARMSubtarget.h" +#include "ARMMacroFusion.h" #include "ARMTargetMachine.h" #include "ARMTargetObjectFile.h" #include "ARMTargetTransformInfo.h" @@ -394,6 +395,9 @@ public: createMachineScheduler(MachineSchedContext *C) const override { ScheduleDAGMILive *DAG = createGenericSchedLive(C); // add DAG Mutations here. + const ARMSubtarget &ST = C->MF->getSubtarget<ARMSubtarget>(); + if (ST.hasFusion()) + DAG->addMutation(createARMMacroFusionDAGMutation()); return DAG; } @@ -401,6 +405,9 @@ public: createPostMachineScheduler(MachineSchedContext *C) const override { ScheduleDAGMI *DAG = createGenericSchedPostRA(C); // add DAG Mutations here. + const ARMSubtarget &ST = C->MF->getSubtarget<ARMSubtarget>(); + if (ST.hasFusion()) + DAG->addMutation(createARMMacroFusionDAGMutation()); return DAG; } diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 19fba3033bb2b..891b5c60e1fd6 100644 --- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -6860,6 +6860,17 @@ static unsigned getRealVLDOpcode(unsigned Opc, unsigned &Spacing) { bool ARMAsmParser::processInstruction(MCInst &Inst, const OperandVector &Operands, MCStreamer &Out) { + // Check if we have the wide qualifier, because if it's present we + // must avoid selecting a 16-bit thumb instruction. + bool HasWideQualifier = false; + for (auto &Op : Operands) { + ARMOperand &ARMOp = static_cast<ARMOperand&>(*Op); + if (ARMOp.isToken() && ARMOp.getToken() == ".w") { + HasWideQualifier = true; + break; + } + } + switch (Inst.getOpcode()) { // Alias for alternate form of 'ldr{,b}t Rt, [Rn], #imm' instruction. case ARM::LDRT_POST: @@ -6939,8 +6950,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, // Select the narrow version if the immediate will fit. if (Inst.getOperand(1).getImm() > 0 && Inst.getOperand(1).getImm() <= 0xff && - !(static_cast<ARMOperand &>(*Operands[2]).isToken() && - static_cast<ARMOperand &>(*Operands[2]).getToken() == ".w")) + !HasWideQualifier) Inst.setOpcode(ARM::tLDRpci); else Inst.setOpcode(ARM::t2LDRpci); @@ -6971,10 +6981,9 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, else if (Inst.getOpcode() == ARM::t2LDRConstPool) TmpInst.setOpcode(ARM::t2LDRpci); const ARMOperand &PoolOperand = - (static_cast<ARMOperand &>(*Operands[2]).isToken() && - static_cast<ARMOperand &>(*Operands[2]).getToken() == ".w") ? - static_cast<ARMOperand &>(*Operands[4]) : - static_cast<ARMOperand &>(*Operands[3]); + (HasWideQualifier ? + static_cast<ARMOperand &>(*Operands[4]) : + static_cast<ARMOperand &>(*Operands[3])); const MCExpr *SubExprVal = PoolOperand.getConstantPoolImm(); // If SubExprVal is a constant we may be able to use a MOV if (isa<MCConstantExpr>(SubExprVal) && @@ -8117,8 +8126,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, if (isARMLowRegister(Inst.getOperand(0).getReg()) && isARMLowRegister(Inst.getOperand(1).getReg()) && Inst.getOperand(5).getReg() == (inITBlock() ? 
0 : ARM::CPSR) && - !(static_cast<ARMOperand &>(*Operands[3]).isToken() && - static_cast<ARMOperand &>(*Operands[3]).getToken() == ".w")) { + !HasWideQualifier) { unsigned NewOpc; switch (Inst.getOpcode()) { default: llvm_unreachable("unexpected opcode"); @@ -8152,7 +8160,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, isARMLowRegister(Inst.getOperand(1).getReg()) && isARMLowRegister(Inst.getOperand(2).getReg()) && Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() && - inITBlock() == (Inst.getOpcode() == ARM::t2MOVsr)) + inITBlock() == (Inst.getOpcode() == ARM::t2MOVsr) && + !HasWideQualifier) isNarrow = true; MCInst TmpInst; unsigned newOpc; @@ -8186,7 +8195,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, bool isNarrow = false; if (isARMLowRegister(Inst.getOperand(0).getReg()) && isARMLowRegister(Inst.getOperand(1).getReg()) && - inITBlock() == (Inst.getOpcode() == ARM::t2MOVsi)) + inITBlock() == (Inst.getOpcode() == ARM::t2MOVsi) && + !HasWideQualifier) isNarrow = true; MCInst TmpInst; unsigned newOpc; @@ -8415,10 +8425,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, !isARMLowRegister(Inst.getOperand(0).getReg()) || (Inst.getOperand(2).isImm() && (unsigned)Inst.getOperand(2).getImm() > 255) || - ((!inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR) || - (inITBlock() && Inst.getOperand(5).getReg() != 0)) || - (static_cast<ARMOperand &>(*Operands[3]).isToken() && - static_cast<ARMOperand &>(*Operands[3]).getToken() == ".w")) + Inst.getOperand(5).getReg() != (inITBlock() ? 0 : ARM::CPSR) || + HasWideQualifier) break; MCInst TmpInst; TmpInst.setOpcode(Inst.getOpcode() == ARM::t2ADDri ? @@ -8447,8 +8455,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, } if (!Transform || Inst.getOperand(5).getReg() != 0 || - (static_cast<ARMOperand &>(*Operands[3]).isToken() && - static_cast<ARMOperand &>(*Operands[3]).getToken() == ".w")) + HasWideQualifier) break; MCInst TmpInst; TmpInst.setOpcode(ARM::tADDhirr); @@ -8568,11 +8575,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, if (isARMLowRegister(Inst.getOperand(0).getReg()) && (Inst.getOperand(1).isImm() && (unsigned)Inst.getOperand(1).getImm() <= 255) && - ((!inITBlock() && Inst.getOperand(2).getImm() == ARMCC::AL && - Inst.getOperand(4).getReg() == ARM::CPSR) || - (inITBlock() && Inst.getOperand(4).getReg() == 0)) && - (!static_cast<ARMOperand &>(*Operands[2]).isToken() || - static_cast<ARMOperand &>(*Operands[2]).getToken() != ".w")) { + Inst.getOperand(4).getReg() == (inITBlock() ? 0 : ARM::CPSR) && + !HasWideQualifier) { // The operands aren't in the same order for tMOVi8... MCInst TmpInst; TmpInst.setOpcode(ARM::tMOVi8); @@ -8593,8 +8597,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, isARMLowRegister(Inst.getOperand(1).getReg()) && Inst.getOperand(2).getImm() == ARMCC::AL && Inst.getOperand(4).getReg() == ARM::CPSR && - (!static_cast<ARMOperand &>(*Operands[2]).isToken() || - static_cast<ARMOperand &>(*Operands[2]).getToken() != ".w")) { + !HasWideQualifier) { // The operands aren't the same for tMOV[S]r... (no cc_out) MCInst TmpInst; TmpInst.setOpcode(Inst.getOperand(4).getReg() ? 
ARM::tMOVSr : ARM::tMOVr); @@ -8616,8 +8619,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, if (isARMLowRegister(Inst.getOperand(0).getReg()) && isARMLowRegister(Inst.getOperand(1).getReg()) && Inst.getOperand(2).getImm() == 0 && - (!static_cast<ARMOperand &>(*Operands[2]).isToken() || - static_cast<ARMOperand &>(*Operands[2]).getToken() != ".w")) { + !HasWideQualifier) { unsigned NewOpc; switch (Inst.getOpcode()) { default: llvm_unreachable("Illegal opcode!"); @@ -8716,11 +8718,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, if ((isARMLowRegister(Inst.getOperand(1).getReg()) && isARMLowRegister(Inst.getOperand(2).getReg())) && Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() && - ((!inITBlock() && Inst.getOperand(5).getReg() == ARM::CPSR) || - (inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR)) && - (!static_cast<ARMOperand &>(*Operands[3]).isToken() || - !static_cast<ARMOperand &>(*Operands[3]).getToken().equals_lower( - ".w"))) { + Inst.getOperand(5).getReg() == (inITBlock() ? 0 : ARM::CPSR) && + !HasWideQualifier) { unsigned NewOpc; switch (Inst.getOpcode()) { default: llvm_unreachable("unexpected opcode"); @@ -8756,11 +8755,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, isARMLowRegister(Inst.getOperand(2).getReg())) && (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() || Inst.getOperand(0).getReg() == Inst.getOperand(2).getReg()) && - ((!inITBlock() && Inst.getOperand(5).getReg() == ARM::CPSR) || - (inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR)) && - (!static_cast<ARMOperand &>(*Operands[3]).isToken() || - !static_cast<ARMOperand &>(*Operands[3]).getToken().equals_lower( - ".w"))) { + Inst.getOperand(5).getReg() == (inITBlock() ? 0 : ARM::CPSR) && + !HasWideQualifier) { unsigned NewOpc; switch (Inst.getOpcode()) { default: llvm_unreachable("unexpected opcode"); diff --git a/lib/Target/ARM/CMakeLists.txt b/lib/Target/ARM/CMakeLists.txt index 3cde43967568b..cf6827fd6ca19 100644 --- a/lib/Target/ARM/CMakeLists.txt +++ b/lib/Target/ARM/CMakeLists.txt @@ -49,6 +49,7 @@ add_llvm_target(ARMCodeGen ARMLoadStoreOptimizer.cpp ARMMCInstLower.cpp ARMMachineFunctionInfo.cpp + ARMMacroFusion.cpp ARMRegisterInfo.cpp ARMOptimizeBarriersPass.cpp ARMSelectionDAGInfo.cpp diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index 716492ea25662..81760f03940ad 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -358,11 +358,27 @@ static uint32_t joinHalfWords(uint32_t FirstHalf, uint32_t SecondHalf, return Value; } -unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value, +unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm, + const MCFixup &Fixup, + const MCValue &Target, uint64_t Value, bool IsPCRel, MCContext &Ctx, bool IsLittleEndian, bool IsResolved) const { unsigned Kind = Fixup.getKind(); + + // MachO tries to make .o files that look vaguely pre-linked, so for MOVW/MOVT + // and .word relocations they put the Thumb bit into the addend if possible. + // Other relocation types don't want this bit though (branches couldn't encode + // it if it *was* present, and no other relocations exist) and it can + // interfere with checking valid expressions. 
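The comment block relocated into adjustFixupValue (the check itself follows immediately) concerns Thumb interworking: on MachO, a .word or MOVW/MOVT reference to a Thumb function must carry the Thumb bit in the addend. The bit itself is an architectural fact, illustrated here as a minimal standalone sketch, independent of the MC layer:

#include <cstdint>

// Bit 0 of a code address selects the instruction set on interworking
// transfers (BX/BLX): 0 = ARM, 1 = Thumb. A data reference to a Thumb
// function therefore stores the symbol address with bit 0 set.
constexpr std::uint32_t interworkAddress(std::uint32_t Sym, bool IsThumb) {
  return IsThumb ? (Sym | 1u) : Sym;
}

static_assert(interworkAddress(0x8000, true) == 0x8001, "Thumb bit set");
static_assert(interworkAddress(0x8000, false) == 0x8000, "ARM address kept");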
+ if (const MCSymbolRefExpr *A = Target.getSymA()) { + if (A->hasSubsectionsViaSymbols() && Asm.isThumbFunc(&A->getSymbol()) && + (Kind == FK_Data_4 || Kind == ARM::fixup_arm_movw_lo16 || + Kind == ARM::fixup_arm_movt_hi16 || Kind == ARM::fixup_t2_movw_lo16 || + Kind == ARM::fixup_t2_movt_hi16)) + Value |= 1; + } + switch (Kind) { default: Ctx.reportError(Fixup.getLoc(), "bad relocation fixup type"); @@ -505,6 +521,13 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value, return swapHalfWords(out, IsLittleEndian); } case ARM::fixup_arm_thumb_bl: { + // FIXME: We get both thumb1 and thumb2 in here, so we can only check for + // the less strict thumb2 value. + if (!isInt<26>(Value - 4)) { + Ctx.reportError(Fixup.getLoc(), "Relocation out of range"); + return 0; + } + // The value doesn't encode the low bit (always zero) and is offset by // four. The 32-bit immediate value is encoded as // imm32 = SignExtend(S:I1:I2:imm10:imm11:0) @@ -716,29 +739,11 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value, } void ARMAsmBackend::processFixupValue(const MCAssembler &Asm, - const MCAsmLayout &Layout, const MCFixup &Fixup, - const MCFragment *DF, - const MCValue &Target, uint64_t &Value, - bool &IsResolved) { + const MCValue &Target, bool &IsResolved) { const MCSymbolRefExpr *A = Target.getSymA(); const MCSymbol *Sym = A ? &A->getSymbol() : nullptr; const unsigned FixupKind = Fixup.getKind() ; - // MachO (the only user of "Value") tries to make .o files that look vaguely - // pre-linked, so for MOVW/MOVT and .word relocations they put the Thumb bit - // into the addend if possible. Other relocation types don't want this bit - // though (branches couldn't encode it if it *was* present, and no other - // relocations exist) and it can interfere with checking valid expressions. - if (FixupKind == FK_Data_4 || - FixupKind == ARM::fixup_arm_movw_lo16 || - FixupKind == ARM::fixup_arm_movt_hi16 || - FixupKind == ARM::fixup_t2_movw_lo16 || - FixupKind == ARM::fixup_t2_movt_hi16) { - if (Sym) { - if (Asm.isThumbFunc(Sym)) - Value |= 1; - } - } if (IsResolved && (unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_bl) { assert(Sym && "How did we resolve this?"); @@ -747,7 +752,7 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm, // If the symbol is out of range, produce a relocation and hope the // linker can handle it. GNU AS produces an error in this case. 
- if (Sym->isExternal() || Value >= 0x400004) + if (Sym->isExternal()) IsResolved = false; } // Create relocations for unconditional branches to function symbols with @@ -759,6 +764,7 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm, IsResolved = false; if (!Asm.isThumbFunc(Sym) && (FixupKind == ARM::fixup_arm_thumb_br || FixupKind == ARM::fixup_arm_thumb_bl || + FixupKind == ARM::fixup_t2_condbranch || FixupKind == ARM::fixup_t2_uncondbranch)) IsResolved = false; } @@ -875,22 +881,25 @@ static unsigned getFixupKindContainerSizeBytes(unsigned Kind) { } } -void ARMAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, bool IsPCRel, - MCContext &Ctx) const { +void ARMAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, + MutableArrayRef<char> Data, uint64_t Value, + bool IsPCRel) const { unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); - Value = adjustFixupValue(Fixup, Value, IsPCRel, Ctx, IsLittleEndian, true); + MCContext &Ctx = Asm.getContext(); + Value = adjustFixupValue(Asm, Fixup, Target, Value, IsPCRel, Ctx, + IsLittleEndian, true); if (!Value) return; // Doesn't change encoding. unsigned Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); + assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); // Used to point to big endian bytes. unsigned FullSizeBytes; if (!IsLittleEndian) { FullSizeBytes = getFixupKindContainerSizeBytes(Fixup.getKind()); - assert((Offset + FullSizeBytes) <= DataSize && "Invalid fixup size!"); + assert((Offset + FullSizeBytes) <= Data.size() && "Invalid fixup size!"); assert(NumBytes <= FullSizeBytes && "Invalid fixup size!"); } diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h index 2ddedb5d61059..6a0ba2ed41c1a 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h +++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h @@ -40,17 +40,17 @@ public: /// processFixupValue - Target hook to process the literal value of a fixup /// if necessary. 
- void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout, - const MCFixup &Fixup, const MCFragment *DF, - const MCValue &Target, uint64_t &Value, - bool &IsResolved) override; + void processFixupValue(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, bool &IsResolved) override; - unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, bool IsPCRel, + unsigned adjustFixupValue(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, uint64_t Value, bool IsPCRel, MCContext &Ctx, bool IsLittleEndian, bool IsResolved) const; - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel, MCContext &Ctx) const override; + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const override; unsigned getRelaxedOpcode(unsigned Op) const; diff --git a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp index 00505a103e00f..f74fb2e20b5a3 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp @@ -33,8 +33,8 @@ public: ~ARMWinCOFFObjectWriter() override = default; - unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup, - bool IsCrossSection, + unsigned getRelocType(MCContext &Ctx, const MCValue &Target, + const MCFixup &Fixup, bool IsCrossSection, const MCAsmBackend &MAB) const override; bool recordRelocation(const MCFixup &) const override; @@ -42,7 +42,8 @@ public: } // end anonymous namespace -unsigned ARMWinCOFFObjectWriter::getRelocType(const MCValue &Target, +unsigned ARMWinCOFFObjectWriter::getRelocType(MCContext &Ctx, + const MCValue &Target, const MCFixup &Fixup, bool IsCrossSection, const MCAsmBackend &MAB) const { diff --git a/lib/Target/BPF/BPFISelDAGToDAG.cpp b/lib/Target/BPF/BPFISelDAGToDAG.cpp index 7d5fb6ca17b98..c6ddd6bdad5e6 100644 --- a/lib/Target/BPF/BPFISelDAGToDAG.cpp +++ b/lib/Target/BPF/BPFISelDAGToDAG.cpp @@ -214,7 +214,12 @@ void BPFDAGToDAGISel::PreprocessISelDAG() { if (Opcode != ISD::LOAD) continue; - unsigned char new_val[8]; // hold up the constant values replacing loads. + union { + uint8_t c[8]; + uint16_t s; + uint32_t i; + uint64_t d; + } new_val; // hold up the constant values replacing loads. 
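An aside on the union introduced here: the previous BPF code read wider values back out of raw byte buffers with casts (e.g. *(uint16_t *)new_val, removed just below), which violates C++ strict-aliasing rules and may break alignment assumptions. Reading through a union member is defined in C and widely supported in C++ as a compiler extension; std::memcpy is the fully portable alternative. A minimal sketch of the well-defined form, assuming nothing beyond the standard library:

#include <cstdint>
#include <cstring>

// Safe replacement for `*(const std::uint16_t *)buf`: memcpy from the
// byte buffer into a properly typed object; optimizers lower this to a
// single load on common targets.
std::uint16_t read16(const unsigned char *buf) {
  std::uint16_t v;
  std::memcpy(&v, buf, sizeof(v));
  return v;
}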
bool to_replace = false; SDLoc DL(Node); const LoadSDNode *LD = cast<LoadSDNode>(Node); @@ -242,7 +247,7 @@ void BPFDAGToDAGISel::PreprocessISelDAG() { const ConstantSDNode *CDN = dyn_cast<ConstantSDNode>(OP2.getNode()); if (GADN && CDN) to_replace = - getConstantFieldValue(GADN, CDN->getZExtValue(), size, new_val); + getConstantFieldValue(GADN, CDN->getZExtValue(), size, new_val.c); } else if (LDAddrNode->getOpcode() > ISD::BUILTIN_OP_END && LDAddrNode->getNumOperands() > 0) { DEBUG(dbgs() << "Check candidate load: "; LD->dump(); dbgs() << '\n'); @@ -250,7 +255,7 @@ void BPFDAGToDAGISel::PreprocessISelDAG() { SDValue OP1 = LDAddrNode->getOperand(0); if (const GlobalAddressSDNode *GADN = dyn_cast<GlobalAddressSDNode>(OP1.getNode())) - to_replace = getConstantFieldValue(GADN, 0, size, new_val); + to_replace = getConstantFieldValue(GADN, 0, size, new_val.c); } if (!to_replace) @@ -259,13 +264,13 @@ void BPFDAGToDAGISel::PreprocessISelDAG() { // replacing the old with a new value uint64_t val; if (size == 1) - val = *(uint8_t *)new_val; + val = new_val.c[0]; else if (size == 2) - val = *(uint16_t *)new_val; + val = new_val.s; else if (size == 4) - val = *(uint32_t *)new_val; + val = new_val.i; else { - val = *(uint64_t *)new_val; + val = new_val.d; } DEBUG(dbgs() << "Replacing load of size " << size << " with constant " @@ -318,14 +323,17 @@ bool BPFDAGToDAGISel::getConstantFieldValue(const GlobalAddressSDNode *Node, } // test whether host endianness matches target - uint8_t test_buf[2]; + union { + uint8_t c[2]; + uint16_t s; + } test_buf; uint16_t test_val = 0x2345; if (DL.isLittleEndian()) - support::endian::write16le(test_buf, test_val); + support::endian::write16le(test_buf.c, test_val); else - support::endian::write16be(test_buf, test_val); + support::endian::write16be(test_buf.c, test_val); - bool endian_match = *(uint16_t *)test_buf == test_val; + bool endian_match = test_buf.s == test_val; for (uint64_t i = Offset, j = 0; i < Offset + Size; i++, j++) ByteSeq[j] = endian_match ? 
TmpVal[i] : TmpVal[Offset + Size - 1 - j]; diff --git a/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp b/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp index 80357a63a4e12..15e89fb2a2611 100644 --- a/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp +++ b/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp @@ -27,8 +27,9 @@ public: : MCAsmBackend(), IsLittleEndian(IsLittleEndian) {} ~BPFAsmBackend() override = default; - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel, MCContext &Ctx) const override; + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const override; MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override; @@ -61,9 +62,10 @@ bool BPFAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { return true; } -void BPFAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, bool IsPCRel, - MCContext &Ctx) const { +void BPFAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, + MutableArrayRef<char> Data, uint64_t Value, + bool IsPCRel) const { if (Fixup.getKind() == FK_SecRel_4 || Fixup.getKind() == FK_SecRel_8) { assert(Value == 0); } else if (Fixup.getKind() == FK_Data_4 || Fixup.getKind() == FK_Data_8) { diff --git a/lib/Target/Hexagon/HexagonExpandCondsets.cpp b/lib/Target/Hexagon/HexagonExpandCondsets.cpp index 9f8c9ded8127b..734f3c6658d92 100644 --- a/lib/Target/Hexagon/HexagonExpandCondsets.cpp +++ b/lib/Target/Hexagon/HexagonExpandCondsets.cpp @@ -567,8 +567,19 @@ unsigned HexagonExpandCondsets::getCondTfrOpcode(const MachineOperand &SO, } llvm_unreachable("Invalid register operand"); } - if (SO.isImm() || SO.isFPImm()) - return IfTrue ? C2_cmoveit : C2_cmoveif; + switch (SO.getType()) { + case MachineOperand::MO_Immediate: + case MachineOperand::MO_FPImmediate: + case MachineOperand::MO_ConstantPoolIndex: + case MachineOperand::MO_TargetIndex: + case MachineOperand::MO_JumpTableIndex: + case MachineOperand::MO_ExternalSymbol: + case MachineOperand::MO_GlobalAddress: + case MachineOperand::MO_BlockAddress: + return IfTrue ? C2_cmoveit : C2_cmoveif; + default: + break; + } llvm_unreachable("Unexpected source operand"); } diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp index 18e49c69b8e36..2b0ceaa66258e 100644 --- a/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -1051,10 +1051,26 @@ int HexagonFrameLowering::getFrameIndexReference(const MachineFunction &MF, bool HasExtraAlign = HRI.needsStackRealignment(MF); bool NoOpt = MF.getTarget().getOptLevel() == CodeGenOpt::None; + unsigned FrameSize = MFI.getStackSize(); unsigned SP = HRI.getStackRegister(), FP = HRI.getFrameRegister(); auto &HMFI = *MF.getInfo<HexagonMachineFunctionInfo>(); unsigned AP = HMFI.getStackAlignBasePhysReg(); - unsigned FrameSize = MFI.getStackSize(); + // It may happen that AP will be absent even HasAlloca && HasExtraAlign + // is true. HasExtraAlign may be set because of vector spills, without + // aligned locals or aligned outgoing function arguments. Since vector + // spills will ultimately be "unaligned", it is safe to use FP as the + // base register. + // In fact, in such a scenario the stack is actually not required to be + // aligned, although it may end up being aligned anyway, since this + // particular case is not easily detectable. 
The alignment will be + // unnecessary, but not incorrect. + // Unfortunately there is no quick way to verify that the above is + // indeed the case (and that it's not a result of an error), so just + // assume that missing AP will be replaced by FP. + // (A better fix would be to rematerialize AP from FP and always align + // vector spills.) + if (AP == 0) + AP = FP; bool UseFP = false, UseAP = false; // Default: use SP (except at -O0). // Use FP at -O0, except when there are objects with extra alignment. @@ -2454,9 +2470,44 @@ bool HexagonFrameLowering::mayOverflowFrameOffset(MachineFunction &MF) const { unsigned StackSize = MF.getFrameInfo().estimateStackSize(MF); auto &HST = MF.getSubtarget<HexagonSubtarget>(); // A fairly simplistic guess as to whether a potential load/store to a - // stack location could require an extra register. It does not account - // for store-immediate instructions. - if (HST.useHVXOps()) - return StackSize > 256; + // stack location could require an extra register. + if (HST.useHVXOps() && StackSize > 256) + return true; + + // Check if the function has store-immediate instructions that access + // the stack. Since the offset field is not extendable, if the stack + // size exceeds the offset limit (6 bits, shifted), the stores will + // require a new base register. + bool HasImmStack = false; + unsigned MinLS = ~0u; // Log_2 of the memory access size. + + for (const MachineBasicBlock &B : MF) { + for (const MachineInstr &MI : B) { + unsigned LS = 0; + switch (MI.getOpcode()) { + case Hexagon::S4_storeirit_io: + case Hexagon::S4_storeirif_io: + case Hexagon::S4_storeiri_io: + ++LS; + LLVM_FALLTHROUGH; + case Hexagon::S4_storeirht_io: + case Hexagon::S4_storeirhf_io: + case Hexagon::S4_storeirh_io: + ++LS; + LLVM_FALLTHROUGH; + case Hexagon::S4_storeirbt_io: + case Hexagon::S4_storeirbf_io: + case Hexagon::S4_storeirb_io: + if (MI.getOperand(0).isFI()) + HasImmStack = true; + MinLS = std::min(MinLS, LS); + break; + } + } + } + + if (HasImmStack) + return !isUInt<6>(StackSize >> MinLS); + return false; } diff --git a/lib/Target/Hexagon/HexagonGenMux.cpp b/lib/Target/Hexagon/HexagonGenMux.cpp index 11ac5454f6043..5abbcbba72ddd 100644 --- a/lib/Target/Hexagon/HexagonGenMux.cpp +++ b/lib/Target/Hexagon/HexagonGenMux.cpp @@ -28,6 +28,7 @@ #include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -295,15 +296,12 @@ bool HexagonGenMux::genMuxInBlock(MachineBasicBlock &B) { unsigned SR1 = Src1->isReg() ? Src1->getReg() : 0; unsigned SR2 = Src2->isReg() ? Src2->getReg() : 0; bool Failure = false, CanUp = true, CanDown = true; - bool Used1 = false, Used2 = false; for (unsigned X = MinX+1; X < MaxX; X++) { const DefUseInfo &DU = DUM.lookup(X); if (DU.Defs[PR] || DU.Defs[DR] || DU.Uses[DR]) { Failure = true; break; } - Used1 |= DU.Uses[SR1]; - Used2 |= DU.Uses[SR2]; if (CanDown && DU.Defs[SR1]) CanDown = false; if (CanUp && DU.Defs[SR2]) @@ -317,64 +315,52 @@ bool HexagonGenMux::genMuxInBlock(MachineBasicBlock &B) { // Prefer "down", since this will move the MUX farther away from the // predicate definition. MachineBasicBlock::iterator At = CanDown ? Def2 : Def1; - if (CanDown) { - // If the MUX is placed "down", we need to make sure that there aren't - // any kills of the source registers between the two defs. 
- if (Used1 || Used2) { - auto ResetKill = [this] (unsigned Reg, MachineInstr &MI) -> bool { - if (MachineOperand *Op = MI.findRegisterUseOperand(Reg, true, HRI)) { - Op->setIsKill(false); - return true; - } - return false; - }; - bool KilledSR1 = false, KilledSR2 = false; - for (MachineInstr &MJ : make_range(std::next(It1), It2)) { - if (SR1) - KilledSR1 |= ResetKill(SR1, MJ); - if (SR2) - KilledSR2 |= ResetKill(SR1, MJ); - } - // If any of the source registers were killed in this range, transfer - // the kills to the source operands: they will me "moved" to the - // resulting MUX and their parent instructions will be deleted. - if (KilledSR1) { - assert(Src1->isReg()); - Src1->setIsKill(true); - } - if (KilledSR2) { - assert(Src2->isReg()); - Src2->setIsKill(true); - } - } - } else { - // If the MUX is placed "up", it shouldn't kill any source registers - // that are still used afterwards. We can reset the kill flags directly - // on the operands, because the source instructions will be erased. - if (Used1 && Src1->isReg()) - Src1->setIsKill(false); - if (Used2 && Src2->isReg()) - Src2->setIsKill(false); - } ML.push_back(MuxInfo(At, DR, PR, SrcT, SrcF, Def1, Def2)); } - for (unsigned I = 0, N = ML.size(); I < N; ++I) { - MuxInfo &MX = ML[I]; - MachineBasicBlock &B = *MX.At->getParent(); - DebugLoc DL = MX.At->getDebugLoc(); + for (MuxInfo &MX : ML) { unsigned MxOpc = getMuxOpcode(*MX.SrcT, *MX.SrcF); if (!MxOpc) continue; - BuildMI(B, MX.At, DL, HII->get(MxOpc), MX.DefR) - .addReg(MX.PredR) - .add(*MX.SrcT) - .add(*MX.SrcF); + MachineBasicBlock &B = *MX.At->getParent(); + const DebugLoc &DL = B.findDebugLoc(MX.At); + auto NewMux = BuildMI(B, MX.At, DL, HII->get(MxOpc), MX.DefR) + .addReg(MX.PredR) + .add(*MX.SrcT) + .add(*MX.SrcF); + NewMux->clearKillInfo(); B.erase(MX.Def1); B.erase(MX.Def2); Changed = true; } + // Fix up kill flags. + + LivePhysRegs LPR(*HRI); + LPR.addLiveOuts(B); + auto IsLive = [&LPR,this] (unsigned Reg) -> bool { + for (MCSubRegIterator S(Reg, HRI, true); S.isValid(); ++S) + if (LPR.contains(*S)) + return true; + return false; + }; + for (auto I = B.rbegin(), E = B.rend(); I != E; ++I) { + if (I->isDebugValue()) + continue; + // This isn't 100% accurate, but it's safe. + // It won't detect (as a kill) a case like this + // r0 = add r0, 1 <-- r0 should be "killed" + // ... = r0 + for (MachineOperand &Op : I->operands()) { + if (!Op.isReg() || !Op.isUse()) + continue; + assert(Op.getSubReg() == 0 && "Should have physical registers only"); + bool Live = IsLive(Op.getReg()); + Op.setIsKill(!Live); + } + LPR.stepBackward(*I); + } + return Changed; } diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp index f43101fa456d5..fec2dc5ce3066 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -94,10 +94,6 @@ static cl::opt<bool> UseDFAHazardRec("dfa-hazard-rec", /// /// Constants for Hexagon instructions. 
/// -const int Hexagon_MEMV_OFFSET_MAX_128B = 896; // #s4: -8*128...7*128 -const int Hexagon_MEMV_OFFSET_MIN_128B = -1024; // #s4 -const int Hexagon_MEMV_OFFSET_MAX = 448; // #s4: -8*64...7*64 -const int Hexagon_MEMV_OFFSET_MIN = -512; // #s4 const int Hexagon_MEMW_OFFSET_MAX = 4095; const int Hexagon_MEMW_OFFSET_MIN = -4096; const int Hexagon_MEMD_OFFSET_MAX = 8191; @@ -2443,8 +2439,7 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset, case Hexagon::V6_vS32b_ai: case Hexagon::V6_vL32Ub_ai: case Hexagon::V6_vS32Ub_ai: - return (Offset >= Hexagon_MEMV_OFFSET_MIN) && - (Offset <= Hexagon_MEMV_OFFSET_MAX); + return isShiftedInt<4,6>(Offset); case Hexagon::PS_vstorerq_ai_128B: case Hexagon::PS_vstorerw_ai_128B: @@ -2454,8 +2449,7 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset, case Hexagon::V6_vS32b_ai_128B: case Hexagon::V6_vL32Ub_ai_128B: case Hexagon::V6_vS32Ub_ai_128B: - return (Offset >= Hexagon_MEMV_OFFSET_MIN_128B) && - (Offset <= Hexagon_MEMV_OFFSET_MAX_128B); + return isShiftedInt<4,7>(Offset); case Hexagon::J2_loop0i: case Hexagon::J2_loop1i: diff --git a/lib/Target/Hexagon/HexagonNewValueJump.cpp b/lib/Target/Hexagon/HexagonNewValueJump.cpp index d73fc7c73185d..de6b203015d8e 100644 --- a/lib/Target/Hexagon/HexagonNewValueJump.cpp +++ b/lib/Target/Hexagon/HexagonNewValueJump.cpp @@ -629,7 +629,7 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) { if (MO.isReg() && MO.isUse()) { unsigned feederReg = MO.getReg(); for (MachineBasicBlock::iterator localII = feederPos, - end = jmpPos; localII != end; localII++) { + end = cmpInstr->getIterator(); localII != end; localII++) { MachineInstr &localMI = *localII; for (unsigned j = 0; j < localMI.getNumOperands(); j++) { MachineOperand &localMO = localMI.getOperand(j); diff --git a/lib/Target/Hexagon/HexagonPeephole.cpp b/lib/Target/Hexagon/HexagonPeephole.cpp index ee3209354688d..7d961a238ae28 100644 --- a/lib/Target/Hexagon/HexagonPeephole.cpp +++ b/lib/Target/Hexagon/HexagonPeephole.cpp @@ -100,9 +100,6 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { MachineFunctionPass::getAnalysisUsage(AU); } - - private: - void ChangeOpInto(MachineOperand &Dst, MachineOperand &Src); }; } @@ -132,7 +129,9 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) { PeepholeDoubleRegsMap.clear(); // Traverse the basic block. - for (MachineInstr &MI : *MBB) { + for (auto I = MBB->begin(), E = MBB->end(), NextI = I; I != E; I = NextI) { + NextI = std::next(I); + MachineInstr &MI = *I; // Look for sign extends: // %vreg170<def> = SXTW %vreg166 if (!DisableOptSZExt && MI.getOpcode() == Hexagon::A2_sxtw) { @@ -280,14 +279,13 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) { if (NewOp) { unsigned PSrc = MI.getOperand(PR).getReg(); if (unsigned POrig = PeepholeMap.lookup(PSrc)) { - MI.getOperand(PR).setReg(POrig); + BuildMI(*MBB, MI.getIterator(), MI.getDebugLoc(), + QII->get(NewOp), MI.getOperand(0).getReg()) + .addReg(POrig) + .add(MI.getOperand(S2)) + .add(MI.getOperand(S1)); MRI->clearKillFlags(POrig); - MI.setDesc(QII->get(NewOp)); - // Swap operands S1 and S2. 
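// For the isValidOffset changes above: llvm::isShiftedInt<N, S>(x) is true
// exactly when x is a multiple of (1 << S) whose quotient fits a signed
// N-bit field. isShiftedInt<4,6> and isShiftedInt<4,7> therefore reproduce
// the deleted Hexagon_MEMV_OFFSET_* ranges (-512..448 and -1024..896) and
// additionally require the offset to be a multiple of the vector size,
// which the old min/max comparison did not. A self-contained reference
// version with a spot check (a sketch, not the LLVM implementation):
#include <cassert>
#include <cstdint>

template <unsigned N, unsigned S> static bool isShiftedIntRef(int64_t X) {
  return X % (INT64_C(1) << S) == 0 &&
         X >= -(INT64_C(1) << (N + S - 1)) && X < (INT64_C(1) << (N + S - 1));
}

int main() {
  assert(isShiftedIntRef<4, 6>(-512) && isShiftedIntRef<4, 6>(448));  // #s4 * 64
  assert(!isShiftedIntRef<4, 6>(100));  // inside the old range, not 64-aligned
  assert(isShiftedIntRef<4, 7>(-1024) && isShiftedIntRef<4, 7>(896)); // #s4 * 128
}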
- MachineOperand Op1 = MI.getOperand(S1); - MachineOperand Op2 = MI.getOperand(S2); - ChangeOpInto(MI.getOperand(S1), Op2); - ChangeOpInto(MI.getOperand(S2), Op1); + MI.eraseFromParent(); } } // if (NewOp) } // if (!Done) @@ -299,40 +297,6 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) { return true; } -void HexagonPeephole::ChangeOpInto(MachineOperand &Dst, MachineOperand &Src) { - assert (&Dst != &Src && "Cannot duplicate into itself"); - switch (Dst.getType()) { - case MachineOperand::MO_Register: - if (Src.isReg()) { - Dst.setReg(Src.getReg()); - Dst.setSubReg(Src.getSubReg()); - MRI->clearKillFlags(Src.getReg()); - } else if (Src.isImm()) { - Dst.ChangeToImmediate(Src.getImm()); - } else { - llvm_unreachable("Unexpected src operand type"); - } - break; - - case MachineOperand::MO_Immediate: - if (Src.isImm()) { - Dst.setImm(Src.getImm()); - } else if (Src.isReg()) { - Dst.ChangeToRegister(Src.getReg(), Src.isDef(), Src.isImplicit(), - false, Src.isDead(), Src.isUndef(), - Src.isDebug()); - Dst.setSubReg(Src.getSubReg()); - } else { - llvm_unreachable("Unexpected src operand type"); - } - break; - - default: - llvm_unreachable("Unexpected dst operand type"); - break; - } -} - FunctionPass *llvm::createHexagonPeephole() { return new HexagonPeephole(); } diff --git a/lib/Target/Hexagon/HexagonSubtarget.cpp b/lib/Target/Hexagon/HexagonSubtarget.cpp index 8851a23ae8ace..0aada8a53c979 100644 --- a/lib/Target/Hexagon/HexagonSubtarget.cpp +++ b/lib/Target/Hexagon/HexagonSubtarget.cpp @@ -1,4 +1,4 @@ -//===-- HexagonSubtarget.cpp - Hexagon Subtarget Information --------------===// +//===- HexagonSubtarget.cpp - Hexagon Subtarget Information ---------------===// // // The LLVM Compiler Infrastructure // @@ -11,13 +11,23 @@ // //===----------------------------------------------------------------------===// -#include "HexagonSubtarget.h" #include "Hexagon.h" +#include "HexagonInstrInfo.h" #include "HexagonRegisterInfo.h" +#include "HexagonSubtarget.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "MCTargetDesc/HexagonMCTargetDesc.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/CodeGen/ScheduleDAGInstrs.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" +#include <algorithm> +#include <cassert> #include <map> using namespace llvm; @@ -119,9 +129,7 @@ HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { HexagonSubtarget::HexagonSubtarget(const Triple &TT, StringRef CPU, StringRef FS, const TargetMachine &TM) : HexagonGenSubtargetInfo(TT, CPU, FS), CPUString(CPU), - InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this), - FrameLowering() { - + InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this) { initializeEnvironment(); // Initialize scheduling itinerary for the specified CPU. 
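// Background for the HexagonDAGMutation hunks just below (sketch only; the
// real apply() body is in the diff): a subtarget injects scheduler tweaks
// by appending owning pointers to ScheduleDAGMutation objects, each of
// which may edit the scheduling DAG before scheduling starts.
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
#include "llvm/CodeGen/ScheduleDAGMutation.h"
#include <memory>
#include <vector>

namespace {
struct ExampleDAGMutation : llvm::ScheduleDAGMutation {
  void apply(llvm::ScheduleDAGInstrs *DAG) override {
    // Inspect DAG->SUnits and add or adjust SDep edges here.
  }
};
} // end anonymous namespace

static void addExampleMutations(
    std::vector<std::unique_ptr<llvm::ScheduleDAGMutation>> &Mutations) {
  Mutations.push_back(llvm::make_unique<ExampleDAGMutation>());
}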
@@ -196,7 +204,6 @@ void HexagonSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst, updateLatency(*SrcInst, *DstInst, Dep); } - void HexagonSubtarget::HexagonDAGMutation::apply(ScheduleDAGInstrs *DAG) { for (auto &SU : DAG->SUnits) { if (!SU.isInstr()) @@ -240,18 +247,18 @@ void HexagonSubtarget::HexagonDAGMutation::apply(ScheduleDAGInstrs *DAG) { } } - void HexagonSubtarget::getPostRAMutations( - std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const { - Mutations.push_back(make_unique<HexagonSubtarget::HexagonDAGMutation>()); + std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const { + Mutations.push_back( + llvm::make_unique<HexagonSubtarget::HexagonDAGMutation>()); } void HexagonSubtarget::getSMSMutations( - std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const { - Mutations.push_back(make_unique<HexagonSubtarget::HexagonDAGMutation>()); + std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const { + Mutations.push_back( + llvm::make_unique<HexagonSubtarget::HexagonDAGMutation>()); } - // Pin the vtable to this file. void HexagonSubtarget::anchor() {} @@ -447,4 +454,3 @@ unsigned HexagonSubtarget::getL1PrefetchDistance() const { bool HexagonSubtarget::enableSubRegLiveness() const { return EnableSubregLiveness; } - diff --git a/lib/Target/Hexagon/HexagonSubtarget.h b/lib/Target/Hexagon/HexagonSubtarget.h index 4379efa79c9cd..753dca0000652 100644 --- a/lib/Target/Hexagon/HexagonSubtarget.h +++ b/lib/Target/Hexagon/HexagonSubtarget.h @@ -1,4 +1,4 @@ -//===-- HexagonSubtarget.h - Define Subtarget for the Hexagon ---*- C++ -*-===// +//===- HexagonSubtarget.h - Define Subtarget for the Hexagon ----*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -15,12 +15,17 @@ #define LLVM_LIB_TARGET_HEXAGON_HEXAGONSUBTARGET_H #include "HexagonFrameLowering.h" -#include "HexagonISelLowering.h" #include "HexagonInstrInfo.h" +#include "HexagonISelLowering.h" #include "HexagonSelectionDAGInfo.h" -#include "llvm/Target/TargetMachine.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/ScheduleDAGMutation.h" +#include "llvm/MC/MCInstrItineraries.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <memory> #include <string> +#include <vector> #define GET_SUBTARGETINFO_HEADER #include "HexagonGenSubtargetInfo.inc" @@ -30,6 +35,12 @@ namespace llvm { +class MachineInstr; +class SDep; +class SUnit; +class TargetMachine; +class Triple; + class HexagonSubtarget : public HexagonGenSubtargetInfo { virtual void anchor(); @@ -57,6 +68,7 @@ private: HexagonSelectionDAGInfo TSInfo; HexagonFrameLowering FrameLowering; InstrItineraryData InstrItins; + void initializeEnvironment(); public: @@ -108,6 +120,7 @@ public: bool useBSBScheduling() const { return UseBSBScheduling; } bool enableMachineScheduler() const override; + // Always use the TargetLowering default scheduler. // FIXME: This will use the vliw scheduler which is probably just hurting // compiler time and will be removed eventually anyway. 
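// The applyFixup/processFixupValue signature churn in the AsmBackend hunks
// below (Hexagon, Lanai, Mips, PPC, RISCV, Sparc, SystemZ, WebAssembly)
// follows one template: the raw (char *Data, unsigned DataSize) pair becomes
// a bounds-carrying MutableArrayRef<char>, the MCContext parameter is
// recovered from the assembler, and the resolved MCValue target is passed
// through. A condensed sketch of the post-change shape; the class is
// hypothetical, and a real backend derives NumBytes from getFixupKindInfo
// rather than hard-coding it:
#include "llvm/ADT/ArrayRef.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCValue.h"
#include <cassert>
#include <cstdint>

using namespace llvm;

struct HypotheticalAsmBackend {
  void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
                  const MCValue &Target, MutableArrayRef<char> Data,
                  uint64_t Value, bool IsPCRel) const {
    if (!Value)
      return; // Doesn't change encoding.
    MCContext &Ctx = Asm.getContext(); // replaces the old MCContext parameter
    (void)Ctx; (void)Target; (void)IsPCRel;
    unsigned Offset = Fixup.getOffset();
    unsigned NumBytes = 4; // assumed width for this sketch
    assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
    // Little-endian, byte-at-a-time OR of the fixed-up value.
    for (unsigned I = 0; I != NumBytes; ++I)
      Data[Offset + I] |= char(Value >> (I * 8));
  }
};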
@@ -124,6 +137,7 @@ public: unsigned getSmallDataThreshold() const { return Hexagon_SMALL_DATA_THRESHOLD; } + const HexagonArchEnum &getHexagonArchVersion() const { return HexagonArchVersion; } @@ -155,4 +169,4 @@ private: } // end namespace llvm -#endif +#endif // LLVM_LIB_TARGET_HEXAGON_HEXAGONSUBTARGET_H diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp index e507a797871fc..031a1bdefafbf 100644 --- a/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -291,7 +291,6 @@ bool HexagonPassConfig::addInstSelector() { if (EnableBitSimplify) addPass(createHexagonBitSimplify()); addPass(createHexagonPeephole()); - printAndVerify("After hexagon peephole pass"); // Constant propagation. if (!DisableHCP) { addPass(createHexagonConstPropagationPass()); diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp index 545c8b6b2acde..093ce80bc2e3f 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp @@ -202,10 +202,8 @@ public: /// processFixupValue - Target hook to adjust the literal value of a fixup /// if necessary. IsResolved signals whether the caller believes a relocation /// is needed; the target can modify the value. The default does nothing. - void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout, - const MCFixup &Fixup, const MCFragment *DF, - const MCValue &Target, uint64_t &Value, - bool &IsResolved) override { + void processFixupValue(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, bool &IsResolved) override { MCFixupKind Kind = Fixup.getKind(); switch((unsigned)Kind) { @@ -415,9 +413,9 @@ public: /// ApplyFixup - Apply the \arg Value for given \arg Fixup into the provided /// data fragment, at the offset specified by the fixup and following the /// fixup kind as appropriate. - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t FixupValue, bool IsPCRel, - MCContext &Ctx) const override { + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t FixupValue, bool IsPCRel) const override { // When FixupValue is 0 the relocation is external and there // is nothing for us to do. @@ -432,8 +430,8 @@ public: // to a real offset before we can use it. 
uint32_t Offset = Fixup.getOffset(); unsigned NumBytes = getFixupKindNumBytes(Kind); - assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); - char *InstAddr = Data + Offset; + assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); + char *InstAddr = Data.data() + Offset; Value = adjustFixupValue(Kind, FixupValue); if(!Value) @@ -517,7 +515,7 @@ public: dbgs() << "\tBValue=0x"; dbgs().write_hex(Value) << ": AValue=0x"; dbgs().write_hex(FixupValue) << ": Offset=" << Offset << - ": Size=" << DataSize << + ": Size=" << Data.size() << ": OInst=0x"; dbgs().write_hex(OldData) << ": Reloc=0x"; dbgs().write_hex(Reloc);); diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp index e8f154a1fa533..c7114c7f18a0a 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp @@ -701,33 +701,32 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) { break; case Hexagon::A2_addi: Absolute = Inst.getOperand(2).getExpr()->evaluateAsAbsolute(Value); - assert(Absolute);(void)Absolute; - if (Value == 1) { - Result.setOpcode(Hexagon::SA1_inc); - addOps(Result, Inst, 0); - addOps(Result, Inst, 1); - break; - } // 1,2 SUBInst $Rd = add($Rs, #1) - else if (Value == -1) { - Result.setOpcode(Hexagon::SA1_dec); - addOps(Result, Inst, 0); - addOps(Result, Inst, 1); - addOps(Result, Inst, 2); - break; - } // 1,2 SUBInst $Rd = add($Rs,#-1) - else if (Inst.getOperand(1).getReg() == Hexagon::R29) { - Result.setOpcode(Hexagon::SA1_addsp); - addOps(Result, Inst, 0); - addOps(Result, Inst, 2); - break; - } // 1,3 SUBInst $Rd = add(r29, #$u6_2) - else { - Result.setOpcode(Hexagon::SA1_addi); - addOps(Result, Inst, 0); - addOps(Result, Inst, 1); - addOps(Result, Inst, 2); - break; - } // 1,2,3 SUBInst $Rx = add($Rx, #$s7) + if (Absolute) { + if (Value == 1) { + Result.setOpcode(Hexagon::SA1_inc); + addOps(Result, Inst, 0); + addOps(Result, Inst, 1); + break; + } // 1,2 SUBInst $Rd = add($Rs, #1) + if (Value == -1) { + Result.setOpcode(Hexagon::SA1_dec); + addOps(Result, Inst, 0); + addOps(Result, Inst, 1); + addOps(Result, Inst, 2); + break; + } // 1,2 SUBInst $Rd = add($Rs,#-1) + if (Inst.getOperand(1).getReg() == Hexagon::R29) { + Result.setOpcode(Hexagon::SA1_addsp); + addOps(Result, Inst, 0); + addOps(Result, Inst, 2); + break; + } // 1,3 SUBInst $Rd = add(r29, #$u6_2) + } + Result.setOpcode(Hexagon::SA1_addi); + addOps(Result, Inst, 0); + addOps(Result, Inst, 1); + addOps(Result, Inst, 2); + break; // 1,2,3 SUBInst $Rx = add($Rx, #$s7) case Hexagon::A2_add: Result.setOpcode(Hexagon::SA1_addrx); addOps(Result, Inst, 0); diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp index 0ef1401ef531a..c212726113ab7 100644 --- a/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp +++ b/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp @@ -49,8 +49,9 @@ public: LanaiAsmBackend(const Target &T, Triple::OSType OST) : MCAsmBackend(), OSType(OST) {} - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel, MCContext &Ctx) const override; + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const override; MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override; @@ -88,9 +89,10 @@ bool LanaiAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) 
const { return true; } -void LanaiAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned /*DataSize*/, uint64_t Value, - bool /*IsPCRel*/, MCContext & /*Ctx*/) const { +void LanaiAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, + MutableArrayRef<char> Data, uint64_t Value, + bool /*IsPCRel*/) const { MCFixupKind Kind = Fixup.getKind(); Value = adjustFixupValue(static_cast<unsigned>(Kind), Value); diff --git a/lib/Target/MSP430/MSP430TargetMachine.cpp b/lib/Target/MSP430/MSP430TargetMachine.cpp index d8fdc8ba674e6..982c6fea62d44 100644 --- a/lib/Target/MSP430/MSP430TargetMachine.cpp +++ b/lib/Target/MSP430/MSP430TargetMachine.cpp @@ -32,16 +32,20 @@ static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) { return *RM; } +static std::string computeDataLayout(const Triple &TT, StringRef CPU, + const TargetOptions &Options) { + return "e-m:e-p:16:16-i32:16-i64:16-f32:16-f64:16-a:8-n8:16-S16"; +} + MSP430TargetMachine::MSP430TargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Optional<Reloc::Model> RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : LLVMTargetMachine(T, "e-m:e-p:16:16-i32:16:32-a:16-n8:16", TT, CPU, FS, + : LLVMTargetMachine(T, computeDataLayout(TT, CPU, Options), TT, CPU, FS, Options, getEffectiveRelocModel(RM), CM, OL), TLOF(make_unique<TargetLoweringObjectFileELF>()), - // FIXME: Check DataLayout string. Subtarget(TT, CPU, FS, *this) { initAsmInfo(); } diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 694c201cbe8dc..9d5c179a0fd90 100644 --- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -322,6 +322,7 @@ class MipsAsmParser : public MCTargetAsmParser { bool parseDirectiveSet(); bool parseDirectiveOption(); bool parseInsnDirective(); + bool parseRSectionDirective(StringRef Section); bool parseSSectionDirective(StringRef Section, unsigned Type); bool parseSetAtDirective(); @@ -5106,7 +5107,7 @@ int MipsAsmParser::matchCPURegisterName(StringRef Name) { CC = StringSwitch<unsigned>(Name) .Case("zero", 0) - .Case("at", 1) + .Cases("at", "AT", 1) .Case("a0", 4) .Case("a1", 5) .Case("a2", 6) @@ -6952,6 +6953,23 @@ bool MipsAsmParser::parseInsnDirective() { return false; } +/// parseRSectionDirective +/// ::= .rdata +bool MipsAsmParser::parseRSectionDirective(StringRef Section) { + // If this is not the end of the statement, report an error. + if (getLexer().isNot(AsmToken::EndOfStatement)) { + reportParseError("unexpected token, expected end of statement"); + return false; + } + + MCSection *ELFSection = getContext().getELFSection( + Section, ELF::SHT_PROGBITS, ELF::SHF_ALLOC); + getParser().getStreamer().SwitchSection(ELFSection); + + getParser().Lex(); // Eat EndOfStatement token. 
+ return false; +} + /// parseSSectionDirective /// ::= .sbss /// ::= .sdata @@ -7499,6 +7517,10 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) { parseInsnDirective(); return false; } + if (IDVal == ".rdata") { + parseRSectionDirective(".rodata"); + return false; + } if (IDVal == ".sbss") { parseSSectionDirective(IDVal, ELF::SHT_NOBITS); return false; diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp index 6d3d4db036032..ae48d6e38fa0f 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp @@ -235,10 +235,12 @@ static unsigned calculateMMLEIndex(unsigned i) { /// ApplyFixup - Apply the \p Value for given \p Fixup into the provided /// data fragment, at the offset specified by the fixup and following the /// fixup kind as appropriate. -void MipsAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, bool IsPCRel, - MCContext &Ctx) const { +void MipsAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, + MutableArrayRef<char> Data, uint64_t Value, + bool IsPCRel) const { MCFixupKind Kind = Fixup.getKind(); + MCContext &Ctx = Asm.getContext(); Value = adjustFixupValue(Fixup, Value, Ctx); if (!Value) diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h index 4b3cc6e21f4cd..bf3b290b7ed53 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h +++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h @@ -38,8 +38,9 @@ public: MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override; - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel, MCContext &Ctx) const override; + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const override; Optional<MCFixupKind> getFixupKind(StringRef Name) const override; const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; diff --git a/lib/Target/Mips/Mips32r6InstrInfo.td b/lib/Target/Mips/Mips32r6InstrInfo.td index 3272319ad50f4..7daea163b8a64 100644 --- a/lib/Target/Mips/Mips32r6InstrInfo.td +++ b/lib/Target/Mips/Mips32r6InstrInfo.td @@ -326,9 +326,9 @@ class AUIPC_DESC : ALUIPC_DESC_BASE<"auipc", GPR32Opnd, II_AUIPC>; class AUI_DESC_BASE<string instr_asm, RegisterOperand GPROpnd, InstrItinClass itin = NoItinerary> : MipsR6Arch<instr_asm> { - dag OutOperandList = (outs GPROpnd:$rs); - dag InOperandList = (ins GPROpnd:$rt, uimm16:$imm); - string AsmString = !strconcat(instr_asm, "\t$rs, $rt, $imm"); + dag OutOperandList = (outs GPROpnd:$rt); + dag InOperandList = (ins GPROpnd:$rs, uimm16:$imm); + string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $imm"); list<dag> Pattern = []; InstrItinClass Itinerary = itin; } diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index 68708dc4f50fe..02102d6b22f4e 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -907,6 +907,11 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG, if (!(CN1 = dyn_cast<ConstantSDNode>(N->getOperand(1)))) return SDValue(); } + // Don't generate INS if constant OR operand doesn't fit into bits + // cleared by constant AND operand. 
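// (A worked example of the check below, not from the patch: for
// (or (and $x, 0xffff00ff), 0x100) the OR constant sits entirely inside
// the byte cleared by the AND mask, 0xffff00ff & 0x100 == 0, and folding
// to INS is sound. For (or (and $x, 0xffff00ff), 0x10100), bit 16
// survives the AND, the constants overlap, and an INS of the cleared
// field would silently drop that bit, so the combine now gives up and
// returns SDValue() instead.)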
+ if (CN->getSExtValue() & CN1->getSExtValue()) + return SDValue(); + SDLoc DL(N); EVT ValTy = N->getOperand(0)->getValueType(0); SDValue Const1; diff --git a/lib/Target/Mips/MipsLongBranch.cpp b/lib/Target/Mips/MipsLongBranch.cpp index 272595af5f6f1..b95f1158fa562 100644 --- a/lib/Target/Mips/MipsLongBranch.cpp +++ b/lib/Target/Mips/MipsLongBranch.cpp @@ -274,8 +274,8 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) { if (IsPIC) { MachineBasicBlock *BalTgtMBB = MF->CreateMachineBasicBlock(BB); MF->insert(FallThroughMBB, BalTgtMBB); - LongBrMBB->addSuccessor(BalTgtMBB, BranchProbability::getOne()); - BalTgtMBB->addSuccessor(&*FallThroughMBB, BranchProbability::getOne()); + LongBrMBB->addSuccessor(BalTgtMBB); + BalTgtMBB->addSuccessor(TgtMBB); // We must select between the MIPS32r6/MIPS64r6 BAL (which is a normal // instruction) and the pre-MIPS32r6/MIPS64r6 definition (which is an @@ -342,8 +342,8 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) { .addReg(Mips::SP).addImm(8); if (Subtarget.hasMips32r6()) - BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JALR), Mips::ZERO) - .addReg(Mips::AT); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JALR)) + .addReg(Mips::ZERO).addReg(Mips::AT); else BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JR)).addReg(Mips::AT); @@ -415,8 +415,8 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) { .addReg(Mips::SP_64).addImm(0); if (Subtarget.hasMips64r6()) - BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JALR64), Mips::ZERO_64) - .addReg(Mips::AT_64); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JALR64)) + .addReg(Mips::ZERO_64).addReg(Mips::AT_64); else BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JR64)).addReg(Mips::AT_64); diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp index 2382ea2716612..b57bceb3c8371 100644 --- a/lib/Target/Mips/MipsSEISelLowering.cpp +++ b/lib/Target/Mips/MipsSEISelLowering.cpp @@ -1257,19 +1257,22 @@ static SDValue lowerMSACopyIntr(SDValue Op, SelectionDAG &DAG, unsigned Opc) { static SDValue lowerMSASplatZExt(SDValue Op, unsigned OpNr, SelectionDAG &DAG) { EVT ResVecTy = Op->getValueType(0); EVT ViaVecTy = ResVecTy; + bool BigEndian = !DAG.getSubtarget().getTargetTriple().isLittleEndian(); SDLoc DL(Op); // When ResVecTy == MVT::v2i64, LaneA is the upper 32 bits of the lane and // LaneB is the lower 32-bits. Otherwise LaneA and LaneB are alternating // lanes. 
- SDValue LaneA; - SDValue LaneB = Op->getOperand(2); + SDValue LaneA = Op->getOperand(OpNr); + SDValue LaneB; if (ResVecTy == MVT::v2i64) { - LaneA = DAG.getConstant(0, DL, MVT::i32); + LaneB = DAG.getConstant(0, DL, MVT::i32); ViaVecTy = MVT::v4i32; + if(BigEndian) + std::swap(LaneA, LaneB); } else - LaneA = LaneB; + LaneB = LaneA; SDValue Ops[16] = { LaneA, LaneB, LaneA, LaneB, LaneA, LaneB, LaneA, LaneB, LaneA, LaneB, LaneA, LaneB, LaneA, LaneB, LaneA, LaneB }; @@ -1277,8 +1280,11 @@ static SDValue lowerMSASplatZExt(SDValue Op, unsigned OpNr, SelectionDAG &DAG) { SDValue Result = DAG.getBuildVector( ViaVecTy, DL, makeArrayRef(Ops, ViaVecTy.getVectorNumElements())); - if (ViaVecTy != ResVecTy) - Result = DAG.getNode(ISD::BITCAST, DL, ResVecTy, Result); + if (ViaVecTy != ResVecTy) { + SDValue One = DAG.getConstant(1, DL, ViaVecTy); + Result = DAG.getNode(ISD::BITCAST, DL, ResVecTy, + DAG.getNode(ISD::AND, DL, ViaVecTy, Result, One)); + } return Result; } diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp index 028c2cb562f8e..6d7eb786a6835 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp @@ -113,8 +113,9 @@ public: return (IsLittleEndian? InfosLE : InfosBE)[Kind - FirstTargetFixupKind]; } - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel, MCContext &Ctx) const override { + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const override { Value = adjustFixupValue(Fixup.getKind(), Value); if (!Value) return; // Doesn't change encoding. @@ -130,10 +131,8 @@ public: } } - void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout, - const MCFixup &Fixup, const MCFragment *DF, - const MCValue &Target, uint64_t &Value, - bool &IsResolved) override { + void processFixupValue(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, bool &IsResolved) override { switch ((PPC::Fixups)Fixup.getKind()) { default: break; case PPC::fixup_ppc_br24: diff --git a/lib/Target/PowerPC/PPC.h b/lib/Target/PowerPC/PPC.h index 38ae62b26757a..07c9c1f9f84c0 100644 --- a/lib/Target/PowerPC/PPC.h +++ b/lib/Target/PowerPC/PPC.h @@ -24,7 +24,6 @@ namespace llvm { class PPCTargetMachine; class PassRegistry; class FunctionPass; - class ImmutablePass; class MachineInstr; class AsmPrinter; class MCInst; diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp index 57a1d373c88cf..c2c115cb6dafa 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -521,7 +521,7 @@ void PPCFrameLowering::replaceFPWithRealFP(MachineFunction &MF) const { const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo(); bool HasBP = RegInfo->hasBasePointer(MF); unsigned BPReg = HasBP ? (unsigned) RegInfo->getBaseRegister(MF) : FPReg; - unsigned BP8Reg = HasBP ? (unsigned) PPC::X30 : FPReg; + unsigned BP8Reg = HasBP ? 
(unsigned) PPC::X30 : FP8Reg; for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 662550f7a396a..72f14e9691382 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -2560,8 +2560,9 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, PPCII::MO_TPREL_HA); SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, PPCII::MO_TPREL_LO); - SDValue TLSReg = DAG.getRegister(is64bit ? PPC::X13 : PPC::R2, - is64bit ? MVT::i64 : MVT::i32); + SDValue TLSReg = is64bit ? DAG.getRegister(PPC::X13, MVT::i64) + : DAG.getRegister(PPC::R2, MVT::i32); + SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, TGAHi, TLSReg); return DAG.getNode(PPCISD::Lo, dl, PtrVT, TGALo, Hi); } @@ -8377,9 +8378,9 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, if (IntrinsicID == Intrinsic::thread_pointer) { // Reads the thread pointer register, used for __builtin_thread_pointer. - bool is64bit = Subtarget.isPPC64(); - return DAG.getRegister(is64bit ? PPC::X13 : PPC::R2, - is64bit ? MVT::i64 : MVT::i32); + if (Subtarget.isPPC64()) + return DAG.getRegister(PPC::X13, MVT::i64); + return DAG.getRegister(PPC::R2, MVT::i32); } // If this is a lowered altivec predicate compare, CompareOpc is set to the diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index 70536a6039b82..e2af5e5295445 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -972,13 +972,15 @@ def LDMX : XForm_1<31, 309, (outs g8rc:$rD), (ins memrr:$src), // Support for medium and large code model. let hasSideEffects = 0 in { +let isReMaterializable = 1 in { def ADDIStocHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp), "#ADDIStocHA", []>, isPPC64; +def ADDItocL: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp), + "#ADDItocL", []>, isPPC64; +} let mayLoad = 1 in def LDtocL: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc_nox0:$reg), "#LDtocL", []>, isPPC64; -def ADDItocL: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp), - "#ADDItocL", []>, isPPC64; } // Support for thread-local storage. @@ -994,7 +996,7 @@ def LDgotTprelL: Pseudo<(outs g8rc:$rD), (ins s16imm64:$disp, g8rc_nox0:$reg), (PPCldGotTprelL tglobaltlsaddr:$disp, i64:$reg))]>, isPPC64; -let isBarrier = 1, isPseudo = 1, Defs = [CR7], Itinerary = IIC_LdStSync in +let isPseudo = 1, Defs = [CR7], Itinerary = IIC_LdStSync in def CFENCE8 : Pseudo<(outs), (ins g8rc:$cr), "#CFENCE8", []>; def : Pat<(PPCaddTls i64:$in, tglobaltlsaddr:$g), diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index 236e513bec231..13b4f9ab962da 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -292,6 +292,29 @@ unsigned PPCInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, return 0; } +// For opcodes with the ReMaterializable flag set, this function is called to +// verify the instruction is really rematable. +bool PPCInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, + AliasAnalysis *AA) const { + switch (MI.getOpcode()) { + default: + // This function should only be called for opcodes with the ReMaterializable + // flag set. 
+ llvm_unreachable("Unknown rematerializable operation!"); + break; + case PPC::LI: + case PPC::LI8: + case PPC::LIS: + case PPC::LIS8: + case PPC::QVGPCI: + case PPC::ADDIStocHA: + case PPC::ADDItocL: + case PPC::LOAD_STACK_GUARD: + return true; + } + return false; +} + unsigned PPCInstrInfo::isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const { // Note: This list must be kept consistent with StoreRegToStackSlot. diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h index 8dd4dbb608794..b0629c88cf57b 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.h +++ b/lib/Target/PowerPC/PPCInstrInfo.h @@ -162,6 +162,8 @@ public: unsigned &SubIdx) const override; unsigned isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override; + bool isReallyTriviallyReMaterializable(const MachineInstr &MI, + AliasAnalysis *AA) const override; unsigned isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override; diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp index 637e52bbdbeec..8af7f7e981171 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -389,9 +389,14 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const { unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); if (MaxAlign < TargetAlign && isInt<16>(FrameSize)) { - BuildMI(MBB, II, dl, TII.get(PPC::ADDI), Reg) - .addReg(PPC::R31) - .addImm(FrameSize); + if (LP64) + BuildMI(MBB, II, dl, TII.get(PPC::ADDI8), Reg) + .addReg(PPC::X31) + .addImm(FrameSize); + else + BuildMI(MBB, II, dl, TII.get(PPC::ADDI), Reg) + .addReg(PPC::R31) + .addImm(FrameSize); } else if (LP64) { BuildMI(MBB, II, dl, TII.get(PPC::LD), Reg) .addImm(0) @@ -478,8 +483,10 @@ void PPCRegisterInfo::lowerDynamicAreaOffset( const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); unsigned maxCallFrameSize = MFI.getMaxCallFrameSize(); + bool is64Bit = TM.isPPC64(); DebugLoc dl = MI.getDebugLoc(); - BuildMI(MBB, II, dl, TII.get(PPC::LI), MI.getOperand(0).getReg()) + BuildMI(MBB, II, dl, TII.get(is64Bit ? PPC::LI8 : PPC::LI), + MI.getOperand(0).getReg()) .addImm(maxCallFrameSize); MBB.erase(II); } diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp index 5a226b23ff96f..a88a6541e8d00 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -86,9 +86,9 @@ EnableMachineCombinerPass("ppc-machine-combiner", extern "C" void LLVMInitializePowerPCTarget() { // Register the targets - RegisterTargetMachine<PPC32TargetMachine> A(getThePPC32Target()); - RegisterTargetMachine<PPC64TargetMachine> B(getThePPC64Target()); - RegisterTargetMachine<PPC64TargetMachine> C(getThePPC64LETarget()); + RegisterTargetMachine<PPCTargetMachine> A(getThePPC32Target()); + RegisterTargetMachine<PPCTargetMachine> B(getThePPC64Target()); + RegisterTargetMachine<PPCTargetMachine> C(getThePPC64LETarget()); PassRegistry &PR = *PassRegistry::getPassRegistry(); initializePPCBoolRetToIntPass(PR); @@ -177,32 +177,34 @@ static PPCTargetMachine::PPCABI computeTargetABI(const Triple &TT, assert(Options.MCOptions.getABIName().empty() && "Unknown target-abi option!"); - if (!TT.isMacOSX()) { - switch (TT.getArch()) { - case Triple::ppc64le: - return PPCTargetMachine::PPC_ABI_ELFv2; - case Triple::ppc64: - return PPCTargetMachine::PPC_ABI_ELFv1; - default: - // Fallthrough. 
- ; - } + if (TT.isMacOSX()) + return PPCTargetMachine::PPC_ABI_UNKNOWN; + + switch (TT.getArch()) { + case Triple::ppc64le: + return PPCTargetMachine::PPC_ABI_ELFv2; + case Triple::ppc64: + return PPCTargetMachine::PPC_ABI_ELFv1; + default: + return PPCTargetMachine::PPC_ABI_UNKNOWN; } - return PPCTargetMachine::PPC_ABI_UNKNOWN; } static Reloc::Model getEffectiveRelocModel(const Triple &TT, Optional<Reloc::Model> RM) { - if (!RM.hasValue()) { - if (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le) { - if (!TT.isOSBinFormatMachO() && !TT.isMacOSX()) - return Reloc::PIC_; - } - if (TT.isOSDarwin()) - return Reloc::DynamicNoPIC; - return Reloc::Static; - } - return *RM; + if (RM.hasValue()) + return *RM; + + // Darwin defaults to dynamic-no-pic. + if (TT.isOSDarwin()) + return Reloc::DynamicNoPIC; + + // Non-darwin 64-bit platforms are PIC by default. + if (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le) + return Reloc::PIC_; + + // 32-bit is static by default. + return Reloc::Static; } // The FeatureString here is a little subtle. We are modifying the feature @@ -224,26 +226,6 @@ PPCTargetMachine::PPCTargetMachine(const Target &T, const Triple &TT, PPCTargetMachine::~PPCTargetMachine() = default; -void PPC32TargetMachine::anchor() {} - -PPC32TargetMachine::PPC32TargetMachine(const Target &T, const Triple &TT, - StringRef CPU, StringRef FS, - const TargetOptions &Options, - Optional<Reloc::Model> RM, - CodeModel::Model CM, - CodeGenOpt::Level OL) - : PPCTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {} - -void PPC64TargetMachine::anchor() {} - -PPC64TargetMachine::PPC64TargetMachine(const Target &T, const Triple &TT, - StringRef CPU, StringRef FS, - const TargetOptions &Options, - Optional<Reloc::Model> RM, - CodeModel::Model CM, - CodeGenOpt::Level OL) - : PPCTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {} - const PPCSubtarget * PPCTargetMachine::getSubtargetImpl(const Function &F) const { Attribute CPUAttr = F.getFnAttribute("target-cpu"); @@ -406,7 +388,7 @@ void PPCPassConfig::addPreRegAlloc() { // FIXME: We probably don't need to run these for -fPIE. if (getPPCTargetMachine().isPositionIndependent()) { // FIXME: LiveVariables should not be necessary here! - // PPCTLSDYnamicCallPass uses LiveIntervals which previously dependet on + // PPCTLSDynamicCallPass uses LiveIntervals which previously depended on // LiveVariables. This (unnecessary) dependency has been removed now, // however a stage-2 clang build fails without LiveVariables computed here. addPass(&LiveVariablesID, false); diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h index b8f5a2083d808..5eb6ba785d1b8 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.h +++ b/lib/Target/PowerPC/PPCTargetMachine.h @@ -23,7 +23,7 @@ namespace llvm { /// Common code between 32-bit and 64-bit PowerPC targets. /// -class PPCTargetMachine : public LLVMTargetMachine { +class PPCTargetMachine final : public LLVMTargetMachine { public: enum PPCABI { PPC_ABI_UNKNOWN, PPC_ABI_ELFv1, PPC_ABI_ELFv2 }; private: @@ -60,29 +60,6 @@ public: return false; } }; - -/// PowerPC 32-bit target machine. -/// -class PPC32TargetMachine : public PPCTargetMachine { - virtual void anchor(); -public: - PPC32TargetMachine(const Target &T, const Triple &TT, StringRef CPU, - StringRef FS, const TargetOptions &Options, - Optional<Reloc::Model> RM, CodeModel::Model CM, - CodeGenOpt::Level OL); -}; - -/// PowerPC 64-bit target machine.
-/// -class PPC64TargetMachine : public PPCTargetMachine { - virtual void anchor(); -public: - PPC64TargetMachine(const Target &T, const Triple &TT, StringRef CPU, - StringRef FS, const TargetOptions &Options, - Optional<Reloc::Model> RM, CodeModel::Model CM, - CodeGenOpt::Level OL); -}; - } // end namespace llvm #endif diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index d9a71893afee7..f85c0cf111c43 100644 --- a/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -32,8 +32,9 @@ public: : MCAsmBackend(), OSABI(OSABI), Is64Bit(Is64Bit) {} ~RISCVAsmBackend() override {} - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel, MCContext &Ctx) const override; + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const override; MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override; @@ -69,9 +70,10 @@ bool RISCVAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { return true; } -void RISCVAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, - bool IsPCRel, MCContext &Ctx) const { +void RISCVAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, + MutableArrayRef<char> Data, uint64_t Value, + bool IsPCRel) const { return; } diff --git a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp index d1d1334163a26..c72b47b090857 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp +++ b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp @@ -203,10 +203,8 @@ namespace { return InfosBE[Kind - FirstTargetFixupKind]; } - void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout, - const MCFixup &Fixup, const MCFragment *DF, - const MCValue &Target, uint64_t &Value, - bool &IsResolved) override { + void processFixupValue(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, bool &IsResolved) override { switch ((Sparc::Fixups)Fixup.getKind()) { default: break; case Sparc::fixup_sparc_wplt30: @@ -273,9 +271,9 @@ namespace { ELFSparcAsmBackend(const Target &T, Triple::OSType OSType) : SparcAsmBackend(T), OSType(OSType) { } - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel, - MCContext &Ctx) const override { + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const override { Value = adjustFixupValue(Fixup.getKind(), Value); if (!Value) return; // Doesn't change encoding. 
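// The SparcTargetObjectFile change below follows a pattern several ELF
// targets have needed: unless Initialize() forwards
// TM.Options.UseInitArray through InitializeELF(), static constructors
// keep landing in legacy .ctors sections even when -fuse-init-array asks
// for .init_array. The idiom, with a hypothetical class name:
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/Target/TargetMachine.h"

struct HypotheticalELFTargetObjectFile : llvm::TargetLoweringObjectFileELF {
  void Initialize(llvm::MCContext &Ctx,
                  const llvm::TargetMachine &TM) override {
    TargetLoweringObjectFileELF::Initialize(Ctx, TM);
    InitializeELF(TM.Options.UseInitArray); // honor -fuse-init-array
  }
};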
diff --git a/lib/Target/Sparc/SparcTargetObjectFile.cpp b/lib/Target/Sparc/SparcTargetObjectFile.cpp index 627e49a95f3cc..2c040dce994b6 100644 --- a/lib/Target/Sparc/SparcTargetObjectFile.cpp +++ b/lib/Target/Sparc/SparcTargetObjectFile.cpp @@ -15,6 +15,12 @@ using namespace llvm; +void SparcELFTargetObjectFile::Initialize(MCContext &Ctx, + const TargetMachine &TM) { + TargetLoweringObjectFileELF::Initialize(Ctx, TM); + InitializeELF(TM.Options.UseInitArray); +} + const MCExpr *SparcELFTargetObjectFile::getTTypeGlobalReference( const GlobalValue *GV, unsigned Encoding, const TargetMachine &TM, MachineModuleInfo *MMI, MCStreamer &Streamer) const { diff --git a/lib/Target/Sparc/SparcTargetObjectFile.h b/lib/Target/Sparc/SparcTargetObjectFile.h index fe8800625a567..3b1b345c3b193 100644 --- a/lib/Target/Sparc/SparcTargetObjectFile.h +++ b/lib/Target/Sparc/SparcTargetObjectFile.h @@ -23,6 +23,8 @@ public: TargetLoweringObjectFileELF() {} + void Initialize(MCContext &Ctx, const TargetMachine &TM) override; + const MCExpr *getTTypeGlobalReference(const GlobalValue *GV, unsigned Encoding, const TargetMachine &TM, diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp index fd1fd7bc40dcc..6b32a7926437a 100644 --- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp +++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp @@ -50,8 +50,9 @@ public: return SystemZ::NumTargetFixupKinds; } const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel, MCContext &Ctx) const override; + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const override; bool mayNeedRelaxation(const MCInst &Inst) const override { return false; } @@ -89,15 +90,17 @@ SystemZMCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { return Infos[Kind - FirstTargetFixupKind]; } -void SystemZMCAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, - bool IsPCRel, MCContext &Ctx) const { +void SystemZMCAsmBackend::applyFixup(const MCAssembler &Asm, + const MCFixup &Fixup, + const MCValue &Target, + MutableArrayRef<char> Data, uint64_t Value, + bool IsPCRel) const { MCFixupKind Kind = Fixup.getKind(); unsigned Offset = Fixup.getOffset(); unsigned BitSize = getFixupKindInfo(Kind).TargetSize; unsigned Size = (BitSize + 7) / 8; - assert(Offset + Size <= DataSize && "Invalid fixup offset!"); + assert(Offset + Size <= Data.size() && "Invalid fixup offset!"); // Big-endian insertion of Size bytes. Value = extractBitsForFixup(Kind, Value); diff --git a/lib/Target/SystemZ/SystemZFrameLowering.cpp b/lib/Target/SystemZ/SystemZFrameLowering.cpp index a28a91e834f61..0cb2b5a14ce73 100644 --- a/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ b/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -277,8 +277,21 @@ void SystemZFrameLowering:: processFunctionBeforeFrameFinalized(MachineFunction &MF, RegScavenger *RS) const { MachineFrameInfo &MFFrame = MF.getFrameInfo(); - uint64_t MaxReach = (MFFrame.estimateStackSize(MF) + - SystemZMC::CallFrameSize * 2); + // Get the size of our stack frame to be allocated ... + uint64_t StackSize = (MFFrame.estimateStackSize(MF) + + SystemZMC::CallFrameSize); + // ... and the maximum offset we may need to reach into the + // caller's frame to access the save area or stack arguments. 
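// (Orientation for the computation below, with figures taken from the
// existing code rather than new analysis: the short SystemZ load/store
// forms used for spill code carry an unsigned 12-bit displacement, so once
// StackSize + MaxArgOffset exceeds 4095 bytes some frame objects fall out
// of direct reach and the function must reserve emergency scavenging
// slots, which is what the later !isUInt<12>(MaxReach) test decides.)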
+ int64_t MaxArgOffset = SystemZMC::CallFrameSize; + for (int I = MFFrame.getObjectIndexBegin(); I != 0; ++I) + if (MFFrame.getObjectOffset(I) >= 0) { + int64_t ArgOffset = SystemZMC::CallFrameSize + + MFFrame.getObjectOffset(I) + + MFFrame.getObjectSize(I); + MaxArgOffset = std::max(MaxArgOffset, ArgOffset); + } + + uint64_t MaxReach = StackSize + MaxArgOffset; if (!isUInt<12>(MaxReach)) { // We may need register scavenging slots if some parts of the frame // are outside the reach of an unsigned 12-bit displacement. diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp index ac4c3f6db684d..fef4a8c92a362 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -1322,11 +1322,6 @@ SystemZTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, return DAG.getNode(SystemZISD::RET_FLAG, DL, MVT::Other, RetOps); } -SDValue SystemZTargetLowering::prepareVolatileOrAtomicLoad( - SDValue Chain, const SDLoc &DL, SelectionDAG &DAG) const { - return DAG.getNode(SystemZISD::SERIALIZE, DL, MVT::Other, Chain); -} - // Return true if Op is an intrinsic node with chain that returns the CC value // as its only (other) argument. Provide the associated SystemZISD opcode and // the mask of valid CC values if so. @@ -2059,6 +2054,7 @@ static void adjustForTestUnderMask(SelectionDAG &DAG, const SDLoc &DL, if (NewC.ICmpType != SystemZICMP::SignedOnly && NewC.Op0.getOpcode() == ISD::SHL && isSimpleShift(NewC.Op0, ShiftVal) && + (MaskVal >> ShiftVal != 0) && (NewCCMask = getTestUnderMaskCond(BitSize, NewC.CCMask, MaskVal >> ShiftVal, CmpVal >> ShiftVal, @@ -2068,6 +2064,7 @@ static void adjustForTestUnderMask(SelectionDAG &DAG, const SDLoc &DL, } else if (NewC.ICmpType != SystemZICMP::SignedOnly && NewC.Op0.getOpcode() == ISD::SRL && isSimpleShift(NewC.Op0, ShiftVal) && + (MaskVal << ShiftVal != 0) && (NewCCMask = getTestUnderMaskCond(BitSize, NewC.CCMask, MaskVal << ShiftVal, CmpVal << ShiftVal, @@ -3212,12 +3209,15 @@ SDValue SystemZTargetLowering::lowerATOMIC_FENCE(SDValue Op, return DAG.getNode(SystemZISD::MEMBARRIER, DL, MVT::Other, Op.getOperand(0)); } -// Op is an atomic load. Lower it into a normal volatile load. +// Op is an atomic load. Lower it into a serialization followed +// by a normal volatile load. SDValue SystemZTargetLowering::lowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const { auto *Node = cast<AtomicSDNode>(Op.getNode()); + SDValue Chain = SDValue(DAG.getMachineNode(SystemZ::Serialize, SDLoc(Op), + MVT::Other, Node->getChain()), 0); return DAG.getExtLoad(ISD::EXTLOAD, SDLoc(Op), Op.getValueType(), - Node->getChain(), Node->getBasePtr(), + Chain, Node->getBasePtr(), Node->getMemoryVT(), Node->getMemOperand()); } @@ -4688,7 +4688,6 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const { OPCODE(STRCMP); OPCODE(SEARCH_STRING); OPCODE(IPM); - OPCODE(SERIALIZE); OPCODE(MEMBARRIER); OPCODE(TBEGIN); OPCODE(TBEGIN_NOFLOAT); diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h index 79c8c4d92669f..5dcb19c0a35db 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.h +++ b/lib/Target/SystemZ/SystemZISelLowering.h @@ -139,9 +139,6 @@ enum NodeType : unsigned { // Store the CC value in bits 29 and 28 of an integer. IPM, - // Perform a serialization operation. (BCR 15,0 or BCR 14,0.) - SERIALIZE, - // Compiler barrier only; generate a no-op. 
MEMBARRIER, @@ -471,8 +468,6 @@ public: const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override; - SDValue prepareVolatileOrAtomicLoad(SDValue Chain, const SDLoc &DL, - SelectionDAG &DAG) const override; SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; ISD::NodeType getExtendForAtomicOps() const override { @@ -522,7 +517,6 @@ private: unsigned Opcode) const; SDValue lowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const; SDValue lowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const; - SDValue lowerLOAD_SEQUENCE_POINT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td index fa5ecdd852433..9f5e6288348e0 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/lib/Target/SystemZ/SystemZInstrInfo.td @@ -189,18 +189,15 @@ let isBranch = 1, isTerminator = 1 in { //===----------------------------------------------------------------------===// // Unconditional trap. -// FIXME: This trap instruction should be marked as isTerminator, but there is -// currently a general bug that allows non-terminators to be placed between -// terminators. Temporarily leave this unmarked until the bug is fixed. -let isBarrier = 1, hasCtrlDep = 1 in +let hasCtrlDep = 1 in def Trap : Alias<4, (outs), (ins), [(trap)]>; // Conditional trap. -let isTerminator = 1, hasCtrlDep = 1, Uses = [CC] in +let hasCtrlDep = 1, Uses = [CC] in def CondTrap : Alias<4, (outs), (ins cond4:$valid, cond4:$R1), []>; // Fused compare-and-trap instructions. -let isTerminator = 1, hasCtrlDep = 1 in { +let hasCtrlDep = 1 in { // These patterns work the same way as for compare-and-branch. defm CRT : CmpBranchRRFcPair<"crt", 0xB972, GR32>; defm CGRT : CmpBranchRRFcPair<"cgrt", 0xB960, GR64>; @@ -1449,7 +1446,7 @@ let Predicates = [FeatureExecutionHint] in { // A serialization instruction that acts as a barrier for all memory // accesses, which expands to "bcr 14, 0". let hasSideEffects = 1 in -def Serialize : Alias<2, (outs), (ins), [(z_serialize)]>; +def Serialize : Alias<2, (outs), (ins), []>; // A pseudo instruction that serves as a compiler barrier. let hasSideEffects = 1, hasNoSchedulingInfo = 1 in diff --git a/lib/Target/SystemZ/SystemZMachineScheduler.cpp b/lib/Target/SystemZ/SystemZMachineScheduler.cpp index ab6020f3f1896..b6feaa49d8585 100644 --- a/lib/Target/SystemZ/SystemZMachineScheduler.cpp +++ b/lib/Target/SystemZ/SystemZMachineScheduler.cpp @@ -23,7 +23,7 @@ using namespace llvm; #ifndef NDEBUG // Print the set of SUs void SystemZPostRASchedStrategy::SUSet:: -dump(SystemZHazardRecognizer &HazardRec) { +dump(SystemZHazardRecognizer &HazardRec) const { dbgs() << "{"; for (auto &SU : *this) { HazardRec.dumpSU(SU, dbgs()); diff --git a/lib/Target/SystemZ/SystemZMachineScheduler.h b/lib/Target/SystemZ/SystemZMachineScheduler.h index 12357e0348a9e..3dfef388691e7 100644 --- a/lib/Target/SystemZ/SystemZMachineScheduler.h +++ b/lib/Target/SystemZ/SystemZMachineScheduler.h @@ -72,7 +72,7 @@ class SystemZPostRASchedStrategy : public MachineSchedStrategy { // A set of SUs with a sorter and dump method. 
struct SUSet : std::set<SUnit*, SUSorter> { #ifndef NDEBUG - void dump(SystemZHazardRecognizer &HazardRec); + void dump(SystemZHazardRecognizer &HazardRec) const; #endif }; diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td index adfc69c5d4cf4..ab2392809f3be 100644 --- a/lib/Target/SystemZ/SystemZOperators.td +++ b/lib/Target/SystemZ/SystemZOperators.td @@ -191,8 +191,6 @@ def z_sdivrem64 : SDNode<"SystemZISD::SDIVREM64", SDT_ZGR128Binary64>; def z_udivrem32 : SDNode<"SystemZISD::UDIVREM32", SDT_ZGR128Binary32>; def z_udivrem64 : SDNode<"SystemZISD::UDIVREM64", SDT_ZGR128Binary64>; -def z_serialize : SDNode<"SystemZISD::SERIALIZE", SDTNone, - [SDNPHasChain, SDNPMayStore]>; def z_membarrier : SDNode<"SystemZISD::MEMBARRIER", SDTNone, [SDNPHasChain, SDNPSideEffect]>; diff --git a/lib/Target/SystemZ/SystemZTargetMachine.h b/lib/Target/SystemZ/SystemZTargetMachine.h index eb2f17a2091c3..a10ca64fa6329 100644 --- a/lib/Target/SystemZ/SystemZTargetMachine.h +++ b/lib/Target/SystemZ/SystemZTargetMachine.h @@ -51,8 +51,6 @@ public: } bool targetSchedulesPostRAScheduling() const override { return true; }; - - bool isMachineVerifierClean() const override { return false; } }; } // end namespace llvm diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp index 4f20096c15830..1357cb5735f8a 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp @@ -37,8 +37,9 @@ public: : MCAsmBackend(), Is64Bit(Is64Bit) {} ~WebAssemblyAsmBackendELF() override {} - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel, MCContext &Ctx) const override; + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const override; MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override; @@ -77,8 +78,9 @@ public: const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel, MCContext &Ctx) const override; + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const override; MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override; @@ -105,9 +107,11 @@ bool WebAssemblyAsmBackendELF::writeNopData(uint64_t Count, return true; } -void WebAssemblyAsmBackendELF::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, - bool IsPCRel, MCContext &Ctx) const { +void WebAssemblyAsmBackendELF::applyFixup(const MCAssembler &Asm, + const MCFixup &Fixup, + const MCValue &Target, + MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const { const MCFixupKindInfo &Info = getFixupKindInfo(Fixup.getKind()); assert(Info.Flags == 0 && "WebAssembly does not use MCFixupKindInfo flags"); @@ -119,7 +123,7 @@ void WebAssemblyAsmBackendELF::applyFixup(const MCFixup &Fixup, char *Data, Value <<= Info.TargetOffset; unsigned Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); + assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the // bits from the fixup value. 
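// The WebAssembly hunks below move OPERAND_GLOBAL immediates off the
// eagerly-encoded ULEB128-plus-fixup path and onto the symbolic-operand
// path, where the field is emitted as a fixed five-byte ULEB128
// (PaddedSize = 5) that a later fixup or relocation can overwrite in
// place. A standalone sketch of that padding trick for 32-bit values:
#include <cstdint>
#include <vector>

static std::vector<uint8_t> encodePaddedULEB128(uint32_t Value) {
  std::vector<uint8_t> Bytes;
  for (unsigned I = 0; I != 4; ++I) { // four bytes with continuation bit set
    Bytes.push_back(uint8_t(Value & 0x7f) | 0x80);
    Value >>= 7;
  }
  Bytes.push_back(uint8_t(Value & 0x7f)); // fifth byte, continuation clear
  return Bytes;
}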
@@ -163,9 +167,11 @@ bool WebAssemblyAsmBackend::writeNopData(uint64_t Count, return true; } -void WebAssemblyAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, - bool IsPCRel, MCContext &Ctx) const { +void WebAssemblyAsmBackend::applyFixup(const MCAssembler &Asm, + const MCFixup &Fixup, + const MCValue &Target, + MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const { const MCFixupKindInfo &Info = getFixupKindInfo(Fixup.getKind()); assert(Info.Flags == 0 && "WebAssembly does not use MCFixupKindInfo flags"); @@ -177,7 +183,7 @@ void WebAssemblyAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, Value <<= Info.TargetOffset; unsigned Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); + assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the // bits from the fixup value. diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp index c56c591def361..3e3b52fca5691 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp @@ -36,7 +36,6 @@ STATISTIC(MCNumFixups, "Number of MC fixups created."); namespace { class WebAssemblyMCCodeEmitter final : public MCCodeEmitter { const MCInstrInfo &MCII; - MCContext &Ctx; // Implementation generated by tablegen. uint64_t getBinaryCodeForInstr(const MCInst &MI, @@ -48,14 +47,12 @@ class WebAssemblyMCCodeEmitter final : public MCCodeEmitter { const MCSubtargetInfo &STI) const override; public: - WebAssemblyMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx) - : MCII(mcii), Ctx(ctx) {} + WebAssemblyMCCodeEmitter(const MCInstrInfo &mcii) : MCII(mcii) {} }; } // end anonymous namespace -MCCodeEmitter *llvm::createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII, - MCContext &Ctx) { - return new WebAssemblyMCCodeEmitter(MCII, Ctx); +MCCodeEmitter *llvm::createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII) { + return new WebAssemblyMCCodeEmitter(MCII); } void WebAssemblyMCCodeEmitter::encodeInstruction( @@ -89,11 +86,7 @@ void WebAssemblyMCCodeEmitter::encodeInstruction( } else if (Info.OperandType == WebAssembly::OPERAND_I64IMM) { encodeSLEB128(int64_t(MO.getImm()), OS); } else if (Info.OperandType == WebAssembly::OPERAND_GLOBAL) { - Fixups.push_back(MCFixup::create( - OS.tell() - Start, MCConstantExpr::create(MO.getImm(), Ctx), - MCFixupKind(WebAssembly::fixup_code_global_index), MI.getLoc())); - ++MCNumFixups; - encodeULEB128(uint64_t(MO.getImm()), OS); + llvm_unreachable("wasm globals should only be accessed symbolically"); } else if (Info.OperandType == WebAssembly::OPERAND_SIGNATURE) { encodeSLEB128(int64_t(MO.getImm()), OS); } else { @@ -135,6 +128,9 @@ void WebAssemblyMCCodeEmitter::encodeInstruction( Info.OperandType == WebAssembly::OPERAND_TYPEINDEX) { FixupKind = MCFixupKind(WebAssembly::fixup_code_uleb128_i32); PaddedSize = 5; + } else if (Info.OperandType == WebAssembly::OPERAND_GLOBAL) { + FixupKind = MCFixupKind(WebAssembly::fixup_code_global_index); + PaddedSize = 5; } else { llvm_unreachable("unexpected symbolic operand kind"); } diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp index 9fd3ec81c258f..9580eeaa33d73 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp +++
b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp @@ -74,7 +74,7 @@ static MCInstPrinter *createMCInstPrinter(const Triple & /*T*/, static MCCodeEmitter *createCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo & /*MRI*/, MCContext &Ctx) { - return createWebAssemblyMCCodeEmitter(MCII, Ctx); + return createWebAssemblyMCCodeEmitter(MCII); } static MCAsmBackend *createAsmBackend(const Target & /*T*/, diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h index 0ba700a86b744..4d676c32a09c5 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h @@ -35,8 +35,7 @@ class raw_pwrite_stream; Target &getTheWebAssemblyTarget32(); Target &getTheWebAssemblyTarget64(); -MCCodeEmitter *createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII, - MCContext &Ctx); +MCCodeEmitter *createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII); MCAsmBackend *createWebAssemblyAsmBackend(const Triple &TT); diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp index 19e14f3261aa7..9cf77829f3bc2 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp @@ -68,6 +68,8 @@ WebAssemblyWasmObjectWriter::getRelocType(const MCValue &Target, bool IsFunction = IsFunctionExpr(Fixup.getValue()); switch (unsigned(Fixup.getKind())) { + case WebAssembly::fixup_code_global_index: + return wasm::R_WEBASSEMBLY_GLOBAL_INDEX_LEB; case WebAssembly::fixup_code_sleb128_i32: if (IsFunction) return wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB; diff --git a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp index b999091e2d294..f51585a10ca12 100644 --- a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp +++ b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp @@ -96,13 +96,6 @@ void WebAssemblyAsmPrinter::EmitEndOfAsmFile(Module &M) { MCConstantExpr::create(Size, OutContext)); } } - - if (!TM.getTargetTriple().isOSBinFormatELF()) { - MachineModuleInfoWasm &MMIW = MMI->getObjFileInfo<MachineModuleInfoWasm>(); - getTargetStreamer()->emitGlobal(MMIW.getGlobals()); - if (MMIW.hasStackPointerGlobal()) - getTargetStreamer()->emitStackPointer(MMIW.getStackPointerGlobal()); - } } void WebAssemblyAsmPrinter::EmitConstantPool() { diff --git a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp index 09338a4898e03..c980f4b87f916 100644 --- a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp +++ b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp @@ -63,12 +63,16 @@ class WebAssemblyFastISel final : public FastISel { public: // Innocuous defaults for our address. 
Address() : Kind(RegBase), Offset(0), GV(0) { Base.Reg = 0; } - void setKind(BaseKind K) { Kind = K; } + void setKind(BaseKind K) { + assert(!isSet() && "Can't change kind with non-zero base"); + Kind = K; + } BaseKind getKind() const { return Kind; } bool isRegBase() const { return Kind == RegBase; } bool isFIBase() const { return Kind == FrameIndexBase; } void setReg(unsigned Reg) { assert(isRegBase() && "Invalid base register access!"); + assert(Base.Reg == 0 && "Overwriting non-zero register"); Base.Reg = Reg; } unsigned getReg() const { @@ -77,6 +81,7 @@ class WebAssemblyFastISel final : public FastISel { } void setFI(unsigned FI) { assert(isFIBase() && "Invalid base frame index access!"); + assert(Base.FI == 0 && "Overwriting non-zero frame index"); Base.FI = FI; } unsigned getFI() const { @@ -91,6 +96,13 @@ class WebAssemblyFastISel final : public FastISel { int64_t getOffset() const { return Offset; } void setGlobalValue(const GlobalValue *G) { GV = G; } const GlobalValue *getGlobalValue() const { return GV; } + bool isSet() const { + if (isRegBase()) { + return Base.Reg != 0; + } else { + return Base.FI != 0; + } + } }; /// Keep a pointer to the WebAssemblySubtarget around so that we can make the @@ -297,6 +309,9 @@ bool WebAssemblyFastISel::computeAddress(const Value *Obj, Address &Addr) { DenseMap<const AllocaInst *, int>::iterator SI = FuncInfo.StaticAllocaMap.find(AI); if (SI != FuncInfo.StaticAllocaMap.end()) { + if (Addr.isSet()) { + return false; + } Addr.setKind(Address::FrameIndexBase); Addr.setFI(SI->second); return true; @@ -341,6 +356,9 @@ bool WebAssemblyFastISel::computeAddress(const Value *Obj, Address &Addr) { break; } } + if (Addr.isSet()) { + return false; + } Addr.setReg(getRegForValue(Obj)); return Addr.getReg() != 0; } diff --git a/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp b/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp index 4209bc333f230..a37d6136e44ed 100644 --- a/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp +++ b/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp @@ -104,10 +104,10 @@ static void writeSPToMemory(unsigned SrcReg, MachineFunction &MF, const DebugLoc &DL) { const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo(); + const char *ES = "__stack_pointer"; + auto *SPSymbol = MF.createExternalSymbolName(ES); if (MF.getSubtarget<WebAssemblySubtarget>() .getTargetTriple().isOSBinFormatELF()) { - const char *ES = "__stack_pointer"; - auto *SPSymbol = MF.createExternalSymbolName(ES); MachineRegisterInfo &MRI = MF.getRegInfo(); const TargetRegisterClass *PtrRC = MRI.getTargetRegisterInfo()->getPointerRegClass(MF); @@ -125,10 +125,8 @@ static void writeSPToMemory(unsigned SrcReg, MachineFunction &MF, .addReg(SrcReg) .addMemOperand(MMO); } else { - MachineModuleInfoWasm &MMIW = - MF.getMMI().getObjFileInfo<MachineModuleInfoWasm>(); BuildMI(MBB, InsertStore, DL, TII->get(WebAssembly::SET_GLOBAL_I32)) - .addImm(MMIW.getStackPointerGlobal()) + .addExternalSymbol(SPSymbol) .addReg(SrcReg); } } @@ -171,10 +169,11 @@ void WebAssemblyFrameLowering::emitPrologue(MachineFunction &MF, unsigned SPReg = WebAssembly::SP32; if (StackSize) SPReg = MRI.createVirtualRegister(PtrRC); + + const char *ES = "__stack_pointer"; + auto *SPSymbol = MF.createExternalSymbolName(ES); if (MF.getSubtarget<WebAssemblySubtarget>() .getTargetTriple().isOSBinFormatELF()) { - const char *ES = "__stack_pointer"; - auto *SPSymbol = MF.createExternalSymbolName(ES); unsigned Zero = MRI.createVirtualRegister(PtrRC); BuildMI(MBB, InsertPt, DL, 
TII->get(WebAssembly::CONST_I32), Zero) @@ -189,22 +188,8 @@ void WebAssemblyFrameLowering::emitPrologue(MachineFunction &MF, .addReg(Zero) // addr .addMemOperand(LoadMMO); } else { - auto &MMIW = MF.getMMI().getObjFileInfo<MachineModuleInfoWasm>(); - if (!MMIW.hasStackPointerGlobal()) { - MMIW.setStackPointerGlobal(MMIW.getGlobals().size()); - - // Create the stack-pointer global. For now, just use the - // Emscripten/Binaryen ABI names. - wasm::Global G; - G.Type = wasm::ValType::I32; - G.Mutable = true; - G.InitialValue = 0; - G.InitialModule = "env"; - G.InitialName = "STACKTOP"; - MMIW.addGlobal(G); - } BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::GET_GLOBAL_I32), SPReg) - .addImm(MMIW.getStackPointerGlobal()); + .addExternalSymbol(SPSymbol); } bool HasBP = hasBP(MF); diff --git a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp index 6650191807dcb..ea9e3fa862ce2 100644 --- a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp +++ b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp @@ -170,28 +170,16 @@ static void Query(const MachineInstr &MI, AliasAnalysis &AA, bool &Read, if (MI.mayStore()) { Write = true; - const MachineFunction &MF = *MI.getParent()->getParent(); - if (MF.getSubtarget<WebAssemblySubtarget>() - .getTargetTriple().isOSBinFormatELF()) { - // Check for stores to __stack_pointer. - for (auto MMO : MI.memoperands()) { - const MachinePointerInfo &MPI = MMO->getPointerInfo(); - if (MPI.V.is<const PseudoSourceValue *>()) { - auto PSV = MPI.V.get<const PseudoSourceValue *>(); - if (const ExternalSymbolPseudoSourceValue *EPSV = - dyn_cast<ExternalSymbolPseudoSourceValue>(PSV)) - if (StringRef(EPSV->getSymbol()) == "__stack_pointer") - StackPointer = true; - } - } - } else { - // Check for sets of the stack pointer. - const MachineModuleInfoWasm &MMIW = - MF.getMMI().getObjFileInfo<MachineModuleInfoWasm>(); - if ((MI.getOpcode() == WebAssembly::SET_LOCAL_I32 || - MI.getOpcode() == WebAssembly::SET_LOCAL_I64) && - MI.getOperand(0).getImm() == MMIW.getStackPointerGlobal()) { - StackPointer = true; + // Check for stores to __stack_pointer. 
+    for (auto MMO : MI.memoperands()) { + const MachinePointerInfo &MPI = MMO->getPointerInfo(); + if (MPI.V.is<const PseudoSourceValue *>()) { + auto PSV = MPI.V.get<const PseudoSourceValue *>(); + if (const ExternalSymbolPseudoSourceValue *EPSV = + dyn_cast<ExternalSymbolPseudoSourceValue>(PSV)) + if (StringRef(EPSV->getSymbol()) == "__stack_pointer") { + StackPointer = true; + } } } } else if (MI.hasOrderedMemoryRef()) { diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index e5d3209ec6a97..d30cc724c203f 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -1705,8 +1705,7 @@ bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp, if (isParsingInlineAsm() && Tok.is(AsmToken::Identifier)) { SMLoc Loc = SMLoc::getFromPointer(DotDispStr.data()); unsigned Len = DotDispStr.size(); - unsigned Val = OrigDispVal + DotDispVal; - InstInfo->AsmRewrites->emplace_back(AOK_DotOperator, Loc, Len, Val); + InstInfo->AsmRewrites->emplace_back(AOK_DotOperator, Loc, Len, DotDispVal); } NewDisp = MCConstantExpr::create(OrigDispVal + DotDispVal, getContext()); diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 7a9e4f4468ec7..914fb36f91a7d 100644 --- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -108,12 +108,12 @@ public: return Infos[Kind - FirstTargetFixupKind]; } - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel, MCContext &Ctx) const override { + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const override { unsigned Size = 1 << getFixupKindLog2Size(Fixup.getKind()); - assert(Fixup.getOffset() + Size <= DataSize && - "Invalid fixup offset!"); + assert(Fixup.getOffset() + Size <= Data.size() && "Invalid fixup offset!"); // Check that upper bits are either all zeros or all ones. // Specifically ignore overflow/underflow as long as the leakage is diff --git a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp index 4097ef224d503..caf98bffb80de 100644 --- a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp @@ -153,8 +153,7 @@ void X86MachObjectWriter::RecordX86_64Relocation( const MCSymbol *B_Base = Asm.getAtom(*B); // Neither symbol can be modified.
- if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None || - Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) { + if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None) { Asm.getContext().reportError(Fixup.getLoc(), "unsupported relocation of modified symbol"); return; diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp index 105580c913a16..5892f1de33eec 100644 --- a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp @@ -10,6 +10,7 @@ #include "MCTargetDesc/X86FixupKinds.h" #include "MCTargetDesc/X86MCTargetDesc.h" #include "llvm/BinaryFormat/COFF.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCFixup.h" #include "llvm/MC/MCValue.h" @@ -25,8 +26,8 @@ public: X86WinCOFFObjectWriter(bool Is64Bit); ~X86WinCOFFObjectWriter() override = default; - unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup, - bool IsCrossSection, + unsigned getRelocType(MCContext &Ctx, const MCValue &Target, + const MCFixup &Fixup, bool IsCrossSection, const MCAsmBackend &MAB) const override; }; @@ -36,11 +37,19 @@ X86WinCOFFObjectWriter::X86WinCOFFObjectWriter(bool Is64Bit) : MCWinCOFFObjectTargetWriter(Is64Bit ? COFF::IMAGE_FILE_MACHINE_AMD64 : COFF::IMAGE_FILE_MACHINE_I386) {} -unsigned X86WinCOFFObjectWriter::getRelocType(const MCValue &Target, +unsigned X86WinCOFFObjectWriter::getRelocType(MCContext &Ctx, + const MCValue &Target, const MCFixup &Fixup, bool IsCrossSection, const MCAsmBackend &MAB) const { - unsigned FixupKind = IsCrossSection ? FK_PCRel_4 : Fixup.getKind(); + unsigned FixupKind = Fixup.getKind(); + if (IsCrossSection) { + if (FixupKind != FK_Data_4) { + Ctx.reportError(Fixup.getLoc(), "Cannot represent this expression"); + return COFF::IMAGE_REL_AMD64_ADDR32; + } + FixupKind = FK_PCRel_4; + } MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ? MCSymbolRefExpr::VK_None : Target.getSymA()->getKind(); diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 2777fa89330f6..e3aa227702bea 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -748,17 +748,7 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, else CallOp = X86::CALLpcrel32; - const char *Symbol; - if (Is64Bit) { - if (STI.isTargetCygMing()) { - Symbol = "___chkstk_ms"; - } else { - Symbol = "__chkstk"; - } - } else if (STI.isTargetCygMing()) - Symbol = "_alloca"; - else - Symbol = "_chkstk"; + StringRef Symbol = STI.getTargetLowering()->getStackProbeSymbolName(MF); MachineInstrBuilder CI; MachineBasicBlock::iterator ExpansionMBBI = std::prev(MBBI); @@ -769,10 +759,11 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, // For the large code model, we have to call through a register. Use R11, // as it is scratch in all supported calling conventions. BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::R11) - .addExternalSymbol(Symbol); + .addExternalSymbol(MF.createExternalSymbolName(Symbol)); CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addReg(X86::R11); } else { - CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addExternalSymbol(Symbol); + CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)) + .addExternalSymbol(MF.createExternalSymbolName(Symbol)); } unsigned AX = Is64Bit ? 
X86::RAX : X86::EAX; @@ -783,13 +774,16 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, .addReg(SP, RegState::Define | RegState::Implicit) .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); - if (Is64Bit) { + if (STI.isTargetWin64() || !STI.isOSWindows()) { + // MSVC x32's _chkstk and cygwin/mingw's _alloca adjust %esp themselves. // MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp - themselves. It also does not clobber %rax so we can reuse it when + themselves. They also do not clobber %rax so we can reuse it when adjusting %rsp. - BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), X86::RSP) - .addReg(X86::RSP) - .addReg(X86::RAX); + // All other platforms do not specify a particular ABI for the stack probe + // function, so we arbitrarily define it to not adjust %esp/%rsp itself. + BuildMI(MBB, MBBI, DL, TII.get(getSUBrrOpcode(Is64Bit)), SP) + .addReg(SP) + .addReg(AX); } if (InProlog) { @@ -978,7 +972,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, X86FI->setCalleeSavedFrameSize( X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta); - bool UseStackProbe = (STI.isOSWindows() && !STI.isTargetMachO()); + bool UseRedZone = false; + bool UseStackProbe = !STI.getTargetLowering()->getStackProbeSymbolName(MF).empty(); // The default stack probe size is 4096 if the function has no stackprobesize // attribute. @@ -1007,6 +1002,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, !TRI->needsStackRealignment(MF) && !MFI.hasVarSizedObjects() && // No dynamic alloca. !MFI.adjustsStack() && // No calls. + !UseStackProbe && // No stack probes. !IsWin64CC && // Win64 has no Red Zone !MFI.hasCopyImplyingStackAdjustment() && // Don't push and pop. !MF.shouldSplitStack()) { // Regular stack @@ -1015,6 +1011,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, X86FI->setUsesRedZone(MinSize > 0 || StackSize > 0); StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0); MFI.setStackSize(StackSize); + UseRedZone = true; } // Insert stack pointer adjustment for later moving of return addr. Only @@ -1192,6 +1189,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, if (IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF)) AlignedNumBytes = alignTo(AlignedNumBytes, MaxAlign); if (AlignedNumBytes >= StackProbeSize && UseStackProbe) { + assert(!UseRedZone && "The Red Zone is not accounted for in stack probes"); + // Check whether EAX is livein for this block.
bool isEAXAlive = isEAXLiveIn(MBB); diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 2a1633de0a239..3c4589ab18f6f 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -204,6 +204,11 @@ namespace { bool selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); + template <class GatherScatterSDNode> + bool selectAddrOfGatherScatterNode(GatherScatterSDNode *Parent, SDValue N, + SDValue &Base, SDValue &Scale, + SDValue &Index, SDValue &Disp, + SDValue &Segment); bool selectMOV64Imm32(SDValue N, SDValue &Imm); bool selectLEAAddr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, @@ -1415,13 +1420,10 @@ bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) { return false; } -bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, - SDValue &Scale, SDValue &Index, - SDValue &Disp, SDValue &Segment) { - - MaskedGatherScatterSDNode *Mgs = dyn_cast<MaskedGatherScatterSDNode>(Parent); - if (!Mgs) - return false; +template <class GatherScatterSDNode> +bool X86DAGToDAGISel::selectAddrOfGatherScatterNode( + GatherScatterSDNode *Mgs, SDValue N, SDValue &Base, SDValue &Scale, + SDValue &Index, SDValue &Disp, SDValue &Segment) { X86ISelAddressMode AM; unsigned AddrSpace = Mgs->getPointerInfo().getAddrSpace(); // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS. @@ -1453,6 +1455,18 @@ bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, return true; } +bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, + SDValue &Scale, SDValue &Index, + SDValue &Disp, SDValue &Segment) { + if (auto Mgs = dyn_cast<MaskedGatherScatterSDNode>(Parent)) + return selectAddrOfGatherScatterNode<MaskedGatherScatterSDNode>( + Mgs, N, Base, Scale, Index, Disp, Segment); + if (auto X86Gather = dyn_cast<X86MaskedGatherSDNode>(Parent)) + return selectAddrOfGatherScatterNode<X86MaskedGatherSDNode>( + X86Gather, N, Base, Scale, Index, Disp, Segment); + return false; +} + /// Returns true if it is able to pattern match an addressing mode. /// It returns the operands which make up the maximal addressing mode it can /// match by reference. diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 172eba0002d4f..f777e56289884 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1662,6 +1662,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, MaxStoresPerMemcpyOptSize = 4; MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores MaxStoresPerMemmoveOptSize = 4; + + // TODO: These control memcmp expansion in CGP and are set low to prevent + // altering the vector expansion for 16/32 byte memcmp in SelectionDAGBuilder. + MaxLoadsPerMemcmp = 1; + MaxLoadsPerMemcmpOptSize = 1; + // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4). setPrefLoopAlignment(ExperimentalPrefLoopAlignment); @@ -14272,9 +14278,8 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, // If we are inserting an element, see if we can do this more efficiently with // a blend shuffle with a rematerializable vector than a costly integer // insertion. - // TODO: pre-SSE41 targets will tend to use bit masking - this could still - // be beneficial if we are inserting several zeros and can combine the masks.
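To make the blend mask in the hunk below concrete: for a hypothetical v4i32 insertion of a zero element at IdxVal = 2 (values chosen purely for illustration), the loop in this function builds:

    unsigned NumElts = 4, IdxVal = 2;
    SmallVector<int, 8> BlendMask;
    for (unsigned i = 0; i != NumElts; ++i)
      BlendMask.push_back(i == IdxVal ? i + NumElts : i);
    // BlendMask == {0, 1, 6, 3}: an index >= NumElts selects from the
    // rematerializable zero/all-ones vector, so only lane 2 is replaced.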
- if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() && NumElts <= 8) { + if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() + 16 <= EltVT.getSizeInBits()) { SmallVector<int, 8> BlendMask; for (unsigned i = 0; i != NumElts; ++i) BlendMask.push_back(i == IdxVal ? i + NumElts : i); @@ -17621,23 +17626,21 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2; - SDValue CmpOp0 = Cmp.getOperand(0); + // Apply further optimizations for special cases // (select (x != 0), -1, 0) -> neg & sbb // (select (x == 0), 0, -1) -> neg & sbb if (isNullConstant(Y) && - (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) { - SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32); - SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, - DAG.getConstant(0, DL, - CmpOp0.getValueType()), - CmpOp0); - SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), - DAG.getConstant(X86::COND_B, DL, MVT::i8), - SDValue(Neg.getNode(), 1)); - return Res; - } + (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) { + SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32); + SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType()); + SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, Zero, CmpOp0); + SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), + DAG.getConstant(X86::COND_B, DL, MVT::i8), + SDValue(Neg.getNode(), 1)); + return Res; + } Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType())); @@ -18648,8 +18651,9 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); bool SplitStack = MF.shouldSplitStack(); + bool EmitStackProbe = !getStackProbeSymbolName(MF).empty(); bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) || - SplitStack; + SplitStack || EmitStackProbe; SDLoc dl(Op); // Get the inputs. @@ -23705,6 +23709,57 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, SDValue RetOps[] = {Exract, NewGather.getValue(1)}; return DAG.getMergeValues(RetOps, dl); } + if (N->getMemoryVT() == MVT::v2i32 && Subtarget.hasVLX()) { + // There is a special case when the return type v2i32 is illegal and + // the type legalizer extended it to v2i64. Without this conversion we end up + // with VPGATHERQQ (reading q-words from the memory) instead of VPGATHERQD. + // In order to avoid this situation, we'll build an X86 specific Gather node + // with index v2i64 and value type v4i32. + assert(VT == MVT::v2i64 && Src0.getValueType() == MVT::v2i64 && + "Unexpected type in masked gather"); + Src0 = DAG.getVectorShuffle(MVT::v4i32, dl, + DAG.getBitcast(MVT::v4i32, Src0), + DAG.getUNDEF(MVT::v4i32), { 0, 2, -1, -1 }); + // The mask should match the destination type. Extending mask with zeroes + // is not necessary since instruction itself reads only two values from + // memory.
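A worked illustration of the conversion the comments above describe (lane mapping only, not API):

    <2 x i64> {a, b}   bitcast to v4i32  -->  {a.lo, a.hi, b.lo, b.hi}
    shuffle mask {0, 2, -1, -1}          -->  {a.lo, b.lo, undef, undef}

The shuffle keeps exactly the two 32-bit lanes a v2i32 gather can produce, which is what allows VPGATHERQD to be selected instead of VPGATHERQQ.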
+ Mask = ExtendToType(Mask, MVT::v4i1, DAG, false); + SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index }; + SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>( + DAG.getVTList(MVT::v4i32, MVT::Other), Ops, dl, N->getMemoryVT(), + N->getMemOperand()); + + SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, MVT::v2i64, + NewGather.getValue(0), DAG); + SDValue RetOps[] = { Sext, NewGather.getValue(1) }; + return DAG.getMergeValues(RetOps, dl); + } + if (N->getMemoryVT() == MVT::v2f32 && Subtarget.hasVLX()) { + // This transformation is for optimization only. + // The type legalizer extended mask and index to 4 elements vector + // in order to match requirements of the common gather node - same + // vector width of index and value. X86 Gather node allows mismatch + // of vector width in order to select more optimal instruction at the + // end. + assert(VT == MVT::v4f32 && Src0.getValueType() == MVT::v4f32 && + "Unexpected type in masked gather"); + if (Mask.getOpcode() == ISD::CONCAT_VECTORS && + ISD::isBuildVectorAllZeros(Mask.getOperand(1).getNode()) && + Index.getOpcode() == ISD::CONCAT_VECTORS && + Index.getOperand(1).isUndef()) { + Mask = ExtendToType(Mask.getOperand(0), MVT::v4i1, DAG, false); + Index = Index.getOperand(0); + } else + return Op; + SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index }; + SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>( + DAG.getVTList(MVT::v4f32, MVT::Other), Ops, dl, N->getMemoryVT(), + N->getMemOperand()); + + SDValue RetOps[] = { NewGather.getValue(0), NewGather.getValue(1) }; + return DAG.getMergeValues(RetOps, dl); + + } return Op; } @@ -24508,6 +24563,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND"; case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND"; case X86ISD::LWPINS: return "X86ISD::LWPINS"; + case X86ISD::MGATHER: return "X86ISD::MGATHER"; } return nullptr; } @@ -29868,7 +29924,7 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) { if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; - bool isFastMultiplier = false; + bool IsFastMultiplier = false; if (Diff < 10) { switch ((unsigned char)Diff) { default: @@ -29880,12 +29936,12 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) { case 5: // result = lea base(cond, cond*4) case 8: // result = lea base( , cond*8) case 9: // result = lea base(cond, cond*8) - isFastMultiplier = true; + IsFastMultiplier = true; break; } } - if (isFastMultiplier) { + if (IsFastMultiplier) { APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue(); if (NeedsCondInvert) // Invert the condition if needed. Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, @@ -34841,23 +34897,56 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { !Cmp.getOperand(0).getValueType().isInteger()) return SDValue(); - // (cmp Z, 1) sets the carry flag if Z is 0. SDValue Z = Cmp.getOperand(0); - SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, - DAG.getConstant(1, DL, Z.getValueType())); + EVT ZVT = Z.getValueType(); + + // If X is -1 or 0, then we have an opportunity to avoid constants required in + // the general case below. 
+ if (auto *ConstantX = dyn_cast<ConstantSDNode>(X)) { + // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with + // fake operands: + // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z) + // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z) + if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) || + (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) { + SDValue Zero = DAG.getConstant(0, DL, ZVT); + SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32); + SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z); + return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, + DAG.getConstant(X86::COND_B, DL, MVT::i8), + SDValue(Neg.getNode(), 1)); + } + + // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb' + // with fake operands: + // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1) + // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1) + if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) || + (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) { + SDValue One = DAG.getConstant(1, DL, ZVT); + SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One); + return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, + DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1); + } + } - SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); + // (cmp Z, 1) sets the carry flag if Z is 0. + SDValue One = DAG.getConstant(1, DL, ZVT); + SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One); + + // Add the flags type for ADC/SBB nodes. + SDVTList VTs = DAG.getVTList(VT, MVT::i32); // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1) // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1) if (CC == X86::COND_NE) return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X, - DAG.getConstant(-1ULL, DL, VT), NewCmp); + DAG.getConstant(-1ULL, DL, VT), Cmp1); // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1) // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1) return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X, - DAG.getConstant(0, DL, VT), NewCmp); + DAG.getConstant(0, DL, VT), Cmp1); } static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG, @@ -34976,6 +35065,32 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi); } +/// Convert vector increment or decrement to sub/add with an all-ones constant: +/// add X, <1, 1...> --> sub X, <-1, -1...> +/// sub X, <1, 1...> --> add X, <-1, -1...> +/// The all-ones vector constant can be materialized using a pcmpeq instruction +/// that is commonly recognized as an idiom (has no register dependency), so +/// that's better/smaller than loading a splat 1 constant. +static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) { + assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) && + "Unexpected opcode for increment/decrement transform"); + + // Pseudo-legality check: getOnesVector() expects one of these types, so bail + // out and wait for legalization if we have an unsupported vector length. + EVT VT = N->getValueType(0); + if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector()) + return SDValue(); + + SDNode *N1 = N->getOperand(1).getNode(); + APInt SplatVal; + if (!ISD::isConstantSplatVector(N1, SplatVal) || !SplatVal.isOneValue()) + return SDValue(); + + SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N)); + unsigned NewOpcode = N->getOpcode() == ISD::ADD ?
ISD::SUB : ISD::ADD; + return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec); +} + static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { const SDNodeFlags Flags = N->getFlags(); @@ -34995,6 +35110,9 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, isHorizontalBinOp(Op0, Op1, true)) return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1); + if (SDValue V = combineIncDecVector(N, DAG)) + return V; + return combineAddOrSubToADCOrSBB(N, DAG); } @@ -35028,6 +35146,9 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG, isHorizontalBinOp(Op0, Op1, false)) return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1); + if (SDValue V = combineIncDecVector(N, DAG)) + return V; + return combineAddOrSubToADCOrSBB(N, DAG); } @@ -36335,3 +36456,22 @@ void X86TargetLowering::insertCopiesSplitCSR( bool X86TargetLowering::supportSwiftError() const { return Subtarget.is64Bit(); } + +/// Returns the name of the symbol used to emit stack probes or the empty +/// string if not applicable. +StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const { + // If the function specifically requests stack probes, emit them. + if (MF.getFunction()->hasFnAttribute("probe-stack")) + return MF.getFunction()->getFnAttribute("probe-stack").getValueAsString(); + + // Generally, if we aren't on Windows, the platform ABI does not include + // support for stack probes, so don't emit them. + if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO()) + return ""; + + // We need a stack probe to conform to the Windows ABI. Choose the right + // symbol. + if (Subtarget.is64Bit()) + return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk"; + return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk"; +} diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index f51b6641db2fb..e1ade92979dc0 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -615,7 +615,10 @@ namespace llvm { // Vector truncating store with unsigned/signed saturation VTRUNCSTOREUS, VTRUNCSTORES, // Vector truncating masked store with unsigned/signed saturation - VMTRUNCSTOREUS, VMTRUNCSTORES + VMTRUNCSTOREUS, VMTRUNCSTORES, + + // X86 specific gather + MGATHER // WARNING: Do not add anything in the end unless you want the node to // have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all @@ -1056,6 +1059,8 @@ namespace llvm { bool supportSwiftError() const override; + StringRef getStackProbeSymbolName(MachineFunction &MF) const override; + unsigned getMaxSupportedInterleaveFactor() const override { return 4; } /// \brief Lower interleaved load(s) into target specific @@ -1065,6 +1070,12 @@ namespace llvm { ArrayRef<unsigned> Indices, unsigned Factor) const override; + /// \brief Lower interleaved store(s) into target specific + /// instructions/intrinsics. + bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, + unsigned Factor) const override; + + void finalizeLowering(MachineFunction &MF) const override; protected: @@ -1397,6 +1408,19 @@ namespace llvm { } }; + // X86 specific Gather node. 
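A note on the class that follows: it exists because the generic MaskedGatherSDNode insists on index and value vectors of the same width, while the X86 node tolerates the v2i64-index/v4i32-value mismatch exploited in LowerMGATHER above. Both halves of its lifecycle already appear in this patch, schematically:

    // Creation, in LowerMGATHER:
    SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
        DAG.getVTList(MVT::v4i32, MVT::Other), Ops, dl, N->getMemoryVT(),
        N->getMemOperand());

    // Recognition, in selectVectorAddr (classof keys on X86ISD::MGATHER):
    if (auto X86Gather = dyn_cast<X86MaskedGatherSDNode>(Parent))
      return selectAddrOfGatherScatterNode<X86MaskedGatherSDNode>(
          X86Gather, N, Base, Scale, Index, Disp, Segment);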
+ class X86MaskedGatherSDNode : public MaskedGatherScatterSDNode { + public: + X86MaskedGatherSDNode(unsigned Order, + const DebugLoc &dl, SDVTList VTs, EVT MemVT, + MachineMemOperand *MMO) + : MaskedGatherScatterSDNode(X86ISD::MGATHER, Order, dl, VTs, MemVT, MMO) + {} + static bool classof(const SDNode *N) { + return N->getOpcode() == X86ISD::MGATHER; + } + }; + } // end namespace llvm #endif // LLVM_LIB_TARGET_X86_X86ISELLOWERING_H diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 2620679df2517..01a70323224c3 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -7265,13 +7265,13 @@ avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { let Predicates = [HasAVX512] in { def : Pat<(ffloor _.FRC:$src), (COPY_TO_REGCLASS (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)), - (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x1))), _.FRC)>; + (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x9))), _.FRC)>; def : Pat<(fceil _.FRC:$src), (COPY_TO_REGCLASS (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)), - (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x2))), _.FRC)>; + (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0xa))), _.FRC)>; def : Pat<(ftrunc _.FRC:$src), (COPY_TO_REGCLASS (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)), - (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x3))), _.FRC)>; + (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0xb))), _.FRC)>; def : Pat<(frint _.FRC:$src), (COPY_TO_REGCLASS (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)), (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x4))), _.FRC)>; @@ -7281,13 +7281,13 @@ avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { def : Pat<(ffloor (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)), - addr:$src, (i32 0x1))), _.FRC)>; + addr:$src, (i32 0x9))), _.FRC)>; def : Pat<(fceil (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)), - addr:$src, (i32 0x2))), _.FRC)>; + addr:$src, (i32 0xa))), _.FRC)>; def : Pat<(ftrunc (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)), - addr:$src, (i32 0x3))), _.FRC)>; + addr:$src, (i32 0xb))), _.FRC)>; def : Pat<(frint (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)), addr:$src, (i32 0x4))), _.FRC)>; @@ -7869,7 +7869,7 @@ let Predicates = [HasVLX] in { defm NAME##D##SUFF##Z128: avx512_gather<dopc, OpcodeStr##"d", _.info128, vx128xmem, mgatherv4i32>, EVEX_V128; defm NAME##Q##SUFF##Z128: avx512_gather<qopc, OpcodeStr##"q", _.info128, - vx64xmem, mgatherv2i64>, EVEX_V128; + vx64xmem, X86mgatherv2i64>, EVEX_V128; } } @@ -8471,26 +8471,26 @@ multiclass avx512_shuff_packed_128<string OpcodeStr, AVX512VLVectorVTInfo _, } let Predicates = [HasAVX512] in { def : Pat<(v16f32 (ffloor VR512:$src)), - (VRNDSCALEPSZrri VR512:$src, (i32 0x1))>; + (VRNDSCALEPSZrri VR512:$src, (i32 0x9))>; def : Pat<(v16f32 (fnearbyint VR512:$src)), (VRNDSCALEPSZrri VR512:$src, (i32 0xC))>; def : Pat<(v16f32 (fceil VR512:$src)), - (VRNDSCALEPSZrri VR512:$src, (i32 0x2))>; + (VRNDSCALEPSZrri VR512:$src, (i32 0xA))>; def : Pat<(v16f32 (frint VR512:$src)), (VRNDSCALEPSZrri VR512:$src, (i32 0x4))>; def : Pat<(v16f32 (ftrunc VR512:$src)), - (VRNDSCALEPSZrri VR512:$src, (i32 0x3))>; + (VRNDSCALEPSZrri VR512:$src, (i32 0xB))>; def : Pat<(v8f64 (ffloor VR512:$src)), - 
(VRNDSCALEPDZrri VR512:$src, (i32 0x1))>; + (VRNDSCALEPDZrri VR512:$src, (i32 0x9))>; def : Pat<(v8f64 (fnearbyint VR512:$src)), (VRNDSCALEPDZrri VR512:$src, (i32 0xC))>; def : Pat<(v8f64 (fceil VR512:$src)), - (VRNDSCALEPDZrri VR512:$src, (i32 0x2))>; + (VRNDSCALEPDZrri VR512:$src, (i32 0xA))>; def : Pat<(v8f64 (frint VR512:$src)), (VRNDSCALEPDZrri VR512:$src, (i32 0x4))>; def : Pat<(v8f64 (ftrunc VR512:$src)), - (VRNDSCALEPDZrri VR512:$src, (i32 0x3))>; + (VRNDSCALEPDZrri VR512:$src, (i32 0xB))>; } defm VSHUFF32X4 : avx512_shuff_packed_128<"vshuff32x4",avx512vl_f32_info, 0x23>, diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index c28b35b22977a..8b5bbf24f6f63 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -641,7 +641,7 @@ def sdmem : Operand<v2f64> { // SSE pattern fragments //===----------------------------------------------------------------------===// -// Vector load wrappers to prevent folding of non-temporal aligned loads on +// Vector load wrappers to prevent folding of non-temporal aligned loads on // supporting targets. def vec128load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ return !Subtarget->hasSSE41() || !cast<LoadSDNode>(N)->isNonTemporal() || @@ -754,16 +754,6 @@ def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>; def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>; def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>; -// These are needed to match a scalar memop that is used in a vector-only -// math instruction such as the FP logical ops: andps, andnps, orps, xorps. -// The memory operand is required to be a 128-bit load, so it must be converted -// from a vector to a scalar. -def memopfsf32_128 : PatFrag<(ops node:$ptr), - (f32 (extractelt (memopv4f32 node:$ptr), (iPTR 0)))>; -def memopfsf64_128 : PatFrag<(ops node:$ptr), - (f64 (extractelt (memopv2f64 node:$ptr), (iPTR 0)))>; - - // SSSE3 uses MMX registers for some instructions. They aren't aligned on a // 16-byte boundary. 
// FIXME: 8 byte alignment for mmx reads is not required @@ -773,6 +763,9 @@ def memop64 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ def memopmmx : PatFrag<(ops node:$ptr), (x86mmx (memop64 node:$ptr))>; +def X86masked_gather : SDNode<"X86ISD::MGATHER", SDTMaskedGather, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + def mgatherv4i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), (masked_gather node:$src1, node:$src2, node:$src3) , [{ if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) @@ -796,6 +789,15 @@ def mgatherv2i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), Mgt->getBasePtr().getValueType() == MVT::v2i64); return false; }]>; +def X86mgatherv2i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86masked_gather node:$src1, node:$src2, node:$src3) , [{ + if (X86MaskedGatherSDNode *Mgt = dyn_cast<X86MaskedGatherSDNode>(N)) + return (Mgt->getIndex().getValueType() == MVT::v2i64 || + Mgt->getBasePtr().getValueType() == MVT::v2i64) && + (Mgt->getMemoryVT() == MVT::v2i32 || + Mgt->getMemoryVT() == MVT::v2f32); + return false; +}]>; def mgatherv4i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), (masked_gather node:$src1, node:$src2, node:$src3) , [{ if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 8490b972eb5c1..fe87bbd994738 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -1744,7 +1744,7 @@ def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), Sched<[WriteCvtF2FLd, ReadAfterLd]>, VEX_WIG; } -def : Pat<(f32 (fpround FR64:$src)), +def : Pat<(f32 (fpround FR64:$src)), (VCVTSD2SSrr (COPY_TO_REGCLASS FR64:$src, FR32), FR64:$src)>, Requires<[UseAVX]>; diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp index 77dead8d24137..f98c2a7e802dd 100644 --- a/lib/Target/X86/X86InstructionSelector.cpp +++ b/lib/Target/X86/X86InstructionSelector.cpp @@ -72,9 +72,24 @@ private: MachineFunction &MF) const; bool selectCmp(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const; - bool selectUadde(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const; + bool selectCopy(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectInsert(MachineInstr &I, MachineRegisterInfo &MRI, + MachineFunction &MF) const; + bool selectExtract(MachineInstr &I, MachineRegisterInfo &MRI, + MachineFunction &MF) const; + + // emit insert subreg instruction and insert it before MachineInstr &I + bool emitInsertSubreg(unsigned DstReg, unsigned SrcReg, MachineInstr &I, + MachineRegisterInfo &MRI, MachineFunction &MF) const; + // emit extract subreg instruction and insert it before MachineInstr &I + bool emitExtractSubreg(unsigned DstReg, unsigned SrcReg, MachineInstr &I, + MachineRegisterInfo &MRI, MachineFunction &MF) const; + + const TargetRegisterClass *getRegClass(LLT Ty, const RegisterBank &RB) const; + const TargetRegisterClass *getRegClass(LLT Ty, unsigned Reg, + MachineRegisterInfo &MRI) const; const X86TargetMachine &TM; const X86Subtarget &STI; @@ -113,8 +128,8 @@ X86InstructionSelector::X86InstructionSelector(const X86TargetMachine &TM, // FIXME: This should be target-independent, inferred from the types declared // for each class in the bank. 
-static const TargetRegisterClass * -getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB) { +const TargetRegisterClass * +X86InstructionSelector::getRegClass(LLT Ty, const RegisterBank &RB) const { if (RB.getID() == X86::GPRRegBankID) { if (Ty.getSizeInBits() <= 8) return &X86::GR8RegClass; @@ -127,13 +142,13 @@ getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB) { } if (RB.getID() == X86::VECRRegBankID) { if (Ty.getSizeInBits() == 32) - return &X86::FR32XRegClass; + return STI.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass; if (Ty.getSizeInBits() == 64) - return &X86::FR64XRegClass; + return STI.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass; if (Ty.getSizeInBits() == 128) - return &X86::VR128XRegClass; + return STI.hasAVX512() ? &X86::VR128XRegClass : &X86::VR128RegClass; if (Ty.getSizeInBits() == 256) - return &X86::VR256XRegClass; + return STI.hasAVX512() ? &X86::VR256XRegClass : &X86::VR256RegClass; if (Ty.getSizeInBits() == 512) return &X86::VR512RegClass; } @@ -141,10 +156,16 @@ getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB) { llvm_unreachable("Unknown RegBank!"); } +const TargetRegisterClass * +X86InstructionSelector::getRegClass(LLT Ty, unsigned Reg, + MachineRegisterInfo &MRI) const { + const RegisterBank &RegBank = *RBI.getRegBank(Reg, MRI, TRI); + return getRegClass(Ty, RegBank); +} + // Set X86 Opcode and constrain DestReg. -static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, - MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, - const RegisterBankInfo &RBI) { +bool X86InstructionSelector::selectCopy(MachineInstr &I, + MachineRegisterInfo &MRI) const { unsigned DstReg = I.getOperand(0).getReg(); if (TargetRegisterInfo::isPhysicalRegister(DstReg)) { @@ -171,7 +192,7 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, switch (RegBank.getID()) { case X86::GPRRegBankID: assert((DstSize <= 64) && "GPRs cannot get more than 64-bit width values."); - RC = getRegClassForTypeOnBank(MRI.getType(DstReg), RegBank); + RC = getRegClass(MRI.getType(DstReg), RegBank); // Change the physical register if (SrcSize > DstSize && TargetRegisterInfo::isPhysicalRegister(SrcReg)) { @@ -186,7 +207,7 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, } break; case X86::VECRRegBankID: - RC = getRegClassForTypeOnBank(MRI.getType(DstReg), RegBank); + RC = getRegClass(MRI.getType(DstReg), RegBank); break; default: llvm_unreachable("Unknown RegBank!"); @@ -220,7 +241,7 @@ bool X86InstructionSelector::select(MachineInstr &I) const { // Certain non-generic instructions also need some special handling. if (I.isCopy()) - return selectCopy(I, TII, MRI, TRI, RBI); + return selectCopy(I, MRI); // TODO: handle more cases - LOAD_STACK_GUARD, PHI return true; @@ -249,6 +270,10 @@ bool X86InstructionSelector::select(MachineInstr &I) const { return true; if (selectUadde(I, MRI, MF)) return true; + if (selectExtract(I, MRI, MF)) + return true; + if (selectInsert(I, MRI, MF)) + return true; return false; } @@ -326,6 +351,34 @@ unsigned X86InstructionSelector::getLoadStoreOp(LLT &Ty, const RegisterBank &RB, return Opc; } +// Fill in an address from the given instruction. 
+void X86SelectAddress(const MachineInstr &I, const MachineRegisterInfo &MRI, + X86AddressMode &AM) { + + assert(I.getOperand(0).isReg() && "unsupported operand."); + assert(MRI.getType(I.getOperand(0).getReg()).isPointer() && + "unsupported type."); + + if (I.getOpcode() == TargetOpcode::G_GEP) { + if (auto COff = getConstantVRegVal(I.getOperand(2).getReg(), MRI)) { + int64_t Imm = *COff; + if (isInt<32>(Imm)) { // Check for displacement overflow. + AM.Disp = static_cast<int32_t>(Imm); + AM.Base.Reg = I.getOperand(1).getReg(); + return; + } + } + } else if (I.getOpcode() == TargetOpcode::G_FRAME_INDEX) { + AM.Base.FrameIndex = I.getOperand(1).getIndex(); + AM.BaseType = X86AddressMode::FrameIndexBase; + return; + } + + // Default behavior. + AM.Base.Reg = I.getOperand(0).getReg(); + return; +} + bool X86InstructionSelector::selectLoadStoreOp(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const { @@ -340,18 +393,28 @@ bool X86InstructionSelector::selectLoadStoreOp(MachineInstr &I, const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); auto &MemOp = **I.memoperands_begin(); + if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) { + DEBUG(dbgs() << "Atomic load/store not supported yet\n"); + return false; + } + unsigned NewOpc = getLoadStoreOp(Ty, RB, Opc, MemOp.getAlignment()); if (NewOpc == Opc) return false; + X86AddressMode AM; + X86SelectAddress(*MRI.getVRegDef(I.getOperand(1).getReg()), MRI, AM); + I.setDesc(TII.get(NewOpc)); MachineInstrBuilder MIB(MF, I); - if (Opc == TargetOpcode::G_LOAD) - addOffset(MIB, 0); - else { + if (Opc == TargetOpcode::G_LOAD) { + I.RemoveOperand(1); + addFullAddress(MIB, AM); + } else { // G_STORE (VAL, Addr), X86Store instruction (Addr, VAL) + I.RemoveOperand(1); I.RemoveOperand(0); - addOffset(MIB, 0).addUse(DefReg); + addFullAddress(MIB, AM).addUse(DefReg); } return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } @@ -461,11 +524,11 @@ bool X86InstructionSelector::selectTrunc(MachineInstr &I, if (DstRB.getID() != X86::GPRRegBankID) return false; - const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB); + const TargetRegisterClass *DstRC = getRegClass(DstTy, DstRB); if (!DstRC) return false; - const TargetRegisterClass *SrcRC = getRegClassForTypeOnBank(SrcTy, SrcRB); + const TargetRegisterClass *SrcRC = getRegClass(SrcTy, SrcRB); if (!SrcRC) return false; @@ -519,9 +582,8 @@ bool X86InstructionSelector::selectZext(MachineInstr &I, else return false; - const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI); unsigned DefReg = - MRI.createVirtualRegister(getRegClassForTypeOnBank(DstTy, RegBank)); + MRI.createVirtualRegister(getRegClass(DstTy, DstReg, MRI)); BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::SUBREG_TO_REG), DefReg) @@ -656,6 +718,202 @@ bool X86InstructionSelector::selectUadde(MachineInstr &I, return true; } +bool X86InstructionSelector::selectExtract(MachineInstr &I, + MachineRegisterInfo &MRI, + MachineFunction &MF) const { + + if (I.getOpcode() != TargetOpcode::G_EXTRACT) + return false; + + const unsigned DstReg = I.getOperand(0).getReg(); + const unsigned SrcReg = I.getOperand(1).getReg(); + int64_t Index = I.getOperand(2).getImm(); + + const LLT DstTy = MRI.getType(DstReg); + const LLT SrcTy = MRI.getType(SrcReg); + + // For now, handle vector types only. + if (!DstTy.isVector()) + return false; + + if (Index % DstTy.getSizeInBits() != 0) + return false; // Not extract subvector. + + if (Index == 0) { + // Replace by extract subreg copy.
+ if (!emitExtractSubreg(DstReg, SrcReg, I, MRI, MF)) + return false; + + I.eraseFromParent(); + return true; + } + + bool HasAVX = STI.hasAVX(); + bool HasAVX512 = STI.hasAVX512(); + bool HasVLX = STI.hasVLX(); + + if (SrcTy.getSizeInBits() == 256 && DstTy.getSizeInBits() == 128) { + if (HasVLX) + I.setDesc(TII.get(X86::VEXTRACTF32x4Z256rr)); + else if (HasAVX) + I.setDesc(TII.get(X86::VEXTRACTF128rr)); + else + return false; + } else if (SrcTy.getSizeInBits() == 512 && HasAVX512) { + if (DstTy.getSizeInBits() == 128) + I.setDesc(TII.get(X86::VEXTRACTF32x4Zrr)); + else if (DstTy.getSizeInBits() == 256) + I.setDesc(TII.get(X86::VEXTRACTF64x4Zrr)); + else + return false; + } else + return false; + + // Convert to X86 VEXTRACT immediate. + Index = Index / DstTy.getSizeInBits(); + I.getOperand(2).setImm(Index); + + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); +} + +bool X86InstructionSelector::emitExtractSubreg(unsigned DstReg, unsigned SrcReg, + MachineInstr &I, + MachineRegisterInfo &MRI, + MachineFunction &MF) const { + + const LLT DstTy = MRI.getType(DstReg); + const LLT SrcTy = MRI.getType(SrcReg); + unsigned SubIdx = X86::NoSubRegister; + + if (!DstTy.isVector() || !SrcTy.isVector()) + return false; + + assert(SrcTy.getSizeInBits() > DstTy.getSizeInBits() && + "Incorrect Src/Dst register size"); + + if (DstTy.getSizeInBits() == 128) + SubIdx = X86::sub_xmm; + else if (DstTy.getSizeInBits() == 256) + SubIdx = X86::sub_ymm; + else + return false; + + const TargetRegisterClass *DstRC = getRegClass(DstTy, DstReg, MRI); + const TargetRegisterClass *SrcRC = getRegClass(SrcTy, SrcReg, MRI); + + SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubIdx); + + if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) || + !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) { + DEBUG(dbgs() << "Failed to constrain EXTRACT_SUBREG\n"); + return false; + } + + BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::COPY), DstReg) + .addReg(SrcReg, 0, SubIdx); + + return true; +} + +bool X86InstructionSelector::emitInsertSubreg(unsigned DstReg, unsigned SrcReg, + MachineInstr &I, + MachineRegisterInfo &MRI, + MachineFunction &MF) const { + + const LLT DstTy = MRI.getType(DstReg); + const LLT SrcTy = MRI.getType(SrcReg); + unsigned SubIdx = X86::NoSubRegister; + + // TODO: support scalar types + if (!DstTy.isVector() || !SrcTy.isVector()) + return false; + + assert(SrcTy.getSizeInBits() < DstTy.getSizeInBits() && + "Incorrect Src/Dst register size"); + + if (SrcTy.getSizeInBits() == 128) + SubIdx = X86::sub_xmm; + else if (SrcTy.getSizeInBits() == 256) + SubIdx = X86::sub_ymm; + else + return false; + + const TargetRegisterClass *SrcRC = getRegClass(SrcTy, SrcReg, MRI); + const TargetRegisterClass *DstRC = getRegClass(DstTy, DstReg, MRI); + + if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) || + !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) { + DEBUG(dbgs() << "Failed to constrain INSERT_SUBREG\n"); + return false; + } + + BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::COPY)) + .addReg(DstReg, RegState::DefineNoRead, SubIdx) + .addReg(SrcReg); + + return true; +} + +bool X86InstructionSelector::selectInsert(MachineInstr &I, + MachineRegisterInfo &MRI, + MachineFunction &MF) const { + + if (I.getOpcode() != TargetOpcode::G_INSERT) + return false; + + const unsigned DstReg = I.getOperand(0).getReg(); + const unsigned SrcReg = I.getOperand(1).getReg(); + const unsigned InsertReg = I.getOperand(2).getReg(); + int64_t Index = I.getOperand(3).getImm(); + + const LLT DstTy =
MRI.getType(DstReg); + const LLT InsertRegTy = MRI.getType(InsertReg); + + // For now, handle vector types only. + if (!DstTy.isVector()) + return false; + + if (Index % InsertRegTy.getSizeInBits() != 0) + return false; // Not insert subvector. + + if (Index == 0 && MRI.getVRegDef(SrcReg)->isImplicitDef()) { + // Replace by subreg copy. + if (!emitInsertSubreg(DstReg, InsertReg, I, MRI, MF)) + return false; + + I.eraseFromParent(); + return true; + } + + bool HasAVX = STI.hasAVX(); + bool HasAVX512 = STI.hasAVX512(); + bool HasVLX = STI.hasVLX(); + + if (DstTy.getSizeInBits() == 256 && InsertRegTy.getSizeInBits() == 128) { + if (HasVLX) + I.setDesc(TII.get(X86::VINSERTF32x4Z256rr)); + else if (HasAVX) + I.setDesc(TII.get(X86::VINSERTF128rr)); + else + return false; + } else if (DstTy.getSizeInBits() == 512 && HasAVX512) { + if (InsertRegTy.getSizeInBits() == 128) + I.setDesc(TII.get(X86::VINSERTF32x4Zrr)); + else if (InsertRegTy.getSizeInBits() == 256) + I.setDesc(TII.get(X86::VINSERTF64x4Zrr)); + else + return false; + } else + return false; + + // Convert to X86 VINSERT immediate. + Index = Index / InsertRegTy.getSizeInBits(); + + I.getOperand(3).setImm(Index); + + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); +} + InstructionSelector * llvm::createX86InstructionSelector(const X86TargetMachine &TM, X86Subtarget &Subtarget, diff --git a/lib/Target/X86/X86InterleavedAccess.cpp b/lib/Target/X86/X86InterleavedAccess.cpp index 806d6cc888f0f..f0ed4bc16e2f9 100644 --- a/lib/Target/X86/X86InterleavedAccess.cpp +++ b/lib/Target/X86/X86InterleavedAccess.cpp @@ -16,6 +16,7 @@ #include "X86ISelLowering.h" #include "X86TargetMachine.h" +#include "llvm/Analysis/VectorUtils.h" using namespace llvm; @@ -50,9 +51,8 @@ class X86InterleavedAccessGroup { IRBuilder<> &Builder; /// \brief Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors - /// sub vectors of type \p T. Returns true and the sub-vectors in - /// \p DecomposedVectors if it decomposes the Inst, returns false otherwise. - bool decompose(Instruction *Inst, unsigned NumSubVectors, VectorType *T, + /// sub vectors of type \p T. Returns the sub-vectors in \p DecomposedVectors. + void decompose(Instruction *Inst, unsigned NumSubVectors, VectorType *T, SmallVectorImpl<Instruction *> &DecomposedVectors); /// \brief Performs matrix transposition on a 4x4 matrix \p InputVectors and @@ -80,8 +80,7 @@ public: /// target information \p STarget. explicit X86InterleavedAccessGroup(Instruction *I, ArrayRef<ShuffleVectorInst *> Shuffs, - ArrayRef<unsigned> Ind, - const unsigned F, + ArrayRef<unsigned> Ind, const unsigned F, const X86Subtarget &STarget, IRBuilder<> &B) : Inst(I), Shuffles(Shuffs), Indices(Ind), Factor(F), Subtarget(STarget), @@ -102,48 +101,61 @@ bool X86InterleavedAccessGroup::isSupported() const { uint64_t ShuffleVecSize = DL.getTypeSizeInBits(ShuffleVecTy); Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType(); - if (DL.getTypeSizeInBits(Inst->getType()) < Factor * ShuffleVecSize) - return false; + // Currently, lowering is supported for 4-element vectors of 64 bits on AVX. + uint64_t ExpectedShuffleVecSize; + if (isa<LoadInst>(Inst)) + ExpectedShuffleVecSize = 256; + else + ExpectedShuffleVecSize = 1024; - // Currently, lowering is supported for 64 bits on AVX.
- if (!Subtarget.hasAVX() || ShuffleVecSize != 256 || + if (!Subtarget.hasAVX() || ShuffleVecSize != ExpectedShuffleVecSize || DL.getTypeSizeInBits(ShuffleEltTy) != 64 || Factor != 4) return false; return true; } -bool X86InterleavedAccessGroup::decompose( +void X86InterleavedAccessGroup::decompose( Instruction *VecInst, unsigned NumSubVectors, VectorType *SubVecTy, SmallVectorImpl<Instruction *> &DecomposedVectors) { + + assert((isa<LoadInst>(VecInst) || isa<ShuffleVectorInst>(VecInst)) && + "Expected Load or Shuffle"); + Type *VecTy = VecInst->getType(); (void)VecTy; assert(VecTy->isVectorTy() && DL.getTypeSizeInBits(VecTy) >= DL.getTypeSizeInBits(SubVecTy) * NumSubVectors && "Invalid Inst-size!!!"); - assert(VecTy->getVectorElementType() == SubVecTy->getVectorElementType() && - "Element type mismatched!!!"); - if (!isa<LoadInst>(VecInst)) - return false; + if (auto *SVI = dyn_cast<ShuffleVectorInst>(VecInst)) { + Value *Op0 = SVI->getOperand(0); + Value *Op1 = SVI->getOperand(1); + + // Generate N(= NumSubVectors) shuffles of T(= SubVecTy) type. + for (unsigned i = 0; i < NumSubVectors; ++i) + DecomposedVectors.push_back( + cast<ShuffleVectorInst>(Builder.CreateShuffleVector( + Op0, Op1, createSequentialMask(Builder, Indices[i], + SubVecTy->getVectorNumElements(), 0)))); + return; + } + // Decompose the load instruction. LoadInst *LI = cast<LoadInst>(VecInst); Type *VecBasePtrTy = SubVecTy->getPointerTo(LI->getPointerAddressSpace()); - Value *VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy); - // Generate N loads of T type + // Generate N loads of T type. for (unsigned i = 0; i < NumSubVectors; i++) { - // TODO: Support inbounds GEP + // TODO: Support inbounds GEP. Value *NewBasePtr = Builder.CreateGEP(VecBasePtr, Builder.getInt32(i)); Instruction *NewLoad = Builder.CreateAlignedLoad(NewBasePtr, LI->getAlignment()); DecomposedVectors.push_back(NewLoad); } - - return true; } void X86InterleavedAccessGroup::transpose_4x4( @@ -181,21 +193,46 @@ void X86InterleavedAccessGroup::transpose_4x4( // instructions/intrinsics. bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() { SmallVector<Instruction *, 4> DecomposedVectors; - VectorType *VecTy = Shuffles[0]->getType(); - // Try to generate target-sized register(/instruction). - if (!decompose(Inst, Factor, VecTy, DecomposedVectors)) - return false; - SmallVector<Value *, 4> TransposedVectors; - // Perform matrix-transposition in order to compute interleaved - // results by generating some sort of (optimized) target-specific - // instructions. + VectorType *ShuffleTy = Shuffles[0]->getType(); + + if (isa<LoadInst>(Inst)) { + // Try to generate target-sized register(/instruction). + decompose(Inst, Factor, ShuffleTy, DecomposedVectors); + + // Perform matrix-transposition in order to compute interleaved + // results by generating some sort of (optimized) target-specific + // instructions. + transpose_4x4(DecomposedVectors, TransposedVectors); + + // Now replace the unoptimized-interleaved-vectors with the + // transposed-interleaved vectors. + for (unsigned i = 0, e = Shuffles.size(); i < e; ++i) + Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]); + + return true; + } + + Type *ShuffleEltTy = ShuffleTy->getVectorElementType(); + unsigned NumSubVecElems = ShuffleTy->getVectorNumElements() / Factor; + + // Lower the interleaved stores: + // 1. Decompose the interleaved wide shuffle into individual shuffle + // vectors. 
+ decompose(Shuffles[0], Factor, + VectorType::get(ShuffleEltTy, NumSubVecElems), DecomposedVectors); + + // 2. Transpose the interleaved-vectors into vectors of contiguous + // elements. transpose_4x4(DecomposedVectors, TransposedVectors); - // Now replace the unoptimized-interleaved-vectors with the - // transposed-interleaved vectors. - for (unsigned i = 0; i < Shuffles.size(); i++) - Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]); + // 3. Concatenate the contiguous-vectors back into a wide vector. + Value *WideVec = concatenateVectors(Builder, TransposedVectors); + + // 4. Generate a store instruction for wide-vec. + StoreInst *SI = cast<StoreInst>(Inst); + Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(), + SI->getAlignment()); return true; } @@ -220,3 +257,29 @@ bool X86TargetLowering::lowerInterleavedLoad( return Grp.isSupported() && Grp.lowerIntoOptimizedSequence(); } + +bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI, + ShuffleVectorInst *SVI, + unsigned Factor) const { + assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && + "Invalid interleave factor"); + + assert(SVI->getType()->getVectorNumElements() % Factor == 0 && + "Invalid interleaved store"); + + // Holds the indices of SVI that correspond to the starting index of each + // interleaved shuffle. + SmallVector<unsigned, 4> Indices; + auto Mask = SVI->getShuffleMask(); + for (unsigned i = 0; i < Factor; i++) + Indices.push_back(Mask[i]); + + ArrayRef<ShuffleVectorInst *> Shuffles = makeArrayRef(SVI); + + // Create an interleaved access group. + IRBuilder<> Builder(SI); + X86InterleavedAccessGroup Grp(SI, Shuffles, Indices, Factor, Subtarget, + Builder); + + return Grp.isSupported() && Grp.lowerIntoOptimizedSequence(); +} diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index bc73bb1ae8c51..6b1add8ff8ed1 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -510,12 +510,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::SHUF128, 0), X86_INTRINSIC_DATA(avx512_mask_broadcasti64x4_512, BRCST_SUBVEC_TO_VEC, X86ISD::SHUF128, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_b_128, CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_b_256, CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_b_512, CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_d_128, CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_d_256, CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_d_512, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_pd_128, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_pd_256, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_pd_512, CMP_MASK_CC, X86ISD::CMPM, @@ -524,16 +518,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_cmp_ps_256, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_ps_512, CMP_MASK_CC, X86ISD::CMPM, X86ISD::CMPM_RND), - X86_INTRINSIC_DATA(avx512_mask_cmp_q_128, CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_q_256, CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_q_512, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_sd, CMP_MASK_SCALAR_CC, X86ISD::FSETCCM, X86ISD::FSETCCM_RND), X86_INTRINSIC_DATA(avx512_mask_cmp_ss, CMP_MASK_SCALAR_CC, X86ISD::FSETCCM, X86ISD::FSETCCM_RND), - X86_INTRINSIC_DATA(avx512_mask_cmp_w_128, 
CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_w_256, CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_w_512, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_compress_d_128, COMPRESS_EXPAND_IN_REG, X86ISD::COMPRESS, 0), X86_INTRINSIC_DATA(avx512_mask_compress_d_256, COMPRESS_EXPAND_IN_REG, @@ -1171,18 +1159,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::FSUBS_RND, 0), X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FSUBS_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_ucmp_b_128, CMP_MASK_CC, X86ISD::CMPMU, 0), - X86_INTRINSIC_DATA(avx512_mask_ucmp_b_256, CMP_MASK_CC, X86ISD::CMPMU, 0), - X86_INTRINSIC_DATA(avx512_mask_ucmp_b_512, CMP_MASK_CC, X86ISD::CMPMU, 0), - X86_INTRINSIC_DATA(avx512_mask_ucmp_d_128, CMP_MASK_CC, X86ISD::CMPMU, 0), - X86_INTRINSIC_DATA(avx512_mask_ucmp_d_256, CMP_MASK_CC, X86ISD::CMPMU, 0), - X86_INTRINSIC_DATA(avx512_mask_ucmp_d_512, CMP_MASK_CC, X86ISD::CMPMU, 0), - X86_INTRINSIC_DATA(avx512_mask_ucmp_q_128, CMP_MASK_CC, X86ISD::CMPMU, 0), - X86_INTRINSIC_DATA(avx512_mask_ucmp_q_256, CMP_MASK_CC, X86ISD::CMPMU, 0), - X86_INTRINSIC_DATA(avx512_mask_ucmp_q_512, CMP_MASK_CC, X86ISD::CMPMU, 0), - X86_INTRINSIC_DATA(avx512_mask_ucmp_w_128, CMP_MASK_CC, X86ISD::CMPMU, 0), - X86_INTRINSIC_DATA(avx512_mask_ucmp_w_256, CMP_MASK_CC, X86ISD::CMPMU, 0), - X86_INTRINSIC_DATA(avx512_mask_ucmp_w_512, CMP_MASK_CC, X86ISD::CMPMU, 0), X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_128, INTR_TYPE_1OP_MASK_RM, X86ISD::CVTPH2PS, 0), X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_256, INTR_TYPE_1OP_MASK_RM, diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp index 979aaee110aa4..a584eabcc1b28 100644 --- a/lib/Target/X86/X86LegalizerInfo.cpp +++ b/lib/Target/X86/X86LegalizerInfo.cpp @@ -214,12 +214,24 @@ void X86LegalizerInfo::setLegalizerInfoAVX() { if (!Subtarget.hasAVX()) return; + const LLT v16s8 = LLT::vector(16, 8); + const LLT v8s16 = LLT::vector(8, 16); + const LLT v4s32 = LLT::vector(4, 32); + const LLT v2s64 = LLT::vector(2, 64); + + const LLT v32s8 = LLT::vector(32, 8); + const LLT v16s16 = LLT::vector(16, 16); const LLT v8s32 = LLT::vector(8, 32); const LLT v4s64 = LLT::vector(4, 64); for (unsigned MemOp : {G_LOAD, G_STORE}) for (auto Ty : {v8s32, v4s64}) setAction({MemOp, Ty}, Legal); + + for (auto Ty : {v32s8, v16s16, v8s32, v4s64}) + setAction({G_INSERT, Ty}, Legal); + for (auto Ty : {v16s8, v8s16, v4s32, v2s64}) + setAction({G_INSERT, 1, Ty}, Legal); } void X86LegalizerInfo::setLegalizerInfoAVX2() { @@ -243,6 +255,18 @@ void X86LegalizerInfo::setLegalizerInfoAVX512() { if (!Subtarget.hasAVX512()) return; + const LLT v16s8 = LLT::vector(16, 8); + const LLT v8s16 = LLT::vector(8, 16); + const LLT v4s32 = LLT::vector(4, 32); + const LLT v2s64 = LLT::vector(2, 64); + + const LLT v32s8 = LLT::vector(32, 8); + const LLT v16s16 = LLT::vector(16, 16); + const LLT v8s32 = LLT::vector(8, 32); + const LLT v4s64 = LLT::vector(4, 64); + + const LLT v64s8 = LLT::vector(64, 8); + const LLT v32s16 = LLT::vector(32, 16); const LLT v16s32 = LLT::vector(16, 32); const LLT v8s64 = LLT::vector(8, 64); @@ -256,13 +280,15 @@ void X86LegalizerInfo::setLegalizerInfoAVX512() { for (auto Ty : {v16s32, v8s64}) setAction({MemOp, Ty}, Legal); + for (auto Ty : {v64s8, v32s16, v16s32, v8s64}) + setAction({G_INSERT, Ty}, Legal); + for (auto Ty : {v32s8, v16s16, v8s32, v4s64, v16s8, v8s16, v4s32, v2s64}) + setAction({G_INSERT, 1, Ty}, Legal); + /************ VLX *******************/ 
if (!Subtarget.hasVLX()) return; - const LLT v4s32 = LLT::vector(4, 32); - const LLT v8s32 = LLT::vector(8, 32); - for (auto Ty : {v4s32, v8s32}) setAction({G_MUL, Ty}, Legal); } diff --git a/lib/Target/X86/X86MacroFusion.cpp b/lib/Target/X86/X86MacroFusion.cpp index dd21e2b7c4a13..8fdf10617059a 100644 --- a/lib/Target/X86/X86MacroFusion.cpp +++ b/lib/Target/X86/X86MacroFusion.cpp @@ -2,39 +2,31 @@ // // The LLVM Compiler Infrastructure // -// \file This file is distributed under the University of Illinois Open Source +// This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // -// This file contains the X86 implementation of the DAG scheduling mutation to -// pair instructions back to back. +/// \file This file contains the X86 implementation of the DAG scheduling +/// mutation to pair instructions back to back. // //===----------------------------------------------------------------------===// #include "X86MacroFusion.h" #include "X86Subtarget.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetInstrInfo.h" - -#define DEBUG_TYPE "misched" - -STATISTIC(NumFused, "Number of instr pairs fused"); +#include "llvm/CodeGen/MacroFusion.h" using namespace llvm; -static cl::opt<bool> EnableMacroFusion("x86-misched-fusion", cl::Hidden, - cl::desc("Enable scheduling for macro fusion."), cl::init(true)); - -namespace { - -/// \brief Verify that the instruction pair, First and Second, -/// should be scheduled back to back. If either instruction is unspecified, -/// then verify that the other instruction may be part of a pair at all. -static bool shouldScheduleAdjacent(const X86Subtarget &ST, - const MachineInstr *First, - const MachineInstr *Second) { +/// \brief Check if the instruction pair, FirstMI and SecondMI, should be fused +/// together. When FirstMI is unspecified, check whether SecondMI may be part +/// of a fused pair at all. +static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, + const TargetSubtargetInfo &TSI, + const MachineInstr *FirstMI, + const MachineInstr &SecondMI) { + const X86Subtarget &ST = static_cast<const X86Subtarget&>(TSI); // Check if this processor supports macro-fusion. Since this is a minor // heuristic, we haven't specifically reserved a feature. hasAVX is a decent // proxy for SandyBridge+. @@ -47,13 +39,10 @@ static bool shouldScheduleAdjacent(const X86Subtarget &ST, FuseInc } FuseKind; - assert((First || Second) && "At least one instr must be specified"); - unsigned FirstOpcode = First - ? First->getOpcode() + unsigned FirstOpcode = FirstMI + ? FirstMI->getOpcode() : static_cast<unsigned>(X86::INSTRUCTION_LIST_END); - unsigned SecondOpcode = Second - ? Second->getOpcode() - : static_cast<unsigned>(X86::INSTRUCTION_LIST_END); + unsigned SecondOpcode = SecondMI.getOpcode(); switch (SecondOpcode) { default: @@ -203,69 +192,11 @@ static bool shouldScheduleAdjacent(const X86Subtarget &ST, } } -/// \brief Post-process the DAG to create cluster edges between instructions -/// that may be fused by the processor into a single operation.
-class X86MacroFusion : public ScheduleDAGMutation { -public: - X86MacroFusion() {} - - void apply(ScheduleDAGInstrs *DAGInstrs) override; -}; - -void X86MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) { - ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs); - const X86Subtarget &ST = DAG->MF.getSubtarget<X86Subtarget>(); - - // For now, assume targets can only fuse with the branch. - SUnit &ExitSU = DAG->ExitSU; - MachineInstr *Branch = ExitSU.getInstr(); - if (!Branch || !shouldScheduleAdjacent(ST, nullptr, Branch)) - return; - - for (SDep &PredDep : ExitSU.Preds) { - if (PredDep.isWeak()) - continue; - SUnit &SU = *PredDep.getSUnit(); - MachineInstr &Pred = *SU.getInstr(); - if (!shouldScheduleAdjacent(ST, &Pred, Branch)) - continue; - - // Create a single weak edge from SU to ExitSU. The only effect is to cause - // bottom-up scheduling to heavily prioritize the clustered SU. There is no - // need to copy predecessor edges from ExitSU to SU, since top-down - // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling - // of SU, we could create an artificial edge from the deepest root, but it - // hasn't been needed yet. - bool Success = DAG->addEdge(&ExitSU, SDep(&SU, SDep::Cluster)); - (void)Success; - assert(Success && "No DAG nodes should be reachable from ExitSU"); - - // Adjust latency of data deps between the nodes. - for (SDep &PredDep : ExitSU.Preds) - if (PredDep.getSUnit() == &SU) - PredDep.setLatency(0); - for (SDep &SuccDep : SU.Succs) - if (SuccDep.getSUnit() == &ExitSU) - SuccDep.setLatency(0); - - ++NumFused; - DEBUG(dbgs() << DAG->MF.getName() << "(): Macro fuse "; - SU.print(dbgs(), DAG); - dbgs() << " - ExitSU" - << " / " << DAG->TII->getName(Pred.getOpcode()) << " - " - << DAG->TII->getName(Branch->getOpcode()) << '\n';); - - break; - } -} - -} // end namespace - namespace llvm { std::unique_ptr<ScheduleDAGMutation> createX86MacroFusionDAGMutation () { - return EnableMacroFusion ? make_unique<X86MacroFusion>() : nullptr; + return createBranchMacroFusionDAGMutation(shouldScheduleAdjacent); } } // end namespace llvm diff --git a/lib/Target/X86/X86MacroFusion.h b/lib/Target/X86/X86MacroFusion.h index e630f802e8e63..13fa2d78a0185 100644 --- a/lib/Target/X86/X86MacroFusion.h +++ b/lib/Target/X86/X86MacroFusion.h @@ -2,23 +2,18 @@ // // The LLVM Compiler Infrastructure // -// \file This file is distributed under the University of Illinois Open Source +// This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // -// This file contains the X86 definition of the DAG scheduling mutation to pair -// instructions back to back. +/// \file This file contains the X86 definition of the DAG scheduling mutation +/// to pair instructions back to back. // //===----------------------------------------------------------------------===// -#include "X86InstrInfo.h" #include "llvm/CodeGen/MachineScheduler.h" -//===----------------------------------------------------------------------===// -// X86MacroFusion - DAG post-processing to encourage fusion of macro ops. 
-//===----------------------------------------------------------------------===// - namespace llvm { /// Note that you have to add: diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 278b57eb00b74..a9f42cacf7886 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -91,6 +91,8 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { return llvm::make_unique<X86FreeBSDTargetObjectFile>(); if (TT.isOSLinux() || TT.isOSNaCl() || TT.isOSIAMCU()) return llvm::make_unique<X86LinuxNaClTargetObjectFile>(); + if (TT.isOSSolaris()) + return llvm::make_unique<X86SolarisTargetObjectFile>(); if (TT.isOSFuchsia()) return llvm::make_unique<X86FuchsiaTargetObjectFile>(); if (TT.isOSBinFormatELF()) diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp index 4fd95717478e9..8627c06d44313 100644 --- a/lib/Target/X86/X86TargetObjectFile.cpp +++ b/lib/Target/X86/X86TargetObjectFile.cpp @@ -86,6 +86,12 @@ X86LinuxNaClTargetObjectFile::Initialize(MCContext &Ctx, InitializeELF(TM.Options.UseInitArray); } +void X86SolarisTargetObjectFile::Initialize(MCContext &Ctx, + const TargetMachine &TM) { + TargetLoweringObjectFileELF::Initialize(Ctx, TM); + InitializeELF(TM.Options.UseInitArray); +} + const MCExpr *X86WindowsTargetObjectFile::lowerRelativeReference( const GlobalValue *LHS, const GlobalValue *RHS, const TargetMachine &TM) const { diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h index 39d2e84e5ed77..f6aa570b6332a 100644 --- a/lib/Target/X86/X86TargetObjectFile.h +++ b/lib/Target/X86/X86TargetObjectFile.h @@ -66,6 +66,11 @@ namespace llvm { void Initialize(MCContext &Ctx, const TargetMachine &TM) override; }; + /// \brief This implementation is used for Solaris on x86/x86-64. + class X86SolarisTargetObjectFile : public X86ELFTargetObjectFile { + void Initialize(MCContext &Ctx, const TargetMachine &TM) override; + }; + /// \brief This implementation is used for Windows targets on x86 and x86-64. class X86WindowsTargetObjectFile : public TargetLoweringObjectFileCOFF { const MCExpr * diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index 11ba7025e1b73..5ba8534d32d33 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -2178,17 +2178,6 @@ int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy, return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace); } -bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1, - TargetTransformInfo::LSRCost &C2) { - // X86 specific here are "instruction number 1st priority". - return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, - C1.NumIVMuls, C1.NumBaseAdds, - C1.ScaleCost, C1.ImmCost, C1.SetupCost) < - std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, - C2.NumIVMuls, C2.NumBaseAdds, - C2.ScaleCost, C2.ImmCost, C2.SetupCost); -} - bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) { Type *ScalarTy = DataTy->getScalarType(); int DataWidth = isa<PointerType>(ScalarTy) ? @@ -2243,6 +2232,12 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller, return (CallerBits & CalleeBits) == CalleeBits; } +bool X86TTIImpl::expandMemCmp(Instruction *I, unsigned &MaxLoadSize) { + // TODO: We can increase these based on available vector ops. + MaxLoadSize = ST->is64Bit() ? 
8 : 4; + return true; +} + bool X86TTIImpl::enableInterleavedAccessVectorization() { // TODO: We expect this to be beneficial regardless of arch, // but there are currently some unexplained performance artifacts on Atom. @@ -2250,6 +2245,114 @@ bool X86TTIImpl::enableInterleavedAccessVectorization() { return !(ST->isAtom()); } +// Get estimation for interleaved load/store operations for AVX2. +// \p Factor is the interleaved-access factor (stride) - number of +// (interleaved) elements in the group. +// \p Indices contains the indices for a strided load: when the +// interleaved load has gaps they indicate which elements are used. +// If Indices is empty (or if the number of indices is equal to the size +// of the interleaved-access as given in \p Factor) the access has no gaps. +// +// As opposed to AVX-512, AVX2 does not have generic shuffles that allow +// computing the cost using a generic formula as a function of generic +// shuffles. We therefore use a lookup table instead, filled according to +// the instruction sequences that codegen currently generates. +int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy, + unsigned Factor, + ArrayRef<unsigned> Indices, + unsigned Alignment, + unsigned AddressSpace) { + + // We currently support only fully-interleaved groups, with no gaps. + // TODO: Also support strided loads (interleaved groups with gaps). + if (Indices.size() && Indices.size() != Factor) + return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace); + + // VecTy for interleave memop is <VF*Factor x Elt>. + // So, for VF=4, Interleave Factor = 3, Element type = i32 we have + // VecTy = <12 x i32>. + MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second; + + // This function can be called with VecTy=<6xi128>, Factor=3, in which case + // VF=2, while v2i128 is an unsupported MVT vector type + // (see MachineValueType.h::getVectorVT()). + if (!LegalVT.isVector()) + return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace); + + unsigned VF = VecTy->getVectorNumElements() / Factor; + Type *ScalarTy = VecTy->getVectorElementType(); + + // Calculate the number of memory operations (NumOfMemOps) required + // to load/store the VecTy. + unsigned VecTySize = DL.getTypeStoreSize(VecTy); + unsigned LegalVTSize = LegalVT.getStoreSize(); + unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize; + + // Get the cost of one memory operation. + Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(), + LegalVT.getVectorNumElements()); + unsigned MemOpCost = + getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace); + + VectorType *VT = VectorType::get(ScalarTy, VF); + EVT ETy = TLI->getValueType(DL, VT); + if (!ETy.isSimple()) + return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace); + + // TODO: Complete for other data-types and strides. + // Each combination of Stride, ElementTy and VF results in a different + // sequence; the cost tables are therefore accessed with: + // Factor (stride) and VectorType=VFxElemType. + // The Cost accounts only for the shuffle sequence; + // The cost of the loads/stores is accounted for separately.
+ // + static const CostTblEntry AVX2InterleavedLoadTbl[] = { + { 3, MVT::v2i8, 10 }, //(load 6i8 and) deinterleave into 3 x 2i8 + { 3, MVT::v4i8, 4 }, //(load 12i8 and) deinterleave into 3 x 4i8 + { 3, MVT::v8i8, 9 }, //(load 24i8 and) deinterleave into 3 x 8i8 + { 3, MVT::v16i8, 18}, //(load 48i8 and) deinterleave into 3 x 16i8 + { 3, MVT::v32i8, 42 }, //(load 96i8 and) deinterleave into 3 x 32i8 + + { 4, MVT::v2i8, 12 }, //(load 8i8 and) deinterleave into 4 x 2i8 + { 4, MVT::v4i8, 4 }, //(load 16i8 and) deinterleave into 4 x 4i8 + { 4, MVT::v8i8, 20 }, //(load 32i8 and) deinterleave into 4 x 8i8 + { 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8 + { 4, MVT::v32i8, 80 } //(load 128i8 and) deinterleave into 4 x 32i8 + }; + + static const CostTblEntry AVX2InterleavedStoreTbl[] = { + { 3, MVT::v2i8, 7 }, //interleave 3 x 2i8 into 6i8 (and store) + { 3, MVT::v4i8, 8 }, //interleave 3 x 4i8 into 12i8 (and store) + { 3, MVT::v8i8, 11 }, //interleave 3 x 8i8 into 24i8 (and store) + { 3, MVT::v16i8, 17 }, //interleave 3 x 16i8 into 48i8 (and store) + { 3, MVT::v32i8, 32 }, //interleave 3 x 32i8 into 96i8 (and store) + + { 4, MVT::v2i8, 12 }, //interleave 4 x 2i8 into 8i8 (and store) + { 4, MVT::v4i8, 9 }, //interleave 4 x 4i8 into 16i8 (and store) + { 4, MVT::v8i8, 16 }, //interleave 4 x 8i8 into 32i8 (and store) + { 4, MVT::v16i8, 20 }, //interleave 4 x 16i8 into 64i8 (and store) + { 4, MVT::v32i8, 40 } //interleave 4 x 32i8 into 128i8 (and store) + }; + + if (Opcode == Instruction::Load) { + if (const auto *Entry = + CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT())) + return NumOfMemOps * MemOpCost + Entry->Cost; + } else { + assert(Opcode == Instruction::Store && + "Expected Store Instruction at this point"); + if (const auto *Entry = + CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT())) + return NumOfMemOps * MemOpCost + Entry->Cost; + } + + return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace); +} + // Get estimation for interleaved load/store operations and strided load. // \p Indices contains indices for strided load. // \p Factor - the factor of interleaving. 
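To make the table lookup concrete, the returned cost composes the per-memop cost with the table entry as NumOfMemOps * MemOpCost + Entry->Cost. A hand-worked example follows (a sketch only; the MemOpCost of 1 for a legal 256-bit AVX2 load is an assumed value, not computed from the patch):

// Hypothetical walk-through for an interleaved load on AVX2:
//   VecTy       = <64 x i8>, Factor = 4   =>  VF = 64 / 4 = 16, ETy = v16i8
//   LegalVT     = v32i8 (the 512-bit vector splits into two 256-bit halves)
//   VecTySize   = 64 bytes, LegalVTSize = 32 bytes
//   NumOfMemOps = (64 + 32 - 1) / 32 = 2
//   Table entry {4, MVT::v16i8, 39} from AVX2InterleavedLoadTbl
//   Total cost  = NumOfMemOps * MemOpCost + 39 = 2 * 1 + 39 = 41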
@@ -2358,6 +2461,10 @@ int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, if (ST->hasAVX512() && HasAVX512Solution && (!RequiresBW || ST->hasBWI())) return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices, Alignment, AddressSpace); + if (ST->hasAVX2()) + return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace); + return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, Alignment, AddressSpace); } diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h index 09ce2c90498d9..ad0a0a2113012 100644 --- a/lib/Target/X86/X86TargetTransformInfo.h +++ b/lib/Target/X86/X86TargetTransformInfo.h @@ -93,6 +93,9 @@ public: int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, unsigned Alignment, unsigned AddressSpace); + int getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy, + unsigned Factor, ArrayRef<unsigned> Indices, + unsigned Alignment, unsigned AddressSpace); int getIntImmCost(int64_t); @@ -101,15 +104,13 @@ public: int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty); int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty); - bool isLSRCostLess(TargetTransformInfo::LSRCost &C1, - TargetTransformInfo::LSRCost &C2); bool isLegalMaskedLoad(Type *DataType); bool isLegalMaskedStore(Type *DataType); bool isLegalMaskedGather(Type *DataType); bool isLegalMaskedScatter(Type *DataType); bool areInlineCompatible(const Function *Caller, const Function *Callee) const; - + bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize); bool enableInterleavedAccessVectorization(); private: int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask, diff --git a/lib/Testing/Support/LLVMBuild.txt b/lib/Testing/Support/LLVMBuild.txt index 40853e8172d55..173cfb4a5587e 100644 --- a/lib/Testing/Support/LLVMBuild.txt +++ b/lib/Testing/Support/LLVMBuild.txt @@ -20,3 +20,4 @@ type = Library name = TestingSupport parent = Libraries required_libraries = Support +installed = 0 diff --git a/lib/ToolDrivers/llvm-lib/LibDriver.cpp b/lib/ToolDrivers/llvm-lib/LibDriver.cpp index 797e4ffc2d456..f304b9c9a8dac 100644 --- a/lib/ToolDrivers/llvm-lib/LibDriver.cpp +++ b/lib/ToolDrivers/llvm-lib/LibDriver.cpp @@ -31,7 +31,7 @@ namespace { enum { OPT_INVALID = 0, -#define OPTION(_1, _2, ID, _4, _5, _6, _7, _8, _9, _10, _11) OPT_##ID, +#define OPTION(_1, _2, ID, _4, _5, _6, _7, _8, _9, _10, _11, _12) OPT_##ID, #include "Options.inc" #undef OPTION }; @@ -41,11 +41,9 @@ enum { #undef PREFIX static const llvm::opt::OptTable::Info infoTable[] = { -#define OPTION(X1, X2, ID, KIND, GROUP, ALIAS, X6, X7, X8, X9, X10) \ - { \ - X1, X2, X9, X10, OPT_##ID, llvm::opt::Option::KIND##Class, X8, X7, \ - OPT_##GROUP, OPT_##ALIAS, X6 \ - }, +#define OPTION(X1, X2, ID, KIND, GROUP, ALIAS, X6, X7, X8, X9, X10, X11) \ + {X1, X2, X9, X10, OPT_##ID, llvm::opt::Option::KIND##Class, \ + X8, X7, OPT_##GROUP, OPT_##ALIAS, X6, X11}, #include "Options.inc" #undef OPTION }; diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index 4bc64ab698ff9..087a8aa2c624d 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -72,10 +72,6 @@ static cl::opt<bool> RunLoopRerolling("reroll-loops", cl::Hidden, cl::desc("Run the loop rerolling pass")); -static cl::opt<bool> RunLoadCombine("combine-loads", cl::init(false), - cl::Hidden, - 
cl::desc("Run the load combining pass")); - static cl::opt<bool> RunNewGVN("enable-newgvn", cl::init(false), cl::Hidden, cl::desc("Run the NewGVN pass")); @@ -174,7 +170,6 @@ PassManagerBuilder::PassManagerBuilder() { SLPVectorize = RunSLPVectorization; LoopVectorize = RunLoopVectorization; RerollLoops = RunLoopRerolling; - LoadCombine = RunLoadCombine; NewGVN = RunNewGVN; DisableGVNLoadPRE = false; VerifyInput = false; @@ -296,6 +291,8 @@ void PassManagerBuilder::addPGOInstrPasses(legacy::PassManagerBase &MPM) { InstrProfOptions Options; if (!PGOInstrGen.empty()) Options.InstrProfileOutput = PGOInstrGen; + Options.DoCounterPromotion = true; + MPM.add(createLoopRotatePass()); MPM.add(createInstrProfilingLegacyPass(Options)); } if (!PGOInstrUse.empty()) @@ -407,9 +404,6 @@ void PassManagerBuilder::addFunctionSimplificationPasses( } } - if (LoadCombine) - MPM.add(createLoadCombinePass()); - MPM.add(createAggressiveDCEPass()); // Delete dead instructions MPM.add(createCFGSimplificationPass()); // Merge & remove BBs // Clean up after everything. @@ -850,9 +844,6 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { // alignments. PM.add(createAlignmentFromAssumptionsPass()); - if (LoadCombine) - PM.add(createLoadCombinePass()); - // Cleanup and simplify the code after the scalar optimizations. addInstructionCombiningPass(PM); addExtensionsToPM(EP_Peephole, PM); diff --git a/lib/Transforms/IPO/SampleProfile.cpp b/lib/Transforms/IPO/SampleProfile.cpp index 67bc8f5f6b7ad..656421ee58df8 100644 --- a/lib/Transforms/IPO/SampleProfile.cpp +++ b/lib/Transforms/IPO/SampleProfile.cpp @@ -690,6 +690,9 @@ bool SampleProfileLoader::inlineHotFunctions( for (auto I : CIS) { InlineFunctionInfo IFI(nullptr, ACT ? &GetAssumptionCache : nullptr); Function *CalledFunction = CallSite(I).getCalledFunction(); + // Do not inline recursive calls. + if (CalledFunction == &F) + continue; Instruction *DI = I; if (!CalledFunction && !PromotedInsns.count(I) && CallSite(I).isIndirectCall()) diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp index 287a5167fe2ae..d5f0dd1914157 100644 --- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -988,15 +988,24 @@ static Instruction *foldAddWithConstant(BinaryOperator &Add, return new ZExtInst(Builder.CreateNUWAdd(X, NewC), Ty); } - // Shifts and add used to flip and mask off the low bit: - // add (ashr (shl i32 X, 31), 31), 1 --> and (not X), 1 - const APInt *C3; - if (C->isOneValue() && - match(Op0, - m_OneUse(m_AShr(m_Shl(m_Value(X), m_APInt(C2)), m_APInt(C3)))) && - C2 == C3 && *C2 == Ty->getScalarSizeInBits() - 1) { - Value *NotX = Builder.CreateNot(X); - return BinaryOperator::CreateAnd(NotX, ConstantInt::get(Ty, 1)); + if (C->isOneValue() && Op0->hasOneUse()) { + // add (sext i1 X), 1 --> zext (not X) + // TODO: The smallest IR representation is (select X, 0, 1), and that would + // not require the one-use check. But we need to remove a transform in + // visitSelect and make sure that IR value tracking for select is equal or + // better than for these ops. 
+ if (match(Op0, m_SExt(m_Value(X))) && + X->getType()->getScalarSizeInBits() == 1) + return new ZExtInst(Builder.CreateNot(X), Ty); + + // Shifts and add used to flip and mask off the low bit: + // add (ashr (shl i32 X, 31), 31), 1 --> and (not X), 1 + const APInt *C3; + if (match(Op0, m_AShr(m_Shl(m_Value(X), m_APInt(C2)), m_APInt(C3))) && + C2 == C3 && *C2 == Ty->getScalarSizeInBits() - 1) { + Value *NotX = Builder.CreateNot(X); + return BinaryOperator::CreateAnd(NotX, ConstantInt::get(Ty, 1)); + } } return nullptr; diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index a881bda5ba98d..d3d8cefe97353 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -1097,20 +1097,11 @@ static Instruction *foldLogicCastConstant(BinaryOperator &Logic, CastInst *Cast, Type *DestTy = Logic.getType(); Type *SrcTy = Cast->getSrcTy(); - // If the first operand is bitcast, move the logic operation ahead of the - // bitcast (do the logic operation in the original type). This can eliminate - // bitcasts and allow combines that would otherwise be impeded by the bitcast. + // Move the logic operation ahead of a zext if the constant is unchanged in + // the smaller source type. Performing the logic in a smaller type may provide + // more information to later folds, and the smaller logic instruction may be + // cheaper (particularly in the case of vectors). Value *X; - if (match(Cast, m_BitCast(m_Value(X)))) { - Value *NewConstant = ConstantExpr::getBitCast(C, SrcTy); - Value *NewOp = Builder->CreateBinOp(LogicOpc, X, NewConstant); - return CastInst::CreateBitOrPointerCast(NewOp, DestTy); - } - - // Similarly, move the logic operation ahead of a zext if the constant is - // unchanged in the smaller source type. Performing the logic in a smaller - // type may provide more information to later folds, and the smaller logic - // instruction may be cheaper (particularly in the case of vectors). if (match(Cast, m_OneUse(m_ZExt(m_Value(X))))) { Constant *TruncC = ConstantExpr::getTrunc(C, SrcTy); Constant *ZextTruncC = ConstantExpr::getZExt(TruncC, DestTy); @@ -1239,9 +1230,10 @@ static Instruction *foldAndToXor(BinaryOperator &I, // (A | ~B) & (B | ~A) --> ~(A ^ B) // (~B | A) & (~A | B) --> ~(A ^ B) // (~B | A) & (B | ~A) --> ~(A ^ B) - if (match(Op0, m_c_Or(m_Value(A), m_Not(m_Value(B)))) && - match(Op1, m_c_Or(m_Not(m_Specific(A)), m_Specific(B)))) - return BinaryOperator::CreateNot(Builder.CreateXor(A, B)); + if (Op0->hasOneUse() || Op1->hasOneUse()) + if (match(Op0, m_c_Or(m_Value(A), m_Not(m_Value(B)))) && + match(Op1, m_c_Or(m_Not(m_Specific(A)), m_Specific(B)))) + return BinaryOperator::CreateNot(Builder.CreateXor(A, B)); return nullptr; } @@ -1256,9 +1248,10 @@ static Instruction *foldOrToXor(BinaryOperator &I, // Operand complexity canonicalization guarantees that the 'and' is Op0. 
// (A & B) | ~(A | B) --> ~(A ^ B) // (A & B) | ~(B | A) --> ~(A ^ B) - if (match(Op0, m_And(m_Value(A), m_Value(B))) && - match(Op1, m_Not(m_c_Or(m_Specific(A), m_Specific(B))))) - return BinaryOperator::CreateNot(Builder.CreateXor(A, B)); + if (Op0->hasOneUse() || Op1->hasOneUse()) + if (match(Op0, m_And(m_Value(A), m_Value(B))) && + match(Op1, m_Not(m_c_Or(m_Specific(A), m_Specific(B))))) + return BinaryOperator::CreateNot(Builder.CreateXor(A, B)); // (A & ~B) | (~A & B) --> A ^ B // (A & ~B) | (B & ~A) --> A ^ B @@ -1442,13 +1435,13 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { // (A ^ B) & ((B ^ C) ^ A) -> (A ^ B) & ~C if (match(Op0, m_Xor(m_Value(A), m_Value(B)))) if (match(Op1, m_Xor(m_Xor(m_Specific(B), m_Value(C)), m_Specific(A)))) - if (Op1->hasOneUse() || cast<BinaryOperator>(Op1)->hasOneUse()) + if (Op1->hasOneUse() || IsFreeToInvert(C, C->hasOneUse())) return BinaryOperator::CreateAnd(Op0, Builder->CreateNot(C)); // ((A ^ C) ^ B) & (B ^ A) -> (B ^ A) & ~C if (match(Op0, m_Xor(m_Xor(m_Value(A), m_Value(C)), m_Value(B)))) if (match(Op1, m_Xor(m_Specific(B), m_Specific(A)))) - if (Op0->hasOneUse() || cast<BinaryOperator>(Op0)->hasOneUse()) + if (Op0->hasOneUse() || IsFreeToInvert(C, C->hasOneUse())) return BinaryOperator::CreateAnd(Op1, Builder->CreateNot(C)); // (A | B) & ((~A) ^ B) -> (A & B) @@ -1579,11 +1572,14 @@ static Value *getSelectCondition(Value *A, Value *B, // If A and B are sign-extended, look through the sexts to find the booleans. Value *Cond; + Value *NotB; if (match(A, m_SExt(m_Value(Cond))) && Cond->getType()->getScalarType()->isIntegerTy(1) && - match(B, m_CombineOr(m_Not(m_SExt(m_Specific(Cond))), - m_SExt(m_Not(m_Specific(Cond)))))) - return Cond; + match(B, m_OneUse(m_Not(m_Value(NotB))))) { + NotB = peekThroughBitcast(NotB, true); + if (match(NotB, m_SExt(m_Specific(Cond)))) + return Cond; + } // All scalar (and most vector) possibilities should be handled now. // Try more matches that only apply to non-splat constant vectors. @@ -1615,12 +1611,8 @@ static Value *matchSelectFromAndOr(Value *A, Value *C, Value *B, Value *D, // The potential condition of the select may be bitcasted. In that case, look // through its bitcast and the corresponding bitcast of the 'not' condition. Type *OrigType = A->getType(); - Value *SrcA, *SrcB; - if (match(A, m_OneUse(m_BitCast(m_Value(SrcA)))) && - match(B, m_OneUse(m_BitCast(m_Value(SrcB))))) { - A = SrcA; - B = SrcB; - } + A = peekThroughBitcast(A, true); + B = peekThroughBitcast(B, true); if (Value *Cond = getSelectCondition(A, B, Builder)) { // ((bc Cond) & C) | ((bc ~Cond) & D) --> bc (select Cond, (bc C), (bc D)) @@ -1922,8 +1914,9 @@ Value *InstCombiner::foldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { /// (A & C1) | B /// /// when the XOR of the two constants is "all ones" (-1). -Instruction *InstCombiner::FoldOrWithConstants(BinaryOperator &I, Value *Op, - Value *A, Value *B, Value *C) { +static Instruction *FoldOrWithConstants(BinaryOperator &I, Value *Op, + Value *A, Value *B, Value *C, + InstCombiner::BuilderTy *Builder) { ConstantInt *CI1 = dyn_cast<ConstantInt>(C); if (!CI1) return nullptr; @@ -1944,15 +1937,16 @@ Instruction *InstCombiner::FoldOrWithConstants(BinaryOperator &I, Value *Op, /// \brief This helper function folds: /// -/// ((A | B) & C1) ^ (B & C2) +/// ((A ^ B) & C1) | (B & C2) /// /// into: /// /// (A & C1) ^ B /// /// when the XOR of the two constants is "all ones" (-1). 
-Instruction *InstCombiner::FoldXorWithConstants(BinaryOperator &I, Value *Op, - Value *A, Value *B, Value *C) { +static Instruction *FoldXorWithConstants(BinaryOperator &I, Value *Op, + Value *A, Value *B, Value *C, + InstCombiner::BuilderTy *Builder) { ConstantInt *CI1 = dyn_cast<ConstantInt>(C); if (!CI1) return nullptr; @@ -2112,46 +2106,36 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { } // ((A|B)&1)|(B&-2) -> (A&1) | B - if (match(A, m_Or(m_Value(V1), m_Specific(B))) || - match(A, m_Or(m_Specific(B), m_Value(V1)))) { - Instruction *Ret = FoldOrWithConstants(I, Op1, V1, B, C); - if (Ret) return Ret; + if (match(A, m_c_Or(m_Value(V1), m_Specific(B)))) { + if (Instruction *Ret = FoldOrWithConstants(I, Op1, V1, B, C, Builder)) + return Ret; } // (B&-2)|((A|B)&1) -> (A&1) | B - if (match(B, m_Or(m_Specific(A), m_Value(V1))) || - match(B, m_Or(m_Value(V1), m_Specific(A)))) { - Instruction *Ret = FoldOrWithConstants(I, Op0, A, V1, D); - if (Ret) return Ret; + if (match(B, m_c_Or(m_Specific(A), m_Value(V1)))) { + if (Instruction *Ret = FoldOrWithConstants(I, Op0, A, V1, D, Builder)) + return Ret; } // ((A^B)&1)|(B&-2) -> (A&1) ^ B - if (match(A, m_Xor(m_Value(V1), m_Specific(B))) || - match(A, m_Xor(m_Specific(B), m_Value(V1)))) { - Instruction *Ret = FoldXorWithConstants(I, Op1, V1, B, C); - if (Ret) return Ret; + if (match(A, m_c_Xor(m_Value(V1), m_Specific(B)))) { + if (Instruction *Ret = FoldXorWithConstants(I, Op1, V1, B, C, Builder)) + return Ret; } // (B&-2)|((A^B)&1) -> (A&1) ^ B - if (match(B, m_Xor(m_Specific(A), m_Value(V1))) || - match(B, m_Xor(m_Value(V1), m_Specific(A)))) { - Instruction *Ret = FoldXorWithConstants(I, Op0, A, V1, D); - if (Ret) return Ret; + if (match(B, m_c_Xor(m_Specific(A), m_Value(V1)))) { + if (Instruction *Ret = FoldXorWithConstants(I, Op0, A, V1, D, Builder)) + return Ret; } } // (A ^ B) | ((B ^ C) ^ A) -> (A ^ B) | C - // FIXME: The two hasOneUse calls here are the same call, maybe we were - // supposed to check Op1->operand(0)? if (match(Op0, m_Xor(m_Value(A), m_Value(B)))) if (match(Op1, m_Xor(m_Xor(m_Specific(B), m_Value(C)), m_Specific(A)))) - if (Op1->hasOneUse() || cast<BinaryOperator>(Op1)->hasOneUse()) - return BinaryOperator::CreateOr(Op0, C); + return BinaryOperator::CreateOr(Op0, C); // ((A ^ C) ^ B) | (B ^ A) -> (B ^ A) | C - // FIXME: The two hasOneUse calls here are the same call, maybe we were - // supposed to check Op0->operand(0)? if (match(Op0, m_Xor(m_Xor(m_Value(A), m_Value(C)), m_Value(B)))) if (match(Op1, m_Xor(m_Specific(B), m_Specific(A)))) - if (Op0->hasOneUse() || cast<BinaryOperator>(Op0)->hasOneUse()) - return BinaryOperator::CreateOr(Op1, C); + return BinaryOperator::CreateOr(Op1, C); // ((B | C) & A) | B -> B | (A & C) if (match(Op0, m_And(m_Or(m_Specific(Op1), m_Value(C)), m_Value(A)))) @@ -2357,6 +2341,30 @@ Value *InstCombiner::foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS) { } } + // Instead of trying to imitate the folds for and/or, decompose this 'xor' + // into those logic ops. That is, try to turn this into an and-of-icmps + // because we have many folds for that pattern. + // + // This is based on a truth table definition of xor: + // X ^ Y --> (X | Y) & !(X & Y) + if (Value *OrICmp = SimplifyBinOp(Instruction::Or, LHS, RHS, SQ)) { + // TODO: If OrICmp is true, then the definition of xor simplifies to !(X&Y). + // TODO: If OrICmp is false, the whole thing is false (InstSimplify?). 
+ if (Value *AndICmp = SimplifyBinOp(Instruction::And, LHS, RHS, SQ)) { + // TODO: Independently handle cases where the 'and' side is a constant. + if (OrICmp == LHS && AndICmp == RHS && RHS->hasOneUse()) { + // (LHS | RHS) & !(LHS & RHS) --> LHS & !RHS + RHS->setPredicate(RHS->getInversePredicate()); + return Builder->CreateAnd(LHS, RHS); + } + if (OrICmp == RHS && AndICmp == LHS && LHS->hasOneUse()) { + // !(LHS & RHS) & (LHS | RHS) --> !LHS & RHS + LHS->setPredicate(LHS->getInversePredicate()); + return Builder->CreateAnd(LHS, RHS); + } + } + } + return nullptr; } diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index c0830a5d21124..dbed7ad4eae84 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1409,6 +1409,47 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) { } } + // Add range metadata since known bits can't completely reflect what we know. + // TODO: Handle splat vectors. + auto *IT = dyn_cast<IntegerType>(Op0->getType()); + if (IT && IT->getBitWidth() != 1 && !II.getMetadata(LLVMContext::MD_range)) { + Metadata *LowAndHigh[] = { + ConstantAsMetadata::get(ConstantInt::get(IT, DefiniteZeros)), + ConstantAsMetadata::get(ConstantInt::get(IT, PossibleZeros + 1))}; + II.setMetadata(LLVMContext::MD_range, + MDNode::get(II.getContext(), LowAndHigh)); + return &II; + } + + return nullptr; +} + +static Instruction *foldCtpop(IntrinsicInst &II, InstCombiner &IC) { + assert(II.getIntrinsicID() == Intrinsic::ctpop && + "Expected ctpop intrinsic"); + Value *Op0 = II.getArgOperand(0); + // FIXME: Try to simplify vectors of integers. + auto *IT = dyn_cast<IntegerType>(Op0->getType()); + if (!IT) + return nullptr; + + unsigned BitWidth = IT->getBitWidth(); + KnownBits Known(BitWidth); + IC.computeKnownBits(Op0, Known, 0, &II); + + unsigned MinCount = Known.countMinPopulation(); + unsigned MaxCount = Known.countMaxPopulation(); + + // Add range metadata since known bits can't completely reflect what we know. + if (IT->getBitWidth() != 1 && !II.getMetadata(LLVMContext::MD_range)) { + Metadata *LowAndHigh[] = { + ConstantAsMetadata::get(ConstantInt::get(IT, MinCount)), + ConstantAsMetadata::get(ConstantInt::get(IT, MaxCount + 1))}; + II.setMetadata(LLVMContext::MD_range, + MDNode::get(II.getContext(), LowAndHigh)); + return &II; + } + return nullptr; } @@ -1981,6 +2022,11 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return I; break; + case Intrinsic::ctpop: + if (auto *I = foldCtpop(*II, *this)) + return I; + break; + case Intrinsic::uadd_with_overflow: case Intrinsic::sadd_with_overflow: case Intrinsic::umul_with_overflow: diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp index 38e95fb116396..d3049389dfb9f 100644 --- a/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -1896,6 +1896,18 @@ static Instruction *foldBitCastBitwiseLogic(BitCastInst &BitCast, return BinaryOperator::Create(BO->getOpcode(), CastedOp0, X); } + // Canonicalize vector bitcasts to come before vector bitwise logic with a + // constant. This eases recognition of special constants for later ops. 
+ // Example: + // icmp u/s (a ^ signmask), (b ^ signmask) --> icmp s/u a, b + Constant *C; + if (match(BO->getOperand(1), m_Constant(C))) { + // bitcast (logic X, C) --> logic (bitcast X, C') + Value *CastedOp0 = Builder.CreateBitCast(BO->getOperand(0), DestTy); + Value *CastedC = ConstantExpr::getBitCast(C, DestTy); + return BinaryOperator::Create(BO->getOpcode(), CastedOp0, CastedC); + } + return nullptr; } diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index 1ef4acfb058c4..6ad32490a3288 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -2434,6 +2434,77 @@ Instruction *InstCombiner::foldICmpAddConstant(ICmpInst &Cmp, return nullptr; } +bool InstCombiner::matchThreeWayIntCompare(SelectInst *SI, Value *&LHS, + Value *&RHS, ConstantInt *&Less, + ConstantInt *&Equal, + ConstantInt *&Greater) { + // TODO: Generalize this to work with other comparison idioms or ensure + // they get canonicalized into this form. + + // select i1 (a == b), i32 Equal, i32 (select i1 (a < b), i32 Less, i32 + // Greater), where Equal, Less and Greater are placeholders for any three + // constants. + ICmpInst::Predicate PredA, PredB; + if (match(SI->getTrueValue(), m_ConstantInt(Equal)) && + match(SI->getCondition(), m_ICmp(PredA, m_Value(LHS), m_Value(RHS))) && + PredA == ICmpInst::ICMP_EQ && + match(SI->getFalseValue(), + m_Select(m_ICmp(PredB, m_Specific(LHS), m_Specific(RHS)), + m_ConstantInt(Less), m_ConstantInt(Greater))) && + PredB == ICmpInst::ICMP_SLT) { + return true; + } + return false; +} + +Instruction *InstCombiner::foldICmpSelectConstant(ICmpInst &Cmp, + Instruction *Select, + ConstantInt *C) { + + assert(C && "Cmp RHS should be a constant int!"); + // If we're testing a constant value against the result of a three way + // comparison, the result can be expressed directly in terms of the + // original values being compared. Note: We could possibly be more + // aggressive here and remove the hasOneUse test. The original select is + // really likely to simplify or sink when we remove a test of the result. + Value *OrigLHS, *OrigRHS; + ConstantInt *C1LessThan, *C2Equal, *C3GreaterThan; + if (Cmp.hasOneUse() && + matchThreeWayIntCompare(cast<SelectInst>(Select), OrigLHS, OrigRHS, + C1LessThan, C2Equal, C3GreaterThan)) { + assert(C1LessThan && C2Equal && C3GreaterThan); + + bool TrueWhenLessThan = + ConstantExpr::getCompare(Cmp.getPredicate(), C1LessThan, C) + ->isAllOnesValue(); + bool TrueWhenEqual = + ConstantExpr::getCompare(Cmp.getPredicate(), C2Equal, C) + ->isAllOnesValue(); + bool TrueWhenGreaterThan = + ConstantExpr::getCompare(Cmp.getPredicate(), C3GreaterThan, C) + ->isAllOnesValue(); + + // This generates the new instruction that will replace the original Cmp + // Instruction. Instead of enumerating the various combinations when + // TrueWhenLessThan, TrueWhenEqual and TrueWhenGreaterThan are true versus + // false, we rely on chaining of ORs and future passes of InstCombine to + // simplify the OR further (i.e. a s< b || a == b becomes a s<= b). + + // When none of the three constants satisfy the predicate for the RHS (C), + // the entire original Cmp can be simplified to a false. 
+ Value *Cond = Builder->getFalse(); + if (TrueWhenLessThan) + Cond = Builder->CreateOr(Cond, Builder->CreateICmp(ICmpInst::ICMP_SLT, OrigLHS, OrigRHS)); + if (TrueWhenEqual) + Cond = Builder->CreateOr(Cond, Builder->CreateICmp(ICmpInst::ICMP_EQ, OrigLHS, OrigRHS)); + if (TrueWhenGreaterThan) + Cond = Builder->CreateOr(Cond, Builder->CreateICmp(ICmpInst::ICMP_SGT, OrigLHS, OrigRHS)); + + return replaceInstUsesWith(Cmp, Cond); + } + return nullptr; +} + /// Try to fold integer comparisons with a constant operand: icmp Pred X, C /// where X is some kind of instruction. Instruction *InstCombiner::foldICmpInstWithConstant(ICmpInst &Cmp) { @@ -2493,11 +2564,28 @@ Instruction *InstCombiner::foldICmpInstWithConstant(ICmpInst &Cmp) { return I; } + // Match against CmpInst LHS being instructions other than binary operators. Instruction *LHSI; - if (match(Cmp.getOperand(0), m_Instruction(LHSI)) && - LHSI->getOpcode() == Instruction::Trunc) - if (Instruction *I = foldICmpTruncConstant(Cmp, LHSI, C)) - return I; + if (match(Cmp.getOperand(0), m_Instruction(LHSI))) { + switch (LHSI->getOpcode()) { + case Instruction::Select: + { + // For now, we only support constant integers while folding the + // ICMP(SELECT) pattern. We can extend this to support vectors of integers + // similar to the cases handled by binary ops above. + if (ConstantInt *ConstRHS = dyn_cast<ConstantInt>(Cmp.getOperand(1))) + if (Instruction *I = foldICmpSelectConstant(Cmp, LHSI, ConstRHS)) + return I; + break; + } + case Instruction::Trunc: + if (Instruction *I = foldICmpTruncConstant(Cmp, LHSI, C)) + return I; + break; + default: + break; + } + } if (Instruction *I = foldICmpIntrinsicWithConstant(Cmp, C)) return I; @@ -3110,8 +3198,7 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) { if (BO0) { // Transform A & (L - 1) `ult` L --> L != 0 auto LSubOne = m_Add(m_Specific(Op1), m_AllOnes()); - auto BitwiseAnd = - m_CombineOr(m_And(m_Value(), LSubOne), m_And(LSubOne, m_Value())); + auto BitwiseAnd = m_c_And(m_Value(), LSubOne); if (match(BO0, BitwiseAnd) && Pred == ICmpInst::ICMP_ULT) { auto *Zero = Constant::getNullValue(BO0->getType()); diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h index 1a7db146df426..1b0fe84dd4dda 100644 --- a/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/lib/Transforms/InstCombine/InstCombineInternal.h @@ -95,6 +95,18 @@ static inline bool isCanonicalPredicate(CmpInst::Predicate Pred) { } } +/// Return the source operand of a potentially bitcasted value while optionally +/// checking if it has one use. If there is no bitcast or the one use check is +/// not met, return the input value itself. +static inline Value *peekThroughBitcast(Value *V, bool OneUseOnly = false) { + if (auto *BitCast = dyn_cast<BitCastInst>(V)) + if (!OneUseOnly || BitCast->hasOneUse()) + return BitCast->getOperand(0); + + // V is not a bitcast or V has more than one use and OneUseOnly is true.
+ return V; +} + /// \brief Add one to a Constant static inline Constant *AddOne(Constant *C) { return ConstantExpr::getAdd(C, ConstantInt::get(C->getType(), 1)); @@ -276,10 +288,6 @@ public: Instruction *visitFDiv(BinaryOperator &I); Value *simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1, bool Inverted); Instruction *visitAnd(BinaryOperator &I); - Instruction *FoldOrWithConstants(BinaryOperator &I, Value *Op, Value *A, - Value *B, Value *C); - Instruction *FoldXorWithConstants(BinaryOperator &I, Value *Op, Value *A, - Value *B, Value *C); Instruction *visitOr(BinaryOperator &I); Instruction *visitXor(BinaryOperator &I); Instruction *visitShl(BinaryOperator &I); @@ -595,6 +603,15 @@ private: Instruction::BinaryOps, Value *, Value *, Value *, Value *); + /// Match a select chain which produces one of three values based on whether + /// the LHS is less than, equal to, or greater than RHS respectively. + /// Return true if we matched a three way compare idiom. The LHS, RHS, Less, + /// Equal and Greater values are saved in the matching process and returned to + /// the caller. + bool matchThreeWayIntCompare(SelectInst *SI, Value *&LHS, Value *&RHS, + ConstantInt *&Less, ConstantInt *&Equal, + ConstantInt *&Greater); + /// \brief Attempts to replace V with a simpler value based on the demanded /// bits. Value *SimplifyDemandedUseBits(Value *V, APInt DemandedMask, KnownBits &Known, @@ -672,6 +689,8 @@ private: Instruction *foldICmpBinOp(ICmpInst &Cmp); Instruction *foldICmpEquality(ICmpInst &Cmp); + Instruction *foldICmpSelectConstant(ICmpInst &Cmp, Instruction *Select, + ConstantInt *C); Instruction *foldICmpTruncConstant(ICmpInst &Cmp, Instruction *Trunc, const APInt *C); Instruction *foldICmpAndConstant(ICmpInst &Cmp, BinaryOperator *And, diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index a4d84ae81aa02..ca370c73fca44 100644 --- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -169,6 +169,18 @@ isOnlyCopiedFromConstantGlobal(AllocaInst *AI, return nullptr; } +/// Returns true if V is dereferenceable for size of alloca. +static bool isDereferenceableForAllocaSize(const Value *V, const AllocaInst *AI, + const DataLayout &DL) { + if (AI->isArrayAllocation()) + return false; + uint64_t AllocaSize = DL.getTypeStoreSize(AI->getAllocatedType()); + if (!AllocaSize) + return false; + return isDereferenceableAndAlignedPointer(V, AI->getAlignment(), + APInt(64, AllocaSize), DL); +} + static Instruction *simplifyAllocaArraySize(InstCombiner &IC, AllocaInst &AI) { // Check for array size of 1 (scalar allocation). if (!AI.isArrayAllocation()) { @@ -390,7 +402,8 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { if (MemTransferInst *Copy = isOnlyCopiedFromConstantGlobal(&AI, ToDelete)) { unsigned SourceAlign = getOrEnforceKnownAlignment( Copy->getSource(), AI.getAlignment(), DL, &AI, &AC, &DT); - if (AI.getAlignment() <= SourceAlign) { + if (AI.getAlignment() <= SourceAlign && + isDereferenceableForAllocaSize(Copy->getSource(), &AI, DL)) { DEBUG(dbgs() << "Found alloca equal to global: " << AI << '\n'); DEBUG(dbgs() << " memcpy = " << *Copy << '\n'); for (unsigned i = 0, e = ToDelete.size(); i != e; ++i) @@ -476,21 +489,7 @@ static LoadInst *combineLoadToNewType(InstCombiner &IC, LoadInst &LI, Type *NewT break; case LLVMContext::MD_nonnull: - // This only directly applies if the new type is also a pointer. 
- if (NewTy->isPointerTy()) { - NewLoad->setMetadata(ID, N); - break; - } - // If it's integral now, translate it to !range metadata. - if (NewTy->isIntegerTy()) { - auto *ITy = cast<IntegerType>(NewTy); - auto *NullInt = ConstantExpr::getPtrToInt( - ConstantPointerNull::get(cast<PointerType>(Ptr->getType())), ITy); - auto *NonNullInt = - ConstantExpr::getAdd(NullInt, ConstantInt::get(ITy, 1)); - NewLoad->setMetadata(LLVMContext::MD_range, - MDB.createRange(NonNullInt, NullInt)); - } + copyNonnullMetadata(LI, N, *NewLoad); break; case LLVMContext::MD_align: case LLVMContext::MD_dereferenceable: @@ -500,17 +499,7 @@ static LoadInst *combineLoadToNewType(InstCombiner &IC, LoadInst &LI, Type *NewT NewLoad->setMetadata(ID, N); break; case LLVMContext::MD_range: - // FIXME: It would be nice to propagate this in some way, but the type - // conversions make it hard. - - // If it's a pointer now and the range does not contain 0, make it !nonnull. - if (NewTy->isPointerTy()) { - unsigned BitWidth = IC.getDataLayout().getTypeSizeInBits(NewTy); - if (!getConstantRangeFromMetadata(*N).contains(APInt(BitWidth, 0))) { - MDNode *NN = MDNode::get(LI.getContext(), None); - NewLoad->setMetadata(LLVMContext::MD_nonnull, NN); - } - } + copyRangeMetadata(IC.getDataLayout(), LI, N, *NewLoad); break; } } diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp index b9674d85634dc..33951e66497a1 100644 --- a/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -303,7 +303,7 @@ Instruction *InstCombiner::foldSelectIntoOp(SelectInst &SI, Value *TrueVal, /// We want to turn: /// (select (icmp eq (and X, C1), 0), Y, (or Y, C2)) /// into: -/// (or (shl (and X, C1), C3), y) +/// (or (shl (and X, C1), C3), Y) /// iff: /// C1 and C2 are both powers of 2 /// where: @@ -317,19 +317,44 @@ static Value *foldSelectICmpAndOr(const SelectInst &SI, Value *TrueVal, Value *FalseVal, InstCombiner::BuilderTy *Builder) { const ICmpInst *IC = dyn_cast<ICmpInst>(SI.getCondition()); - if (!IC || !IC->isEquality() || !SI.getType()->isIntegerTy()) + if (!IC || !SI.getType()->isIntegerTy()) return nullptr; Value *CmpLHS = IC->getOperand(0); Value *CmpRHS = IC->getOperand(1); - if (!match(CmpRHS, m_Zero())) - return nullptr; + Value *V; + unsigned C1Log; + bool IsEqualZero; + bool NeedAnd = false; + if (IC->isEquality()) { + if (!match(CmpRHS, m_Zero())) + return nullptr; + + const APInt *C1; + if (!match(CmpLHS, m_And(m_Value(), m_Power2(C1)))) + return nullptr; + + V = CmpLHS; + C1Log = C1->logBase2(); + IsEqualZero = IC->getPredicate() == ICmpInst::ICMP_EQ; + } else if (IC->getPredicate() == ICmpInst::ICMP_SLT || + IC->getPredicate() == ICmpInst::ICMP_SGT) { + // We also need to recognize (icmp slt (trunc (X)), 0) and + // (icmp sgt (trunc (X)), -1). + IsEqualZero = IC->getPredicate() == ICmpInst::ICMP_SGT; + if ((IsEqualZero && !match(CmpRHS, m_AllOnes())) || + (!IsEqualZero && !match(CmpRHS, m_Zero()))) + return nullptr; + + if (!match(CmpLHS, m_OneUse(m_Trunc(m_Value(V))))) + return nullptr; - Value *X; - const APInt *C1; - if (!match(CmpLHS, m_And(m_Value(X), m_Power2(C1)))) + C1Log = CmpLHS->getType()->getScalarSizeInBits() - 1; + NeedAnd = true; + } else { return nullptr; + } const APInt *C2; bool OrOnTrueVal = false; @@ -340,11 +365,27 @@ static Value *foldSelectICmpAndOr(const SelectInst &SI, Value *TrueVal, if (!OrOnFalseVal && !OrOnTrueVal) return nullptr; - Value *V = CmpLHS; Value *Y = OrOnFalseVal ? 
TrueVal : FalseVal; - unsigned C1Log = C1->logBase2(); unsigned C2Log = C2->logBase2(); + + bool NeedXor = (!IsEqualZero && OrOnFalseVal) || (IsEqualZero && OrOnTrueVal); + bool NeedShift = C1Log != C2Log; + bool NeedZExtTrunc = Y->getType()->getIntegerBitWidth() != + V->getType()->getIntegerBitWidth(); + + // Make sure we don't create more instructions than we save. + Value *Or = OrOnFalseVal ? FalseVal : TrueVal; + if ((NeedShift + NeedXor + NeedZExtTrunc) > + (IC->hasOneUse() + Or->hasOneUse())) + return nullptr; + + if (NeedAnd) { + // Insert the AND instruction on the input to the truncate. + APInt C1 = APInt::getOneBitSet(V->getType()->getScalarSizeInBits(), C1Log); + V = Builder->CreateAnd(V, ConstantInt::get(V->getType(), C1)); + } + if (C2Log > C1Log) { V = Builder->CreateZExtOrTrunc(V, Y->getType()); V = Builder->CreateShl(V, C2Log - C1Log); @@ -354,9 +395,7 @@ static Value *foldSelectICmpAndOr(const SelectInst &SI, Value *TrueVal, } else V = Builder->CreateZExtOrTrunc(V, Y->getType()); - ICmpInst::Predicate Pred = IC->getPredicate(); - if ((Pred == ICmpInst::ICMP_NE && OrOnFalseVal) || - (Pred == ICmpInst::ICMP_EQ && OrOnTrueVal)) + if (NeedXor) V = Builder->CreateXor(V, *C2); return Builder->CreateOr(V, Y); diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp index 8cec865c6422a..1bb1a85367d1b 100644 --- a/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -556,8 +556,7 @@ Instruction *InstCombiner::visitShl(BinaryOperator &I) { // The inexact versions are deferred to DAGCombine, so we don't hide shl // behind a bit mask. const APInt *ShOp1; - if (match(Op0, m_CombineOr(m_Exact(m_LShr(m_Value(X), m_APInt(ShOp1))), - m_Exact(m_AShr(m_Value(X), m_APInt(ShOp1)))))) { + if (match(Op0, m_Exact(m_Shr(m_Value(X), m_APInt(ShOp1))))) { unsigned ShrAmt = ShOp1->getZExtValue(); if (ShrAmt < ShAmt) { // If C1 < C2: (X >>?,exact C1) << C2 --> X << (C2 - C1) diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index 65e6d2e359052..02fac4fb37a4b 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -939,9 +939,19 @@ Instruction *InstCombiner::foldOpIntoPhi(Instruction &I, PHINode *PN) { // `TrueVInPred`. if (InC && !isa<ConstantExpr>(InC) && isa<ConstantInt>(InC)) InV = InC->isNullValue() ? FalseVInPred : TrueVInPred; - else + else { + // Generate the select in the same block as PN's current incoming block. + // Note: ThisBB need not be the NonConstBB, because vector constants + // (which are constants by definition) are also handled here. + // FIXME: This can lead to an increase in IR generation because we might + // generate selects for a vector constant phi operand that cannot be + // folded to TrueVInPred or FalseVInPred as is done for ConstantInt. For + // non-vector phis, this transformation was always profitable because + // the select would be generated exactly once in the NonConstBB.
+ Builder->SetInsertPoint(ThisBB->getTerminator()); InV = Builder->CreateSelect(PN->getIncomingValue(i), TrueVInPred, FalseVInPred, "phitmp"); + } NewPN->addIncoming(InV, ThisBB); } } else if (CmpInst *CI = dyn_cast<CmpInst>(&I)) { @@ -3002,6 +3012,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL, ++NumDeadInst; DEBUG(dbgs() << "IC: DCE: " << *Inst << '\n'); Inst->eraseFromParent(); + MadeIRChange = true; continue; } @@ -3015,6 +3026,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL, ++NumConstProp; if (isInstructionTriviallyDead(Inst, TLI)) Inst->eraseFromParent(); + MadeIRChange = true; continue; } diff --git a/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp index 0d308810009d5..4089d81ea3e1b 100644 --- a/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp +++ b/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp @@ -642,7 +642,12 @@ static bool promoteIndirectCalls(Module &M, bool InLTO, bool SamplePGO) { if (DisableICP) return false; InstrProfSymtab Symtab; - Symtab.create(M, InLTO); + if (Error E = Symtab.create(M, InLTO)) { + std::string SymtabFailure = toString(std::move(E)); + DEBUG(dbgs() << "Failed to create symtab: " << SymtabFailure << "\n"); + (void)SymtabFailure; + return false; + } bool Changed = false; for (auto &F : M) { if (F.isDeclaration()) diff --git a/lib/Transforms/Instrumentation/InstrProfiling.cpp b/lib/Transforms/Instrumentation/InstrProfiling.cpp index 37f88d5f95f18..9c14b0149fdc1 100644 --- a/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -19,12 +19,14 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/GlobalVariable.h" @@ -40,7 +42,10 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/ModuleUtils.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" #include <algorithm> #include <cassert> #include <cstddef> @@ -92,6 +97,35 @@ cl::opt<double> NumCountersPerValueSite( // is usually smaller than 2. cl::init(1.0)); +cl::opt<bool> AtomicCounterUpdatePromoted( + "atomic-counter-update-promoted", cl::ZeroOrMore, + cl::desc("Do counter update using atomic fetch add " "for promoted counters only"), + cl::init(false)); + +// If the option is not specified, the default behavior about whether +// counter promotion is done depends on how the instrumentation lowering +// pipeline is set up, i.e., the default value of this option does not by +// itself mean the promotion will be done by default. Explicitly +// setting this option can override the default behavior.
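The override rule described in the comment above is implemented later in this patch by isCounterPromotionEnabled(); a minimal standalone sketch of the same rule (a hypothetical Flag type stands in for cl::opt, which records getNumOccurrences()):

#include <iostream>

// Hypothetical stand-in for a command-line option: remembers whether the
// user actually passed it, like cl::opt::getNumOccurrences().
struct Flag {
  bool Value = false;
  unsigned NumOccurrences = 0;
  void set(bool V) { Value = V; ++NumOccurrences; }
};

// An explicit occurrence on the command line overrides the default the
// lowering pipeline asked for -- the logic of isCounterPromotionEnabled().
bool counterPromotionEnabled(const Flag &DoCounterPromotion,
                             bool PipelineDefault) {
  if (DoCounterPromotion.NumOccurrences > 0)
    return DoCounterPromotion.Value;
  return PipelineDefault;
}

int main() {
  Flag F;
  std::cout << counterPromotionEnabled(F, true) << '\n'; // 1: pipeline default wins
  F.set(false); // user passed -do-counter-promotion=false explicitly
  std::cout << counterPromotionEnabled(F, true) << '\n'; // 0: explicit flag wins
}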
+cl::opt<bool> DoCounterPromotion("do-counter-promotion", cl::ZeroOrMore, + cl::desc("Do counter register promotion"), + cl::init(false)); +cl::opt<unsigned> MaxNumOfPromotionsPerLoop( + cl::ZeroOrMore, "max-counter-promotions-per-loop", cl::init(10), + cl::desc("Max number of counter promotions per loop to avoid" + " increasing register pressure too much")); + +// A debug option +cl::opt<int> + MaxNumOfPromotions(cl::ZeroOrMore, "max-counter-promotions", cl::init(-1), + cl::desc("Max number of allowed counter promotions")); + +cl::opt<bool> SpeculativeCounterPromotion( + cl::ZeroOrMore, "speculative-counter-promotion", cl::init(false), + cl::desc("Allow counter promotion for loops with multiple exiting blocks " + "or top-tested loops")); + class InstrProfilingLegacyPass : public ModulePass { InstrProfiling InstrProf; @@ -116,6 +150,123 @@ public: } }; +/// A helper class to promote one counter RMW operation in the loop +/// into a register update. +/// +/// The RMW update for the counter will be sunk out of the loop after +/// the transformation. +/// +class PGOCounterPromoterHelper : public LoadAndStorePromoter { +public: + PGOCounterPromoterHelper(Instruction *L, Instruction *S, SSAUpdater &SSA, + Value *Init, BasicBlock *PH, + ArrayRef<BasicBlock *> ExitBlocks, + ArrayRef<Instruction *> InsertPts) + : LoadAndStorePromoter({L, S}, SSA), Store(S), ExitBlocks(ExitBlocks), + InsertPts(InsertPts) { + assert(isa<LoadInst>(L)); + assert(isa<StoreInst>(S)); + SSA.AddAvailableValue(PH, Init); + } + void doExtraRewritesBeforeFinalDeletion() const override { + for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) { + BasicBlock *ExitBlock = ExitBlocks[i]; + Instruction *InsertPos = InsertPts[i]; + // Get LiveIn value into the ExitBlock. If there are multiple + // predecessors, the value is defined by a PHI node in this + // block. + Value *LiveInValue = SSA.GetValueInMiddleOfBlock(ExitBlock); + Value *Addr = cast<StoreInst>(Store)->getPointerOperand(); + IRBuilder<> Builder(InsertPos); + if (AtomicCounterUpdatePromoted) + Builder.CreateAtomicRMW(AtomicRMWInst::Add, Addr, LiveInValue, + AtomicOrdering::SequentiallyConsistent); + else { + LoadInst *OldVal = Builder.CreateLoad(Addr, "pgocount.promoted"); + auto *NewVal = Builder.CreateAdd(OldVal, LiveInValue); + Builder.CreateStore(NewVal, Addr); + } + } + } + +private: + Instruction *Store; + ArrayRef<BasicBlock *> ExitBlocks; + ArrayRef<Instruction *> InsertPts; +}; + +/// A helper class to do register promotion for all profile counter +/// updates in a loop. +/// +class PGOCounterPromoter { +public: + PGOCounterPromoter(ArrayRef<LoadStorePair> Cands, Loop &Loop) + : Candidates(Cands), ExitBlocks(), InsertPts(), ParentLoop(Loop) { + + SmallVector<BasicBlock *, 8> LoopExitBlocks; + SmallPtrSet<BasicBlock *, 8> BlockSet; + ParentLoop.getExitBlocks(LoopExitBlocks); + + for (BasicBlock *ExitBlock : LoopExitBlocks) { + if (BlockSet.insert(ExitBlock).second) { + ExitBlocks.push_back(ExitBlock); + InsertPts.push_back(&*ExitBlock->getFirstInsertionPt()); + } + } + } + + bool run(int64_t *NumPromoted) { + // We can't insert into a catchswitch.
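To make concrete what PGOCounterPromoterHelper above achieves, here is a standalone source-level model (plain C++, not the pass itself; it models the non-atomic flush variant): the per-iteration RMW on the memory counter is replaced by a register accumulation that is flushed once in the exit block.

#include <cassert>
#include <cstdint>

int64_t Counter; // stands in for a profile counter slot in memory

// Unpromoted lowering: one load/add/store of the counter per iteration.
void loopUnpromoted(int N) {
  for (int I = 0; I < N; ++I)
    Counter = Counter + 1; // RMW on memory every trip
}

// Promoted lowering: the live value starts at 0 (the init value fed in from
// the preheader) and the total is added back to memory once at the loop
// exit, as doExtraRewritesBeforeFinalDeletion() does per exit block.
void loopPromoted(int N) {
  int64_t Live = 0;
  for (int I = 0; I < N; ++I)
    Live += 1;
  Counter = Counter + Live; // single flush in the exit block
}

int main() {
  Counter = 0; loopUnpromoted(10); assert(Counter == 10);
  Counter = 0; loopPromoted(10);  assert(Counter == 10);
}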
+ bool HasCatchSwitch = llvm::any_of(ExitBlocks, [](BasicBlock *Exit) { + return isa<CatchSwitchInst>(Exit->getTerminator()); + }); + + if (HasCatchSwitch) + return false; + + if (!ParentLoop.hasDedicatedExits()) + return false; + + BasicBlock *PH = ParentLoop.getLoopPreheader(); + if (!PH) + return false; + + BasicBlock *H = ParentLoop.getHeader(); + bool TopTested = + ((ParentLoop.getBlocks().size() > 1) && ParentLoop.isLoopExiting(H)); + if (!SpeculativeCounterPromotion && + (TopTested || ParentLoop.getExitingBlock() == nullptr)) + return false; + + unsigned Promoted = 0; + for (auto &Cand : Candidates) { + + SmallVector<PHINode *, 4> NewPHIs; + SSAUpdater SSA(&NewPHIs); + Value *InitVal = ConstantInt::get(Cand.first->getType(), 0); + PGOCounterPromoterHelper Promoter(Cand.first, Cand.second, SSA, InitVal, + PH, ExitBlocks, InsertPts); + Promoter.run(SmallVector<Instruction *, 2>({Cand.first, Cand.second})); + Promoted++; + if (Promoted >= MaxNumOfPromotionsPerLoop) + break; + (*NumPromoted)++; + if (MaxNumOfPromotions != -1 && *NumPromoted >= MaxNumOfPromotions) + break; + } + + DEBUG(dbgs() << Promoted << " counters promoted for loop (depth=" + << ParentLoop.getLoopDepth() << ")\n"); + return Promoted != 0; + } + +private: + ArrayRef<LoadStorePair> Candidates; + SmallVector<BasicBlock *, 8> ExitBlocks; + SmallVector<Instruction *, 8> InsertPts; + Loop &ParentLoop; +}; + } // end anonymous namespace PreservedAnalyses InstrProfiling::run(Module &M, ModuleAnalysisManager &AM) { @@ -147,6 +298,63 @@ static InstrProfIncrementInst *castToIncrementInst(Instruction *Instr) { return dyn_cast<InstrProfIncrementInst>(Instr); } +bool InstrProfiling::lowerIntrinsics(Function *F) { + bool MadeChange = false; + PromotionCandidates.clear(); + for (BasicBlock &BB : *F) { + for (auto I = BB.begin(), E = BB.end(); I != E;) { + auto Instr = I++; + InstrProfIncrementInst *Inc = castToIncrementInst(&*Instr); + if (Inc) { + lowerIncrement(Inc); + MadeChange = true; + } else if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(Instr)) { + lowerValueProfileInst(Ind); + MadeChange = true; + } + } + } + + if (!MadeChange) + return false; + + promoteCounterLoadStores(F); + return true; +} + +bool InstrProfiling::isCounterPromotionEnabled() const { + if (DoCounterPromotion.getNumOccurrences() > 0) + return DoCounterPromotion; + + return Options.DoCounterPromotion; +} + +void InstrProfiling::promoteCounterLoadStores(Function *F) { + if (!isCounterPromotionEnabled()) + return; + + DominatorTree DT(*F); + LoopInfo LI(DT); + DenseMap<Loop *, SmallVector<LoadStorePair, 8>> LoopPromotionCandidates; + + for (const auto &LoadStore : PromotionCandidates) { + auto *CounterLoad = LoadStore.first; + auto *CounterStore = LoadStore.second; + BasicBlock *BB = CounterLoad->getParent(); + Loop *ParentLoop = LI.getLoopFor(BB); + if (!ParentLoop) + continue; + LoopPromotionCandidates[ParentLoop].emplace_back(CounterLoad, CounterStore); + } + + SmallVector<Loop *, 4> Loops = LI.getLoopsInPreorder(); + + for (auto *Loop : Loops) { + PGOCounterPromoter Promoter(LoopPromotionCandidates[Loop], *Loop); + Promoter.run(&TotalCountersPromoted); + } +} + bool InstrProfiling::run(Module &M, const TargetLibraryInfo &TLI) { bool MadeChange = false; @@ -179,18 +387,7 @@ bool InstrProfiling::run(Module &M, const TargetLibraryInfo &TLI) { } for (Function &F : M) - for (BasicBlock &BB : F) - for (auto I = BB.begin(), E = BB.end(); I != E;) { - auto Instr = I++; - InstrProfIncrementInst *Inc = castToIncrementInst(&*Instr); - if (Inc) { - 
lowerIncrement(Inc); - MadeChange = true; - } else if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(Instr)) { - lowerValueProfileInst(Ind); - MadeChange = true; - } - } + MadeChange |= lowerIntrinsics(&F); if (GlobalVariable *CoverageNamesVar = M.getNamedGlobal(getCoverageUnusedNamesVarName())) { @@ -303,9 +500,12 @@ void InstrProfiling::lowerIncrement(InstrProfIncrementInst *Inc) { IRBuilder<> Builder(Inc); uint64_t Index = Inc->getIndex()->getZExtValue(); Value *Addr = Builder.CreateConstInBoundsGEP2_64(Counters, 0, Index); - Value *Count = Builder.CreateLoad(Addr, "pgocount"); - Count = Builder.CreateAdd(Count, Inc->getStep()); - Inc->replaceAllUsesWith(Builder.CreateStore(Count, Addr)); + Value *Load = Builder.CreateLoad(Addr, "pgocount"); + auto *Count = Builder.CreateAdd(Load, Inc->getStep()); + auto *Store = Builder.CreateStore(Count, Addr); + Inc->replaceAllUsesWith(Store); + if (isCounterPromotionEnabled()) + PromotionCandidates.emplace_back(cast<Instruction>(Load), Store); Inc->eraseFromParent(); } diff --git a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index b2d95271479c3..0e7d11c553977 100644 --- a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -1177,7 +1177,7 @@ void MemIntrinsicVisitor::instrumentOneMemIntrinsic(MemIntrinsic &MI) { Builder.CreateCall( Intrinsic::getDeclaration(M, Intrinsic::instrprof_value_profile), {llvm::ConstantExpr::getBitCast(FuncNameVar, I8PtrTy), - Builder.getInt64(FuncHash), Builder.CreatePtrToInt(Length, Int64Ty), + Builder.getInt64(FuncHash), Builder.CreateZExtOrTrunc(Length, Int64Ty), Builder.getInt32(IPVK_MemOPSize), Builder.getInt32(CurCtrId)}); ++CurCtrId; } diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index f5196cc461815..457c9427ab9ac 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -22,7 +22,6 @@ add_llvm_library(LLVMScalarOpts LICM.cpp LoopAccessAnalysisPrinter.cpp LoopSink.cpp - LoadCombine.cpp LoopDeletion.cpp LoopDataPrefetch.cpp LoopDistribute.cpp diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 2a4c9526dfcd9..28157783daa7a 100644 --- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -232,8 +232,7 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) { pred_iterator PB = pred_begin(BB), PE = pred_end(BB); if (PB == PE) return false; - // Analyse each switch case in turn. This is done in reverse order so that - // removing a case doesn't cause trouble for the iteration. + // Analyse each switch case in turn. bool Changed = false; for (auto CI = SI->case_begin(), CE = SI->case_end(); CI != CE;) { ConstantInt *Case = CI->getCaseValue(); @@ -291,7 +290,7 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) { break; } - // Increment the case iterator sense we didn't delete it. + // Increment the case iterator since we didn't delete it. 
++CI; } diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index 0490d93f64553..c0f628eb61e61 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -80,9 +80,10 @@ MaxRecurseDepth("max-recurse-depth", cl::Hidden, cl::init(1000), cl::ZeroOrMore, struct llvm::GVN::Expression { uint32_t opcode; Type *type; + bool commutative; SmallVector<uint32_t, 4> varargs; - Expression(uint32_t o = ~2U) : opcode(o) {} + Expression(uint32_t o = ~2U) : opcode(o), commutative(false) {} bool operator==(const Expression &other) const { if (opcode != other.opcode) @@ -246,6 +247,7 @@ GVN::Expression GVN::ValueTable::createExpr(Instruction *I) { assert(I->getNumOperands() == 2 && "Unsupported commutative instruction!"); if (e.varargs[0] > e.varargs[1]) std::swap(e.varargs[0], e.varargs[1]); + e.commutative = true; } if (CmpInst *C = dyn_cast<CmpInst>(I)) { @@ -256,6 +258,7 @@ GVN::Expression GVN::ValueTable::createExpr(Instruction *I) { Predicate = CmpInst::getSwappedPredicate(Predicate); } e.opcode = (C->getOpcode() << 8) | Predicate; + e.commutative = true; } else if (InsertValueInst *E = dyn_cast<InsertValueInst>(I)) { for (InsertValueInst::idx_iterator II = E->idx_begin(), IE = E->idx_end(); II != IE; ++II) @@ -281,6 +284,7 @@ GVN::Expression GVN::ValueTable::createCmpExpr(unsigned Opcode, Predicate = CmpInst::getSwappedPredicate(Predicate); } e.opcode = (Opcode << 8) | Predicate; + e.commutative = true; return e; } @@ -348,25 +352,25 @@ GVN::ValueTable::~ValueTable() = default; /// add - Insert a value into the table with a specified value number. void GVN::ValueTable::add(Value *V, uint32_t num) { valueNumbering.insert(std::make_pair(V, num)); + if (PHINode *PN = dyn_cast<PHINode>(V)) + NumberingPhi[num] = PN; } uint32_t GVN::ValueTable::lookupOrAddCall(CallInst *C) { if (AA->doesNotAccessMemory(C)) { Expression exp = createExpr(C); - uint32_t &e = expressionNumbering[exp]; - if (!e) e = nextValueNumber++; + uint32_t e = assignExpNewValueNum(exp).first; valueNumbering[C] = e; return e; } else if (AA->onlyReadsMemory(C)) { Expression exp = createExpr(C); - uint32_t &e = expressionNumbering[exp]; - if (!e) { - e = nextValueNumber++; - valueNumbering[C] = e; - return e; + auto ValNum = assignExpNewValueNum(exp); + if (ValNum.second) { + valueNumbering[C] = ValNum.first; + return ValNum.first; } if (!MD) { - e = nextValueNumber++; + uint32_t e = assignExpNewValueNum(exp).first; valueNumbering[C] = e; return e; } @@ -522,23 +526,29 @@ uint32_t GVN::ValueTable::lookupOrAdd(Value *V) { case Instruction::ExtractValue: exp = createExtractvalueExpr(cast<ExtractValueInst>(I)); break; + case Instruction::PHI: + valueNumbering[V] = nextValueNumber; + NumberingPhi[nextValueNumber] = cast<PHINode>(V); + return nextValueNumber++; default: valueNumbering[V] = nextValueNumber; return nextValueNumber++; } - uint32_t& e = expressionNumbering[exp]; - if (!e) e = nextValueNumber++; + uint32_t e = assignExpNewValueNum(exp).first; valueNumbering[V] = e; return e; } /// Returns the value number of the specified value. Fails if /// the value has not yet been numbered. -uint32_t GVN::ValueTable::lookup(Value *V) const { +uint32_t GVN::ValueTable::lookup(Value *V, bool Verify) const { DenseMap<Value*, uint32_t>::const_iterator VI = valueNumbering.find(V); - assert(VI != valueNumbering.end() && "Value not numbered?"); - return VI->second; + if (Verify) { + assert(VI != valueNumbering.end() && "Value not numbered?"); + return VI->second; + } + return (VI != valueNumbering.end()) ? 
VI->second : 0; } /// Returns the value number of the given comparison, @@ -549,21 +559,28 @@ uint32_t GVN::ValueTable::lookupOrAddCmp(unsigned Opcode, CmpInst::Predicate Predicate, Value *LHS, Value *RHS) { Expression exp = createCmpExpr(Opcode, Predicate, LHS, RHS); - uint32_t& e = expressionNumbering[exp]; - if (!e) e = nextValueNumber++; - return e; + return assignExpNewValueNum(exp).first; } /// Remove all entries from the ValueTable. void GVN::ValueTable::clear() { valueNumbering.clear(); expressionNumbering.clear(); + NumberingPhi.clear(); + PhiTranslateTable.clear(); nextValueNumber = 1; + Expressions.clear(); + ExprIdx.clear(); + nextExprNumber = 0; } /// Remove a value from the value numbering. void GVN::ValueTable::erase(Value *V) { + uint32_t Num = valueNumbering.lookup(V); valueNumbering.erase(V); + // If V is a PHINode, V <--> value number is a one-to-one mapping. + if (isa<PHINode>(V)) + NumberingPhi.erase(Num); } /// verifyRemoved - Verify that the value is removed from all internal data @@ -602,7 +619,7 @@ PreservedAnalyses GVN::run(Function &F, FunctionAnalysisManager &AM) { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -LLVM_DUMP_METHOD void GVN::dump(DenseMap<uint32_t, Value*>& d) { +LLVM_DUMP_METHOD void GVN::dump(DenseMap<uint32_t, Value*>& d) const { errs() << "{\n"; for (DenseMap<uint32_t, Value*>::iterator I = d.begin(), E = d.end(); I != E; ++I) { @@ -1451,6 +1468,95 @@ bool GVN::processLoad(LoadInst *L) { return false; } +/// Return a pair whose first field is the value number of \p Exp and whose +/// second field indicates whether that value number is newly created. +std::pair<uint32_t, bool> +GVN::ValueTable::assignExpNewValueNum(Expression &Exp) { + uint32_t &e = expressionNumbering[Exp]; + bool CreateNewValNum = !e; + if (CreateNewValNum) { + Expressions.push_back(Exp); + if (ExprIdx.size() < nextValueNumber + 1) + ExprIdx.resize(nextValueNumber * 2); + e = nextValueNumber; + ExprIdx[nextValueNumber++] = nextExprNumber++; + } + return {e, CreateNewValNum}; +} + +/// Return whether all the values associated with \p Num are +/// defined in \p BB. +bool GVN::ValueTable::areAllValsInBB(uint32_t Num, const BasicBlock *BB, + GVN &Gvn) { + LeaderTableEntry *Vals = &Gvn.LeaderTable[Num]; + while (Vals && Vals->BB == BB) + Vals = Vals->Next; + return !Vals; +} + +/// Wrap phiTranslateImpl to provide caching functionality. +uint32_t GVN::ValueTable::phiTranslate(const BasicBlock *Pred, + const BasicBlock *PhiBlock, uint32_t Num, + GVN &Gvn) { + auto FindRes = PhiTranslateTable.find({Num, Pred}); + if (FindRes != PhiTranslateTable.end()) + return FindRes->second; + uint32_t NewNum = phiTranslateImpl(Pred, PhiBlock, Num, Gvn); + PhiTranslateTable.insert({{Num, Pred}, NewNum}); + return NewNum; + } + +/// Translate value number \p Num using phis, so that it has the values of +/// the phis in BB. +uint32_t GVN::ValueTable::phiTranslateImpl(const BasicBlock *Pred, + const BasicBlock *PhiBlock, + uint32_t Num, GVN &Gvn) { + if (PHINode *PN = NumberingPhi[Num]) { + for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) { + if (PN->getParent() == PhiBlock && PN->getIncomingBlock(i) == Pred) + if (uint32_t TransVal = lookup(PN->getIncomingValue(i), false)) + return TransVal; + } + return Num; + } + + // If any value associated with Num is defined in a BB other than + // PhiBlock, it cannot depend on a phi in PhiBlock without going through + // a backedge, so we can exit early in that case to save compile time.
+ if (!areAllValsInBB(Num, PhiBlock, Gvn)) + return Num; + + if (Num >= ExprIdx.size() || ExprIdx[Num] == 0) + return Num; + Expression Exp = Expressions[ExprIdx[Num]]; + + for (unsigned i = 0; i < Exp.varargs.size(); i++) { + // For InsertValue and ExtractValue, some varargs are index numbers + // instead of value numbers. Those index numbers should not be + // translated. + if ((i > 1 && Exp.opcode == Instruction::InsertValue) || + (i > 0 && Exp.opcode == Instruction::ExtractValue)) + continue; + Exp.varargs[i] = phiTranslate(Pred, PhiBlock, Exp.varargs[i], Gvn); + } + + if (Exp.commutative) { + assert(Exp.varargs.size() == 2 && "Unsupported commutative expression!"); + if (Exp.varargs[0] > Exp.varargs[1]) { + std::swap(Exp.varargs[0], Exp.varargs[1]); + uint32_t Opcode = Exp.opcode >> 8; + if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) + Exp.opcode = (Opcode << 8) | + CmpInst::getSwappedPredicate( + static_cast<CmpInst::Predicate>(Exp.opcode & 255)); + } + } + + if (uint32_t NewNum = expressionNumbering[Exp]) + return NewNum; + return Num; +} + // In order to find a leader for a given value number at a // specific basic block, we first obtain the list of all Values for that number, // and then scan the list to find one whose block dominates the block in @@ -1495,6 +1601,15 @@ static bool isOnlyReachableViaThisEdge(const BasicBlockEdge &E, return Pred != nullptr; } + +void GVN::assignBlockRPONumber(Function &F) { + uint32_t NextBlockNumber = 1; + ReversePostOrderTraversal<Function *> RPOT(&F); + for (BasicBlock *BB : RPOT) + BlockRPONumber[BB] = NextBlockNumber++; +} + + // Tries to replace instruction with const, using information from // ReplaceWithConstMap. bool GVN::replaceOperandsWithConsts(Instruction *Instr) const { @@ -1856,6 +1971,7 @@ bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT, // Fabricate val-num for dead-code in order to suppress assertion in // performPRE(). assignValNumForDeadCode(); + assignBlockRPONumber(F); bool PREChanged = true; while (PREChanged) { PREChanged = performPRE(F); @@ -1927,7 +2043,7 @@ bool GVN::processBlock(BasicBlock *BB) { // Instantiate an expression in a predecessor that lacked it. bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, - unsigned int ValNo) { + BasicBlock *Curr, unsigned int ValNo) { // Because we are going top-down through the block, all value numbers // will be available in the predecessor by the time we need them. Any // that weren't originally present will have been instantiated earlier @@ -1945,7 +2061,9 @@ bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, success = false; break; } - if (Value *V = findLeader(Pred, VN.lookup(Op))) { + uint32_t TValNo = + VN.phiTranslate(Pred, Curr, VN.lookup(Op), *this); + if (Value *V = findLeader(Pred, TValNo)) { Instr->setOperand(i, V); } else { success = false; @@ -1962,10 +2080,12 @@ bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, Instr->insertBefore(Pred->getTerminator()); Instr->setName(Instr->getName() + ".pre"); Instr->setDebugLoc(Instr->getDebugLoc()); - VN.add(Instr, ValNo); + + unsigned Num = VN.lookupOrAdd(Instr); + VN.add(Instr, Num); // Update the availability map to include the new instruction. 
- addToLeaderTable(ValNo, Instr, Pred); + addToLeaderTable(Num, Instr, Pred); return true; } @@ -2003,18 +2123,27 @@ bool GVN::performScalarPRE(Instruction *CurInst) { SmallVector<std::pair<Value *, BasicBlock *>, 8> predMap; for (BasicBlock *P : predecessors(CurrentBlock)) { - // We're not interested in PRE where the block is its - // own predecessor, or in blocks with predecessors - // that are not reachable. - if (P == CurrentBlock) { + // We're not interested in PRE in blocks with predecessors that are + // not reachable. + if (!DT->isReachableFromEntry(P)) { NumWithout = 2; break; - } else if (!DT->isReachableFromEntry(P)) { + } + // It is not safe to do PRE when P->CurrentBlock is a loop backedge and + // CurInst has an operand defined in CurrentBlock (so it may be defined + // by a phi in the loop header). + if (BlockRPONumber[P] >= BlockRPONumber[CurrentBlock] && + any_of(CurInst->operands(), [&](const Use &U) { + if (auto *Inst = dyn_cast<Instruction>(U.get())) + return Inst->getParent() == CurrentBlock; + return false; + })) { NumWithout = 2; break; } - Value *predV = findLeader(P, ValNo); + uint32_t TValNo = VN.phiTranslate(P, CurrentBlock, ValNo, *this); + Value *predV = findLeader(P, TValNo); if (!predV) { predMap.push_back(std::make_pair(static_cast<Value *>(nullptr), P)); PREPred = P; @@ -2054,7 +2183,7 @@ bool GVN::performScalarPRE(Instruction *CurInst) { } // We need to insert somewhere, so let's give it a shot PREInstr = CurInst->clone(); - if (!performScalarPREInsertion(PREInstr, PREPred, ValNo)) { + if (!performScalarPREInsertion(PREInstr, PREPred, CurrentBlock, ValNo)) { // If we failed insertion, make sure we remove the instruction. DEBUG(verifyRemoved(PREInstr)); PREInstr->deleteValue(); @@ -2168,6 +2297,7 @@ bool GVN::iterateOnFunction(Function &F) { void GVN::cleanupGlobalSets() { VN.clear(); LeaderTable.clear(); + BlockRPONumber.clear(); TableAllocator.Reset(); } diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index c120036464d0a..05293eb0079fc 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -25,6 +25,7 @@ #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/ConstantRange.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" @@ -576,7 +577,12 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors( // Handle compare with phi operand, where the PHI is defined in this block. if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) { assert(Preference == WantInteger && "Compares only produce integers"); - PHINode *PN = dyn_cast<PHINode>(Cmp->getOperand(0)); + Type *CmpType = Cmp->getType(); + Value *CmpLHS = Cmp->getOperand(0); + Value *CmpRHS = Cmp->getOperand(1); + CmpInst::Predicate Pred = Cmp->getPredicate(); + + PHINode *PN = dyn_cast<PHINode>(CmpLHS); if (PN && PN->getParent() == BB) { const DataLayout &DL = PN->getModule()->getDataLayout(); // We can do this simplification if any comparisons fold to true or false.
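A standalone toy model of the phi-translation idea the GVN changes above introduce (hypothetical names and a plain std::map expression table, not the ValueTable class): to ask whether %a + %c is available in a predecessor when %a is a phi, substitute the phi's incoming value number for that predecessor and re-look-up the expression.

#include <cassert>
#include <map>
#include <string>
#include <utility>

// Value numbers for a tiny example:
//   %a = phi [ %x, Pred1 ], [ %y, Pred2 ];  %s = add %a, %c
using Expr = std::pair<std::string, std::pair<unsigned, unsigned>>;

std::map<std::string, unsigned> VN = {{"x", 1}, {"y", 2}, {"c", 3}, {"a", 4}};
std::map<Expr, unsigned> ExprVN = {
    {{"add", {1, 3}}, 5}, // %x + %c, already computed in Pred1
    {{"add", {4, 3}}, 6}, // %a + %c in the phi block
};
std::map<std::string, unsigned> PhiIncoming = {{"Pred1", 1}, {"Pred2", 2}};

// Translate the number of "add %a, %c" into predecessor Pred: replace the
// phi operand's number with the incoming value's number, then re-look-up.
unsigned phiTranslateAdd(const std::string &Pred) {
  Expr Translated{"add", {PhiIncoming[Pred], VN["c"]}};
  auto It = ExprVN.find(Translated);
  return It != ExprVN.end() ? It->second : 6; // fall back to original number
}

int main() {
  assert(phiTranslateAdd("Pred1") == 5); // a leader exists: PRE can reuse it
  assert(phiTranslateAdd("Pred2") == 6); // nothing available: PRE must insert
}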
@@ -584,15 +590,15 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors( for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { BasicBlock *PredBB = PN->getIncomingBlock(i); Value *LHS = PN->getIncomingValue(i); - Value *RHS = Cmp->getOperand(1)->DoPHITranslation(BB, PredBB); + Value *RHS = CmpRHS->DoPHITranslation(BB, PredBB); - Value *Res = SimplifyCmpInst(Cmp->getPredicate(), LHS, RHS, {DL}); + Value *Res = SimplifyCmpInst(Pred, LHS, RHS, {DL}); if (!Res) { if (!isa<Constant>(RHS)) continue; LazyValueInfo::Tristate - ResT = LVI->getPredicateOnEdge(Cmp->getPredicate(), LHS, + ResT = LVI->getPredicateOnEdge(Pred, LHS, cast<Constant>(RHS), PredBB, BB, CxtI ? CxtI : Cmp); if (ResT == LazyValueInfo::Unknown) @@ -609,27 +615,67 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors( // If comparing a live-in value against a constant, see if we know the // live-in value on any predecessors. - if (isa<Constant>(Cmp->getOperand(1)) && !Cmp->getType()->isVectorTy()) { - Constant *CmpConst = cast<Constant>(Cmp->getOperand(1)); + if (isa<Constant>(CmpRHS) && !CmpType->isVectorTy()) { + Constant *CmpConst = cast<Constant>(CmpRHS); - if (!isa<Instruction>(Cmp->getOperand(0)) || - cast<Instruction>(Cmp->getOperand(0))->getParent() != BB) { + if (!isa<Instruction>(CmpLHS) || + cast<Instruction>(CmpLHS)->getParent() != BB) { for (BasicBlock *P : predecessors(BB)) { // If the value is known by LazyValueInfo to be a constant in a // predecessor, use that information to try to thread this block. LazyValueInfo::Tristate Res = - LVI->getPredicateOnEdge(Cmp->getPredicate(), Cmp->getOperand(0), + LVI->getPredicateOnEdge(Pred, CmpLHS, CmpConst, P, BB, CxtI ? CxtI : Cmp); if (Res == LazyValueInfo::Unknown) continue; - Constant *ResC = ConstantInt::get(Cmp->getType(), Res); + Constant *ResC = ConstantInt::get(CmpType, Res); Result.push_back(std::make_pair(ResC, P)); } return !Result.empty(); } + // InstCombine can fold some forms of constant range checks into + // (icmp (add (x, C1)), C2). See if we have such a thing with + // x as a live-in. + { + using namespace PatternMatch; + Value *AddLHS; + ConstantInt *AddConst; + if (isa<ConstantInt>(CmpConst) && + match(CmpLHS, m_Add(m_Value(AddLHS), m_ConstantInt(AddConst)))) { + if (!isa<Instruction>(AddLHS) || + cast<Instruction>(AddLHS)->getParent() != BB) { + for (BasicBlock *P : predecessors(BB)) { + // If the value is known by LazyValueInfo to be a ConstantRange in + // a predecessor, use that information to try to thread this + // block. + ConstantRange CR = LVI->getConstantRangeOnEdge( + AddLHS, P, BB, CxtI ? CxtI : cast<Instruction>(CmpLHS)); + // Propagate the range through the addition. + CR = CR.add(AddConst->getValue()); + + // Get the range where the compare returns true. + ConstantRange CmpRange = ConstantRange::makeExactICmpRegion( + Pred, cast<ConstantInt>(CmpConst)->getValue()); + + Constant *ResC; + if (CmpRange.contains(CR)) + ResC = ConstantInt::getTrue(CmpType); + else if (CmpRange.inverse().contains(CR)) + ResC = ConstantInt::getFalse(CmpType); + else + continue; + + Result.push_back(std::make_pair(ResC, P)); + } + + return !Result.empty(); + } + } + } + // Try to find a constant value for the LHS of a comparison, // and evaluate it statically if we can.
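A standalone sketch of the range reasoning in the path just added above (a simplified, non-wrapping [Lo, Hi) range stands in for ConstantRange, which also handles wrap-around): propagate the known range through the add, then test it against the region where the compare is true.

#include <cassert>
#include <cstdint>

// Simplified, non-wrapping stand-in for ConstantRange: [Lo, Hi).
struct Range {
  int64_t Lo, Hi;
  Range add(int64_t C) const { return {Lo + C, Hi + C}; }
  bool contains(const Range &R) const { return Lo <= R.Lo && R.Hi <= Hi; }
};

// Decide (x + AddC) < CmpC for every x in CR, the way the threading code
// does with CR.add() and makeExactICmpRegion(). Returns 1 if always true,
// 0 if always false, -1 if the edge cannot be decided.
int evaluateCmp(Range CR, int64_t AddC, int64_t CmpC) {
  Range Shifted = CR.add(AddC);
  Range TrueRegion{INT64_MIN, CmpC};  // where x' < CmpC holds
  Range FalseRegion{CmpC, INT64_MAX}; // the inverse region
  if (TrueRegion.contains(Shifted))
    return 1;
  if (FalseRegion.contains(Shifted))
    return 0;
  return -1;
}

int main() {
  // x known to be in [0, 10) on some predecessor edge:
  assert(evaluateCmp({0, 10}, 5, 100) == 1);  // x + 5 in [5, 15): always < 100
  assert(evaluateCmp({0, 10}, 100, 50) == 0); // x + 100 in [100, 110): never < 50
  assert(evaluateCmp({0, 10}, 5, 12) == -1);  // x + 5 straddles 12: unknown
}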
PredValueInfoTy LHSVals; @@ -638,8 +684,7 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors( for (const auto &LHSVal : LHSVals) { Constant *V = LHSVal.first; - Constant *Folded = ConstantExpr::getCompare(Cmp->getPredicate(), - V, CmpConst); + Constant *Folded = ConstantExpr::getCompare(Pred, V, CmpConst); if (Constant *KC = getKnownConstant(Folded, WantInteger)) Result.push_back(std::make_pair(KC, LHSVal.second)); } @@ -752,6 +797,37 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) { LVI->eraseBlock(SinglePred); MergeBasicBlockIntoOnlyPred(BB); + // Now that BB is merged into SinglePred (i.e. SinglePred code followed by + // BB code within one basic block `BB`), we need to invalidate the LVI + // information associated with BB, because the LVI information need not be + // true for all of BB after the merge. For example, + // Before the merge, the LVI info and code are as follows: + // SinglePred: <LVI info1 for %p val> + // %y = use of %p + // call @exit() // need not transfer execution to successor. + // assume(%p) // from this point on %p is true + // br label %BB + // BB: <LVI info2 for %p val, i.e. %p is true> + // %x = use of %p + // br label exit + // + // Note that this LVI info for blocks BB and SinglePred is correct for %p + // (info2 and info1 respectively). After the merge and the deletion of the + // LVI info1 for SinglePred, we have the following code: + // BB: <LVI info2 for %p val> + // %y = use of %p + // call @exit() + // assume(%p) + // %x = use of %p <-- LVI info2 is correct from here onwards. + // br label exit + // LVI info2 for BB is incorrect at the beginning of BB. + + // Invalidate LVI information for BB if the LVI is not provably true for + // all of BB. + if (any_of(*BB, [](Instruction &I) { + return !isGuaranteedToTransferExecutionToSuccessor(&I); + })) + LVI->eraseBlock(BB); return true; } } diff --git a/lib/Transforms/Scalar/LoadCombine.cpp b/lib/Transforms/Scalar/LoadCombine.cpp deleted file mode 100644 index 025ba1bfedc18..0000000000000 --- a/lib/Transforms/Scalar/LoadCombine.cpp +++ /dev/null @@ -1,295 +0,0 @@ -//===- LoadCombine.cpp - Combine Adjacent Loads ---------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -/// \file -/// This transformation combines adjacent loads.
-/// -//===----------------------------------------------------------------------===// - -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/AliasSetTracker.h" -#include "llvm/Analysis/GlobalsModRef.h" -#include "llvm/Analysis/TargetFolder.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Scalar.h" - -using namespace llvm; - -#define DEBUG_TYPE "load-combine" - -STATISTIC(NumLoadsAnalyzed, "Number of loads analyzed for combining"); -STATISTIC(NumLoadsCombined, "Number of loads combined"); - -#define LDCOMBINE_NAME "Combine Adjacent Loads" - -namespace { -struct PointerOffsetPair { - Value *Pointer; - APInt Offset; -}; - -struct LoadPOPPair { - LoadInst *Load; - PointerOffsetPair POP; - /// \brief The new load needs to be created before the first load in IR order. - unsigned InsertOrder; -}; - -class LoadCombine : public BasicBlockPass { - LLVMContext *C; - AliasAnalysis *AA; - DominatorTree *DT; - -public: - LoadCombine() : BasicBlockPass(ID), C(nullptr), AA(nullptr) { - initializeLoadCombinePass(*PassRegistry::getPassRegistry()); - } - - using llvm::Pass::doInitialization; - bool doInitialization(Function &) override; - bool runOnBasicBlock(BasicBlock &BB) override; - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addRequired<AAResultsWrapperPass>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addPreserved<GlobalsAAWrapperPass>(); - } - - StringRef getPassName() const override { return LDCOMBINE_NAME; } - static char ID; - - typedef IRBuilder<TargetFolder> BuilderTy; - -private: - BuilderTy *Builder; - - PointerOffsetPair getPointerOffsetPair(LoadInst &); - bool combineLoads(DenseMap<const Value *, SmallVector<LoadPOPPair, 8>> &); - bool aggregateLoads(SmallVectorImpl<LoadPOPPair> &); - bool combineLoads(SmallVectorImpl<LoadPOPPair> &); -}; -} - -bool LoadCombine::doInitialization(Function &F) { - DEBUG(dbgs() << "LoadCombine function: " << F.getName() << "\n"); - C = &F.getContext(); - return true; -} - -PointerOffsetPair LoadCombine::getPointerOffsetPair(LoadInst &LI) { - auto &DL = LI.getModule()->getDataLayout(); - - PointerOffsetPair POP; - POP.Pointer = LI.getPointerOperand(); - unsigned BitWidth = DL.getPointerSizeInBits(LI.getPointerAddressSpace()); - POP.Offset = APInt(BitWidth, 0); - - while (isa<BitCastInst>(POP.Pointer) || isa<GetElementPtrInst>(POP.Pointer)) { - if (auto *GEP = dyn_cast<GetElementPtrInst>(POP.Pointer)) { - APInt LastOffset = POP.Offset; - if (!GEP->accumulateConstantOffset(DL, POP.Offset)) { - // Can't handle GEPs with variable indices. 
- POP.Offset = LastOffset; - return POP; - } - POP.Pointer = GEP->getPointerOperand(); - } else if (auto *BC = dyn_cast<BitCastInst>(POP.Pointer)) { - POP.Pointer = BC->getOperand(0); - } - } - return POP; -} - -bool LoadCombine::combineLoads( - DenseMap<const Value *, SmallVector<LoadPOPPair, 8>> &LoadMap) { - bool Combined = false; - for (auto &Loads : LoadMap) { - if (Loads.second.size() < 2) - continue; - std::sort(Loads.second.begin(), Loads.second.end(), - [](const LoadPOPPair &A, const LoadPOPPair &B) { - return A.POP.Offset.slt(B.POP.Offset); - }); - if (aggregateLoads(Loads.second)) - Combined = true; - } - return Combined; -} - -/// \brief Try to aggregate loads from a sorted list of loads to be combined. -/// -/// It is guaranteed that no writes occur between any of the loads. All loads -/// have the same base pointer. There are at least two loads. -bool LoadCombine::aggregateLoads(SmallVectorImpl<LoadPOPPair> &Loads) { - assert(Loads.size() >= 2 && "Insufficient loads!"); - LoadInst *BaseLoad = nullptr; - SmallVector<LoadPOPPair, 8> AggregateLoads; - bool Combined = false; - bool ValidPrevOffset = false; - APInt PrevOffset; - uint64_t PrevSize = 0; - for (auto &L : Loads) { - if (ValidPrevOffset == false) { - BaseLoad = L.Load; - PrevOffset = L.POP.Offset; - PrevSize = L.Load->getModule()->getDataLayout().getTypeStoreSize( - L.Load->getType()); - AggregateLoads.push_back(L); - ValidPrevOffset = true; - continue; - } - if (L.Load->getAlignment() > BaseLoad->getAlignment()) - continue; - APInt PrevEnd = PrevOffset + PrevSize; - if (L.POP.Offset.sgt(PrevEnd)) { - // No other load will be combinable - if (combineLoads(AggregateLoads)) - Combined = true; - AggregateLoads.clear(); - ValidPrevOffset = false; - continue; - } - if (L.POP.Offset != PrevEnd) - // This load is offset less than the size of the last load. - // FIXME: We may want to handle this case. - continue; - PrevOffset = L.POP.Offset; - PrevSize = L.Load->getModule()->getDataLayout().getTypeStoreSize( - L.Load->getType()); - AggregateLoads.push_back(L); - } - if (combineLoads(AggregateLoads)) - Combined = true; - return Combined; -} - -/// \brief Given a list of combinable load. Combine the maximum number of them. -bool LoadCombine::combineLoads(SmallVectorImpl<LoadPOPPair> &Loads) { - // Remove loads from the end while the size is not a power of 2. - unsigned TotalSize = 0; - for (const auto &L : Loads) - TotalSize += L.Load->getType()->getPrimitiveSizeInBits(); - while (TotalSize != 0 && !isPowerOf2_32(TotalSize)) - TotalSize -= Loads.pop_back_val().Load->getType()->getPrimitiveSizeInBits(); - if (Loads.size() < 2) - return false; - - DEBUG({ - dbgs() << "***** Combining Loads ******\n"; - for (const auto &L : Loads) { - dbgs() << L.POP.Offset << ": " << *L.Load << "\n"; - } - }); - - // Find first load. This is where we put the new load. 
- LoadPOPPair FirstLP; - FirstLP.InsertOrder = -1u; - for (const auto &L : Loads) - if (L.InsertOrder < FirstLP.InsertOrder) - FirstLP = L; - - unsigned AddressSpace = - FirstLP.POP.Pointer->getType()->getPointerAddressSpace(); - - Builder->SetInsertPoint(FirstLP.Load); - Value *Ptr = Builder->CreateConstGEP1_64( - Builder->CreatePointerCast(Loads[0].POP.Pointer, - Builder->getInt8PtrTy(AddressSpace)), - Loads[0].POP.Offset.getSExtValue()); - LoadInst *NewLoad = new LoadInst( - Builder->CreatePointerCast( - Ptr, PointerType::get(IntegerType::get(Ptr->getContext(), TotalSize), - Ptr->getType()->getPointerAddressSpace())), - Twine(Loads[0].Load->getName()) + ".combined", false, - Loads[0].Load->getAlignment(), FirstLP.Load); - - for (const auto &L : Loads) { - Builder->SetInsertPoint(L.Load); - Value *V = Builder->CreateExtractInteger( - L.Load->getModule()->getDataLayout(), NewLoad, - cast<IntegerType>(L.Load->getType()), - (L.POP.Offset - Loads[0].POP.Offset).getZExtValue(), "combine.extract"); - L.Load->replaceAllUsesWith(V); - } - - NumLoadsCombined += Loads.size(); - return true; -} - -bool LoadCombine::runOnBasicBlock(BasicBlock &BB) { - if (skipBasicBlock(BB)) - return false; - - AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); - DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - - // Skip analysing dead blocks (not forward reachable from function entry). - if (!DT->isReachableFromEntry(&BB)) { - DEBUG(dbgs() << "LC: skipping unreachable " << BB.getName() << - " in " << BB.getParent()->getName() << "\n"); - return false; - } - - IRBuilder<TargetFolder> TheBuilder( - BB.getContext(), TargetFolder(BB.getModule()->getDataLayout())); - Builder = &TheBuilder; - - DenseMap<const Value *, SmallVector<LoadPOPPair, 8>> LoadMap; - AliasSetTracker AST(*AA); - - bool Combined = false; - unsigned Index = 0; - for (auto &I : BB) { - if (I.mayThrow() || AST.containsUnknown(&I)) { - if (combineLoads(LoadMap)) - Combined = true; - LoadMap.clear(); - AST.clear(); - continue; - } - if (I.mayWriteToMemory()) { - AST.add(&I); - continue; - } - LoadInst *LI = dyn_cast<LoadInst>(&I); - if (!LI) - continue; - ++NumLoadsAnalyzed; - if (!LI->isSimple() || !LI->getType()->isIntegerTy()) - continue; - auto POP = getPointerOffsetPair(*LI); - if (!POP.Pointer) - continue; - LoadMap[POP.Pointer].push_back({LI, std::move(POP), Index++}); - AST.add(LI); - } - if (combineLoads(LoadMap)) - Combined = true; - return Combined; -} - -char LoadCombine::ID = 0; - -BasicBlockPass *llvm::createLoadCombinePass() { - return new LoadCombine(); -} - -INITIALIZE_PASS_BEGIN(LoadCombine, "load-combine", LDCOMBINE_NAME, false, false) -INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_END(LoadCombine, "load-combine", LDCOMBINE_NAME, false, false) diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp index 3151ccd279c41..c41cc42db5e2c 100644 --- a/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/lib/Transforms/Scalar/LoopDeletion.cpp @@ -31,20 +31,19 @@ using namespace llvm; STATISTIC(NumDeleted, "Number of loops deleted"); /// This function deletes dead loops. The caller of this function needs to -/// guarantee that the loop is infact dead. Here we handle two kinds of dead +/// guarantee that the loop is infact dead. Here we handle two kinds of dead /// loop. The first kind (\p isLoopDead) is where only invariant values from /// within the loop are used outside of it. The second kind (\p /// isLoopNeverExecuted) is where the loop is provably never executed. 
We can -/// always remove never executed loops since they will not cause any -/// difference to program behaviour. +/// always remove never executed loops since they will not cause any difference +/// to program behaviour. /// /// This also updates the relevant analysis information in \p DT, \p SE, and \p /// LI. It also updates the loop PM if an updater struct is provided. // TODO: This function will be used by loop-simplifyCFG as well. So, move this // to LoopUtils.cpp static void deleteDeadLoop(Loop *L, DominatorTree &DT, ScalarEvolution &SE, - LoopInfo &LI, bool LoopIsNeverExecuted, - LPMUpdater *Updater = nullptr); + LoopInfo &LI, LPMUpdater *Updater = nullptr); /// Determines if a loop is dead. /// /// This assumes that we've already checked for unique exit and exiting blocks, @@ -168,7 +167,14 @@ static bool deleteLoopIfDead(Loop *L, DominatorTree &DT, ScalarEvolution &SE, BasicBlock *ExitBlock = L->getUniqueExitBlock(); if (ExitBlock && isLoopNeverExecuted(L)) { - deleteDeadLoop(L, DT, SE, LI, true /* LoopIsNeverExecuted */, Updater); + // Set incoming value to undef for phi nodes in the exit block. + BasicBlock::iterator BI = ExitBlock->begin(); + while (PHINode *P = dyn_cast<PHINode>(BI)) { + for (unsigned i = 0; i < P->getNumIncomingValues(); i++) + P->setIncomingValue(i, UndefValue::get(P->getType())); + BI++; + } + deleteDeadLoop(L, DT, SE, LI, Updater); ++NumDeleted; return true; } @@ -196,15 +202,14 @@ static bool deleteLoopIfDead(Loop *L, DominatorTree &DT, ScalarEvolution &SE, if (isa<SCEVCouldNotCompute>(S)) return Changed; - deleteDeadLoop(L, DT, SE, LI, false /* LoopIsNeverExecuted */, Updater); + deleteDeadLoop(L, DT, SE, LI, Updater); ++NumDeleted; return true; } static void deleteDeadLoop(Loop *L, DominatorTree &DT, ScalarEvolution &SE, - LoopInfo &LI, bool LoopIsNeverExecuted, - LPMUpdater *Updater) { + LoopInfo &LI, LPMUpdater *Updater) { assert(L->isLCSSAForm(DT) && "Expected LCSSA!"); auto *Preheader = L->getLoopPreheader(); assert(Preheader && "Preheader should exist!"); @@ -227,6 +232,8 @@ static void deleteDeadLoop(Loop *L, DominatorTree &DT, ScalarEvolution &SE, auto *ExitBlock = L->getUniqueExitBlock(); assert(ExitBlock && "Should have a unique exit block!"); + assert(L->hasDedicatedExits() && "Loop should have dedicated exits!"); + // Connect the preheader directly to the exit block. // Even when the loop is never executed, we cannot remove the edge from the // source block to the exit block. Consider the case where the unexecuted loop @@ -236,20 +243,28 @@ static void deleteDeadLoop(Loop *L, DominatorTree &DT, ScalarEvolution &SE, // non-loop, it will be deleted in a future iteration of loop deletion pass. Preheader->getTerminator()->replaceUsesOfWith(L->getHeader(), ExitBlock); - SmallVector<BasicBlock *, 4> ExitingBlocks; - L->getExitingBlocks(ExitingBlocks); // Rewrite phis in the exit block to get their inputs from the Preheader // instead of the exiting block. - BasicBlock *ExitingBlock = ExitingBlocks[0]; BasicBlock::iterator BI = ExitBlock->begin(); while (PHINode *P = dyn_cast<PHINode>(BI)) { - int j = P->getBasicBlockIndex(ExitingBlock); - assert(j >= 0 && "Can't find exiting block in exit block's phi node!"); - if (LoopIsNeverExecuted) - P->setIncomingValue(j, UndefValue::get(P->getType())); - P->setIncomingBlock(j, Preheader); - for (unsigned i = 1; i < ExitingBlocks.size(); ++i) - P->removeIncomingValue(ExitingBlocks[i]); + // Set the zero'th element of Phi to be from the preheader and remove all + // other incoming values. 
Given the loop has dedicated exits, all other + // incoming values must be from the exiting blocks. + int PredIndex = 0; + P->setIncomingBlock(PredIndex, Preheader); + // Removes all incoming values from all other exiting blocks (including + // duplicate values from an exiting block). + // Nuke all entries except the zero'th entry which is the preheader entry. + // NOTE! We need to remove Incoming Values in the reverse order as done + // below, to keep the indices valid for deletion (removeIncomingValues + // updates getNumIncomingValues and shifts all values down into the operand + // being deleted). + for (unsigned i = 0, e = P->getNumIncomingValues() - 1; i != e; ++i) + P->removeIncomingValue(e-i, false); + + assert((P->getNumIncomingValues() == 1 && + P->getIncomingBlock(PredIndex) == Preheader) && + "Should have exactly one value and that's from the preheader!"); ++BI; } diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index b027278b24f2e..73436f13c94e4 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -131,7 +131,7 @@ static cl::opt<bool> EnablePhiElim( // The flag adds instruction count to solutions cost comparision. static cl::opt<bool> InsnsCost( - "lsr-insns-cost", cl::Hidden, cl::init(true), + "lsr-insns-cost", cl::Hidden, cl::init(false), cl::desc("Add instruction count to a LSR cost model")); // Flag to choose how to narrow complex lsr solution diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp index cbbd55512c9f5..7a7624f775429 100644 --- a/lib/Transforms/Scalar/NewGVN.cpp +++ b/lib/Transforms/Scalar/NewGVN.cpp @@ -1244,27 +1244,24 @@ const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) const { // only do this for simple stores, we should expand to cover memcpys, etc. const auto *LastStore = createStoreExpression(SI, StoreRHS); const auto *LastCC = ExpressionToClass.lookup(LastStore); - // Basically, check if the congruence class the store is in is defined by a - // store that isn't us, and has the same value. MemorySSA takes care of - // ensuring the store has the same memory state as us already. - // The RepStoredValue gets nulled if all the stores disappear in a class, so - // we don't need to check if the class contains a store besides us. - if (LastCC && - LastCC->getStoredValue() == lookupOperandLeader(SI->getValueOperand())) + // We really want to check whether the expression we matched was a store. No + // easy way to do that. However, we can check that the class we found has a + // store, which, assuming the value numbering state is not corrupt, is + // sufficient, because we must also be equivalent to that store's expression + // for it to be in the same class as the load. + if (LastCC && LastCC->getStoredValue() == LastStore->getStoredValue()) return LastStore; - deleteExpression(LastStore); // Also check if our value operand is defined by a load of the same memory // location, and the memory state is the same as it was then (otherwise, it // could have been overwritten later. See test32 in // transforms/DeadStoreElimination/simple.ll). 
- if (auto *LI = - dyn_cast<LoadInst>(lookupOperandLeader(SI->getValueOperand()))) { + if (auto *LI = dyn_cast<LoadInst>(LastStore->getStoredValue())) if ((lookupOperandLeader(LI->getPointerOperand()) == - lookupOperandLeader(SI->getPointerOperand())) && + LastStore->getOperand(0)) && (lookupMemoryLeader(getMemoryAccess(LI)->getDefiningAccess()) == StoreRHS)) - return createStoreExpression(SI, StoreRHS); - } + return LastStore; + deleteExpression(LastStore); } // If the store is not equivalent to anything, value number it as a store that @@ -2332,9 +2329,7 @@ void NewGVN::updateReachableEdge(BasicBlock *From, BasicBlock *To) { // see if we know some constant value for it already. Value *NewGVN::findConditionEquivalence(Value *Cond) const { auto Result = lookupOperandLeader(Cond); - if (isa<Constant>(Result)) - return Result; - return nullptr; + return isa<Constant>(Result) ? Result : nullptr; } // Process the outgoing edges of a block for reachability. @@ -3014,14 +3009,27 @@ void NewGVN::verifyIterationSettled(Function &F) { // a no-longer valid StoreExpression. void NewGVN::verifyStoreExpressions() const { #ifndef NDEBUG - DenseSet<std::pair<const Value *, const Value *>> StoreExpressionSet; + // This is the only use of this, and it's not worth defining a complicated + // densemapinfo hash/equality function for it. + std::set< + std::pair<const Value *, + std::tuple<const Value *, const CongruenceClass *, Value *>>> + StoreExpressionSet; for (const auto &KV : ExpressionToClass) { if (auto *SE = dyn_cast<StoreExpression>(KV.first)) { // Make sure a version that will conflict with loads is not already there - auto Res = - StoreExpressionSet.insert({SE->getOperand(0), SE->getMemoryLeader()}); - assert(Res.second && - "Stored expression conflict exists in expression table"); + auto Res = StoreExpressionSet.insert( + {SE->getOperand(0), std::make_tuple(SE->getMemoryLeader(), KV.second, + SE->getStoredValue())}); + bool Okay = Res.second; + // It's okay to have the same expression already in there if it is + // identical in nature. + // This can happen when the leader of the stored value changes over time. 
+ if (!Okay) + Okay = (std::get<1>(Res.first->second) == KV.second) && + (lookupOperandLeader(std::get<2>(Res.first->second)) == + lookupOperandLeader(SE->getStoredValue())); + assert(Okay && "Stored expression conflict exists in expression table"); auto *ValueExpr = ValueToExpression.lookup(SE->getStoreInst()); assert(ValueExpr && ValueExpr->equals(*SE) && "StoreExpression in ExpressionToClass is not latest " diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp index a20890b22603e..6da551bd7efd6 100644 --- a/lib/Transforms/Scalar/Reassociate.cpp +++ b/lib/Transforms/Scalar/Reassociate.cpp @@ -35,6 +35,7 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" @@ -106,11 +107,12 @@ XorOpnd::XorOpnd(Value *V) { I->getOpcode() == Instruction::And)) { Value *V0 = I->getOperand(0); Value *V1 = I->getOperand(1); - if (isa<ConstantInt>(V0)) + const APInt *C; + if (match(V0, PatternMatch::m_APInt(C))) std::swap(V0, V1); - if (ConstantInt *C = dyn_cast<ConstantInt>(V1)) { - ConstPart = C->getValue(); + if (match(V1, PatternMatch::m_APInt(C))) { + ConstPart = *C; SymbolicPart = V0; isOr = (I->getOpcode() == Instruction::Or); return; @@ -119,7 +121,7 @@ XorOpnd::XorOpnd(Value *V) { // view the operand as "V | 0" SymbolicPart = V; - ConstPart = APInt::getNullValue(V->getType()->getIntegerBitWidth()); + ConstPart = APInt::getNullValue(V->getType()->getScalarSizeInBits()); isOr = true; } @@ -955,8 +957,8 @@ static BinaryOperator *ConvertShiftToMul(Instruction *Shl) { /// Scan backwards and forwards among values with the same rank as element i /// to see if X exists. If X does not exist, return i. This is useful when /// scanning for 'x' when we see '-x' because they both get the same rank. -static unsigned FindInOperandList(SmallVectorImpl<ValueEntry> &Ops, unsigned i, - Value *X) { +static unsigned FindInOperandList(const SmallVectorImpl<ValueEntry> &Ops, + unsigned i, Value *X) { unsigned XRank = Ops[i].Rank; unsigned e = Ops.size(); for (unsigned j = i+1; j != e && Ops[j].Rank == XRank; ++j) { @@ -1134,20 +1136,19 @@ static Value *OptimizeAndOrXor(unsigned Opcode, /// instruction. There are two special cases: 1) if the constant operand is 0, /// it will return NULL. 2) if the constant is ~0, the symbolic operand will /// be returned. -static Value *createAndInstr(Instruction *InsertBefore, Value *Opnd, +static Value *createAndInstr(Instruction *InsertBefore, Value *Opnd, const APInt &ConstOpnd) { - if (ConstOpnd != 0) { - if (!ConstOpnd.isAllOnesValue()) { - LLVMContext &Ctx = Opnd->getType()->getContext(); - Instruction *I; - I = BinaryOperator::CreateAnd(Opnd, ConstantInt::get(Ctx, ConstOpnd), - "and.ra", InsertBefore); - I->setDebugLoc(InsertBefore->getDebugLoc()); - return I; - } + if (ConstOpnd.isNullValue()) + return nullptr; + + if (ConstOpnd.isAllOnesValue()) return Opnd; - } - return nullptr; + + Instruction *I = BinaryOperator::CreateAnd( + Opnd, ConstantInt::get(Opnd->getType(), ConstOpnd), "and.ra", + InsertBefore); + I->setDebugLoc(InsertBefore->getDebugLoc()); + return I; } // Helper function of OptimizeXor(). It tries to simplify "Opnd1 ^ ConstOpnd" @@ -1163,24 +1164,24 @@ bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, // = ((x | c1) ^ c1) ^ (c1 ^ c2) // = (x & ~c1) ^ (c1 ^ c2) // It is useful only when c1 == c2. 
- if (Opnd1->isOrExpr() && Opnd1->getConstPart() != 0) { - if (!Opnd1->getValue()->hasOneUse()) - return false; + if (!Opnd1->isOrExpr() || Opnd1->getConstPart().isNullValue()) + return false; - const APInt &C1 = Opnd1->getConstPart(); - if (C1 != ConstOpnd) - return false; + if (!Opnd1->getValue()->hasOneUse()) + return false; - Value *X = Opnd1->getSymbolicPart(); - Res = createAndInstr(I, X, ~C1); - // ConstOpnd was C2, now C1 ^ C2. - ConstOpnd ^= C1; + const APInt &C1 = Opnd1->getConstPart(); + if (C1 != ConstOpnd) + return false; - if (Instruction *T = dyn_cast<Instruction>(Opnd1->getValue())) - RedoInsts.insert(T); - return true; - } - return false; + Value *X = Opnd1->getSymbolicPart(); + Res = createAndInstr(I, X, ~C1); + // ConstOpnd was C2, now C1 ^ C2. + ConstOpnd ^= C1; + + if (Instruction *T = dyn_cast<Instruction>(Opnd1->getValue())) + RedoInsts.insert(T); + return true; } @@ -1221,8 +1222,8 @@ bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, APInt C3((~C1) ^ C2); // Do not increase code size! - if (C3 != 0 && !C3.isAllOnesValue()) { - int NewInstNum = ConstOpnd != 0 ? 1 : 2; + if (!C3.isNullValue() && !C3.isAllOnesValue()) { + int NewInstNum = ConstOpnd.getBoolValue() ? 1 : 2; if (NewInstNum > DeadInstNum) return false; } @@ -1238,8 +1239,8 @@ bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, APInt C3 = C1 ^ C2; // Do not increase code size - if (C3 != 0 && !C3.isAllOnesValue()) { - int NewInstNum = ConstOpnd != 0 ? 1 : 2; + if (!C3.isNullValue() && !C3.isAllOnesValue()) { + int NewInstNum = ConstOpnd.getBoolValue() ? 1 : 2; if (NewInstNum > DeadInstNum) return false; } @@ -1279,17 +1280,20 @@ Value *ReassociatePass::OptimizeXor(Instruction *I, SmallVector<XorOpnd, 8> Opnds; SmallVector<XorOpnd*, 8> OpndPtrs; Type *Ty = Ops[0].Op->getType(); - APInt ConstOpnd(Ty->getIntegerBitWidth(), 0); + APInt ConstOpnd(Ty->getScalarSizeInBits(), 0); // Step 1: Convert ValueEntry to XorOpnd for (unsigned i = 0, e = Ops.size(); i != e; ++i) { Value *V = Ops[i].Op; - if (!isa<ConstantInt>(V)) { + const APInt *C; + // TODO: Support non-splat vectors. + if (match(V, PatternMatch::m_APInt(C))) { + ConstOpnd ^= *C; + } else { XorOpnd O(V); O.setSymbolicRank(getRank(O.getSymbolicPart())); Opnds.push_back(O); - } else - ConstOpnd ^= cast<ConstantInt>(V)->getValue(); + } } // NOTE: From this point on, do *NOT* add/delete element to/from "Opnds". 
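The algebra that CombineXorOpnd relies on can be checked exhaustively in isolation; a small standalone demo of the identity quoted above, (x | c1) ^ c2 == (x & ~c1) ^ (c1 ^ c2), which collapses to x & ~c1 in the profitable c1 == c2 case:

#include <cassert>
#include <cstdint>

int main() {
  // Exhaustive over 6-bit values; the identity is bitwise, so it
  // generalizes to any width.
  for (uint32_t x = 0; x < 64; ++x)
    for (uint32_t c1 = 0; c1 < 64; ++c1)
      for (uint32_t c2 = 0; c2 < 64; ++c2) {
        assert(((x | c1) ^ c2) == ((x & ~c1) ^ (c1 ^ c2)));
        assert(((x | c1) ^ c1) == (x & ~c1)); // the c1 == c2 special case
      }
}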
@@ -1327,7 +1331,8 @@ Value *ReassociatePass::OptimizeXor(Instruction *I, Value *CV; // Step 3.1: Try simplifying "CurrOpnd ^ ConstOpnd" - if (ConstOpnd != 0 && CombineXorOpnd(I, CurrOpnd, ConstOpnd, CV)) { + if (!ConstOpnd.isNullValue() && + CombineXorOpnd(I, CurrOpnd, ConstOpnd, CV)) { Changed = true; if (CV) *CurrOpnd = XorOpnd(CV); @@ -1369,17 +1374,17 @@ Value *ReassociatePass::OptimizeXor(Instruction *I, ValueEntry VE(getRank(O.getValue()), O.getValue()); Ops.push_back(VE); } - if (ConstOpnd != 0) { - Value *C = ConstantInt::get(Ty->getContext(), ConstOpnd); + if (!ConstOpnd.isNullValue()) { + Value *C = ConstantInt::get(Ty, ConstOpnd); ValueEntry VE(getRank(C), C); Ops.push_back(VE); } - int Sz = Ops.size(); + unsigned Sz = Ops.size(); if (Sz == 1) return Ops.back().Op; - else if (Sz == 0) { - assert(ConstOpnd == 0); - return ConstantInt::get(Ty->getContext(), ConstOpnd); + if (Sz == 0) { + assert(ConstOpnd.isNullValue()); + return ConstantInt::get(Ty, ConstOpnd); } } @@ -1627,8 +1632,8 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I, /// ((((x*y)*x)*y)*x) -> [(x, 3), (y, 2)] /// /// \returns Whether any factors have a power greater than one. -bool ReassociatePass::collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops, - SmallVectorImpl<Factor> &Factors) { +static bool collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops, + SmallVectorImpl<Factor> &Factors) { // FIXME: Have Ops be (ValueEntry, Multiplicity) pairs, simplifying this. // Compute the sum of powers of simplifiable factors. unsigned FactorPowerSum = 0; @@ -1999,11 +2004,6 @@ void ReassociatePass::OptimizeInst(Instruction *I) { if (I->isCommutative()) canonicalizeOperands(I); - // TODO: We should optimize vector Xor instructions, but they are - // currently unsupported. - if (I->getType()->isVectorTy() && I->getOpcode() == Instruction::Xor) - return; - // Don't optimize floating point instructions that don't have unsafe algebra. if (I->getType()->isFPOrFPVectorTy() && !I->hasUnsafeAlgebra()) return; diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index c6929c33b3e9e..7a6fa1711411d 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -536,9 +536,10 @@ private: void visitUnreachableInst(TerminatorInst &I) { /*returns void*/ } void visitFenceInst (FenceInst &I) { /*returns void*/ } void visitInstruction(Instruction &I) { - // If a new instruction is added to LLVM that we don't handle. + // All the instructions we don't do any special handling for just + // go to overdefined. 
DEBUG(dbgs() << "SCCP: Don't know how to handle: " << I << '\n'); - markOverdefined(&I); // Just in case + markOverdefined(&I); } }; @@ -1814,15 +1815,11 @@ static bool runIPSCCP(Module &M, const DataLayout &DL, if (F.isDeclaration()) continue; - if (Solver.isBlockExecutable(&F.front())) { + if (Solver.isBlockExecutable(&F.front())) for (Function::arg_iterator AI = F.arg_begin(), E = F.arg_end(); AI != E; - ++AI) { - if (AI->use_empty()) - continue; - if (tryToReplaceWithConstant(Solver, &*AI)) + ++AI) + if (!AI->use_empty() && tryToReplaceWithConstant(Solver, &*AI)) ++IPNumArgsElimed; - } - } for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { if (!Solver.isBlockExecutable(&*BB)) { diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index 1527f15f18a33..80fbbeb6829bb 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -1252,7 +1252,7 @@ static bool isSafeSelectToSpeculate(SelectInst &SI) { if (!LI || !LI->isSimple()) return false; - // Both operands to the select need to be dereferencable, either + // Both operands to the select need to be dereferenceable, either // absolutely (e.g. allocas) or at this point because we can see other // accesses to it. if (!isSafeToLoadUnconditionally(TValue, LI->getAlignment(), DL, LI)) @@ -1637,8 +1637,17 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) { return cast<PointerType>(NewTy)->getPointerAddressSpace() == cast<PointerType>(OldTy)->getPointerAddressSpace(); } - if (NewTy->isIntegerTy() || OldTy->isIntegerTy()) - return true; + + // We can convert integers to integral pointers, but not to non-integral + // pointers. + if (OldTy->isIntegerTy()) + return !DL.isNonIntegralPointerType(NewTy); + + // We can convert integral pointers to integers, but non-integral pointers + // need to remain pointers. + if (!DL.isNonIntegralPointerType(OldTy)) + return NewTy->isIntegerTy(); + return false; } diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp index 850a01114eeba..ce6f93eb0c15f 100644 --- a/lib/Transforms/Scalar/Scalar.cpp +++ b/lib/Transforms/Scalar/Scalar.cpp @@ -91,7 +91,6 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeSeparateConstOffsetFromGEPPass(Registry); initializeSpeculativeExecutionLegacyPassPass(Registry); initializeStraightLineStrengthReducePass(Registry); - initializeLoadCombinePass(Registry); initializePlaceBackedgeSafepointsImplPass(Registry); initializePlaceSafepointsPass(Registry); initializeFloat2IntLegacyPassPass(Registry); diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp index 3e5993618c4c0..9397b87cdf563 100644 --- a/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -321,7 +321,7 @@ static bool markTails(Function &F, bool &AllCallsAreTailCalls) { /// instruction from after the call to before the call, assuming that all /// instructions between the call and this instruction are movable. /// -static bool canMoveAboveCall(Instruction *I, CallInst *CI) { +static bool canMoveAboveCall(Instruction *I, CallInst *CI, AliasAnalysis *AA) { // FIXME: We can move load/store/call/free instructions above the call if the // call does not mod/ref the memory location being processed. if (I->mayHaveSideEffects()) // This also handles volatile loads. 
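
The reason canMoveAboveCall now takes an AliasAnalysis pointer shows up in the next hunk: instead of refusing to hoist a load over any call that may write to memory, the pass asks whether the call can modify the specific location being loaded. A hedged sketch of that query, using the same AAResults call the patch makes; callMayClobberLoad is an illustrative name:

#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// True only if CI may write the memory L reads; a call that merely reads
// (or touches unrelated memory) no longer blocks hoisting the load.
static bool callMayClobberLoad(AliasAnalysis &AA, CallInst *CI, LoadInst *L) {
  return AA.getModRefInfo(CI, MemoryLocation::get(L)) & MRI_Mod;
}
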
@@ -332,10 +332,10 @@ static bool canMoveAboveCall(Instruction *I, CallInst *CI) { if (CI->mayHaveSideEffects()) { // Non-volatile loads may be moved above a call with side effects if it // does not write to memory and the load provably won't trap. - // FIXME: Writes to memory only matter if they may alias the pointer + // Writes to memory only matter if they may alias the pointer // being loaded from. const DataLayout &DL = L->getModule()->getDataLayout(); - if (CI->mayWriteToMemory() || + if ((AA->getModRefInfo(CI, MemoryLocation::get(L)) & MRI_Mod) || !isSafeToLoadUnconditionally(L->getPointerOperand(), L->getAlignment(), DL, L)) return false; @@ -492,10 +492,11 @@ static CallInst *findTRECandidate(Instruction *TI, return CI; } -static bool -eliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, BasicBlock *&OldEntry, - bool &TailCallsAreMarkedTail, - SmallVectorImpl<PHINode *> &ArgumentPHIs) { +static bool eliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, + BasicBlock *&OldEntry, + bool &TailCallsAreMarkedTail, + SmallVectorImpl<PHINode *> &ArgumentPHIs, + AliasAnalysis *AA) { // If we are introducing accumulator recursion to eliminate operations after // the call instruction that are both associative and commutative, the initial // value for the accumulator is placed in this variable. If this value is set @@ -515,7 +516,8 @@ eliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, BasicBlock *&OldEntry, // Check that this is the case now. BasicBlock::iterator BBI(CI); for (++BBI; &*BBI != Ret; ++BBI) { - if (canMoveAboveCall(&*BBI, CI)) continue; + if (canMoveAboveCall(&*BBI, CI, AA)) + continue; // If we can't move the instruction above the call, it might be because it // is an associative and commutative operation that could be transformed @@ -674,12 +676,17 @@ static bool foldReturnAndProcessPred(BasicBlock *BB, ReturnInst *Ret, bool &TailCallsAreMarkedTail, SmallVectorImpl<PHINode *> &ArgumentPHIs, bool CannotTailCallElimCallsMarkedTail, - const TargetTransformInfo *TTI) { + const TargetTransformInfo *TTI, + AliasAnalysis *AA) { bool Change = false; + // Make sure this block is a trivial return block. + assert(BB->getFirstNonPHIOrDbg() == Ret && + "Trying to fold non-trivial return block"); + // If the return block contains nothing but the return and PHI's, // there might be an opportunity to duplicate the return in its - // predecessors and perform TRC there. Look for predecessors that end + // predecessors and perform TRE there. Look for predecessors that end // in unconditional branch and recursive call(s). 
SmallVector<BranchInst*, 8> UncondBranchPreds; for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { @@ -706,7 +713,7 @@ static bool foldReturnAndProcessPred(BasicBlock *BB, ReturnInst *Ret, BB->eraseFromParent(); eliminateRecursiveTailCall(CI, RI, OldEntry, TailCallsAreMarkedTail, - ArgumentPHIs); + ArgumentPHIs, AA); ++NumRetDuped; Change = true; } @@ -719,16 +726,18 @@ static bool processReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry, bool &TailCallsAreMarkedTail, SmallVectorImpl<PHINode *> &ArgumentPHIs, bool CannotTailCallElimCallsMarkedTail, - const TargetTransformInfo *TTI) { + const TargetTransformInfo *TTI, + AliasAnalysis *AA) { CallInst *CI = findTRECandidate(Ret, CannotTailCallElimCallsMarkedTail, TTI); if (!CI) return false; return eliminateRecursiveTailCall(CI, Ret, OldEntry, TailCallsAreMarkedTail, - ArgumentPHIs); + ArgumentPHIs, AA); } -static bool eliminateTailRecursion(Function &F, const TargetTransformInfo *TTI) { +static bool eliminateTailRecursion(Function &F, const TargetTransformInfo *TTI, + AliasAnalysis *AA) { if (F.getFnAttribute("disable-tail-calls").getValueAsString() == "true") return false; @@ -763,11 +772,11 @@ static bool eliminateTailRecursion(Function &F, const TargetTransformInfo *TTI) if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) { bool Change = processReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail, - ArgumentPHIs, !CanTRETailMarkedCall, TTI); + ArgumentPHIs, !CanTRETailMarkedCall, TTI, AA); if (!Change && BB->getFirstNonPHIOrDbg() == Ret) - Change = - foldReturnAndProcessPred(BB, Ret, OldEntry, TailCallsAreMarkedTail, - ArgumentPHIs, !CanTRETailMarkedCall, TTI); + Change = foldReturnAndProcessPred(BB, Ret, OldEntry, + TailCallsAreMarkedTail, ArgumentPHIs, + !CanTRETailMarkedCall, TTI, AA); MadeChange |= Change; } } @@ -797,6 +806,7 @@ struct TailCallElim : public FunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); } @@ -805,7 +815,8 @@ struct TailCallElim : public FunctionPass { return false; return eliminateTailRecursion( - F, &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F)); + F, &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F), + &getAnalysis<AAResultsWrapperPass>().getAAResults()); } }; } @@ -826,8 +837,9 @@ PreservedAnalyses TailCallElimPass::run(Function &F, FunctionAnalysisManager &AM) { TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F); + AliasAnalysis &AA = AM.getResult<AAManager>(F); - bool Changed = eliminateTailRecursion(F, &TTI); + bool Changed = eliminateTailRecursion(F, &TTI, &AA); if (!Changed) return PreservedAnalyses::all(); diff --git a/lib/Transforms/Utils/BuildLibCalls.cpp b/lib/Transforms/Utils/BuildLibCalls.cpp index ebde1f9a17dd6..b60dfb4f3541d 100644 --- a/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/lib/Transforms/Utils/BuildLibCalls.cpp @@ -116,6 +116,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { case LibFunc_wcslen: Changed |= setOnlyReadsMemory(F); Changed |= setDoesNotThrow(F); + Changed |= setOnlyAccessesArgMemory(F); Changed |= setDoesNotCapture(F, 0); return Changed; case LibFunc_strchr: diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index 2af671636cbdb..5127eba3f9aea 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -26,6 +26,7 @@ #include "llvm/Analysis/MemoryBuiltins.h" #include 
"llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" +#include "llvm/IR/ConstantRange.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DIBuilder.h" #include "llvm/IR/DataLayout.h" @@ -1081,7 +1082,7 @@ static bool LdStHasDebugValue(DILocalVariable *DIVar, DIExpression *DIExpr, } /// See if there is a dbg.value intrinsic for DIVar for the PHI node. -static bool PhiHasDebugValue(DILocalVariable *DIVar, +static bool PhiHasDebugValue(DILocalVariable *DIVar, DIExpression *DIExpr, PHINode *APN) { // Since we can't guarantee that the original dbg.declare instrinsic @@ -1159,7 +1160,7 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI, DbgValue->insertAfter(LI); } -/// Inserts a llvm.dbg.value intrinsic after a phi +/// Inserts a llvm.dbg.value intrinsic after a phi /// that has an associated llvm.dbg.decl intrinsic. void llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI, PHINode *APN, DIBuilder &Builder) { @@ -1742,12 +1743,12 @@ void llvm::combineMetadata(Instruction *K, const Instruction *J, // Preserve !invariant.group in K. break; case LLVMContext::MD_align: - K->setMetadata(Kind, + K->setMetadata(Kind, MDNode::getMostGenericAlignmentOrDereferenceable(JMD, KMD)); break; case LLVMContext::MD_dereferenceable: case LLVMContext::MD_dereferenceable_or_null: - K->setMetadata(Kind, + K->setMetadata(Kind, MDNode::getMostGenericAlignmentOrDereferenceable(JMD, KMD)); break; } @@ -1847,6 +1848,49 @@ bool llvm::callsGCLeafFunction(ImmutableCallSite CS) { return false; } +void llvm::copyNonnullMetadata(const LoadInst &OldLI, MDNode *N, + LoadInst &NewLI) { + auto *NewTy = NewLI.getType(); + + // This only directly applies if the new type is also a pointer. + if (NewTy->isPointerTy()) { + NewLI.setMetadata(LLVMContext::MD_nonnull, N); + return; + } + + // The only other translation we can do is to integral loads with !range + // metadata. + if (!NewTy->isIntegerTy()) + return; + + MDBuilder MDB(NewLI.getContext()); + const Value *Ptr = OldLI.getPointerOperand(); + auto *ITy = cast<IntegerType>(NewTy); + auto *NullInt = ConstantExpr::getPtrToInt( + ConstantPointerNull::get(cast<PointerType>(Ptr->getType())), ITy); + auto *NonNullInt = ConstantExpr::getAdd(NullInt, ConstantInt::get(ITy, 1)); + NewLI.setMetadata(LLVMContext::MD_range, + MDB.createRange(NonNullInt, NullInt)); +} + +void llvm::copyRangeMetadata(const DataLayout &DL, const LoadInst &OldLI, + MDNode *N, LoadInst &NewLI) { + auto *NewTy = NewLI.getType(); + + // Give up unless it is converted to a pointer where there is a single very + // valuable mapping we can do reliably. + // FIXME: It would be nice to propagate this in more ways, but the type + // conversions make it hard. + if (!NewTy->isPointerTy()) + return; + + unsigned BitWidth = DL.getTypeSizeInBits(NewTy); + if (!getConstantRangeFromMetadata(*N).contains(APInt(BitWidth, 0))) { + MDNode *NN = MDNode::get(OldLI.getContext(), None); + NewLI.setMetadata(LLVMContext::MD_nonnull, NN); + } +} + namespace { /// A potential constituent of a bitreverse or bswap expression. See /// collectBitParts for a fuller explanation. 
@@ -1968,7 +2012,7 @@ collectBitParts(Value *V, bool MatchBSwaps, bool MatchBitReversals, unsigned NumMaskedBits = AndMask.countPopulation(); if (!MatchBitReversals && NumMaskedBits % 8 != 0) return Result; - + auto &Res = collectBitParts(I->getOperand(0), MatchBSwaps, MatchBitReversals, BPS); if (!Res) diff --git a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp index f3db278ef1e49..e21e34df8ded0 100644 --- a/lib/Transforms/Utils/LoopSimplify.cpp +++ b/lib/Transforms/Utils/LoopSimplify.cpp @@ -72,7 +72,6 @@ using namespace llvm; #define DEBUG_TYPE "loop-simplify" -STATISTIC(NumInserted, "Number of pre-header or exit blocks inserted"); STATISTIC(NumNested , "Number of nested loops split out"); // If the block isn't already, move the new block to right after some 'outside @@ -152,37 +151,6 @@ BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, DominatorTree *DT, return PreheaderBB; } -/// \brief Ensure that the loop preheader dominates all exit blocks. -/// -/// This method is used to split exit blocks that have predecessors outside of -/// the loop. -static BasicBlock *rewriteLoopExitBlock(Loop *L, BasicBlock *Exit, - DominatorTree *DT, LoopInfo *LI, - bool PreserveLCSSA) { - SmallVector<BasicBlock*, 8> LoopBlocks; - for (pred_iterator I = pred_begin(Exit), E = pred_end(Exit); I != E; ++I) { - BasicBlock *P = *I; - if (L->contains(P)) { - // Don't do this if the loop is exited via an indirect branch. - if (isa<IndirectBrInst>(P->getTerminator())) return nullptr; - - LoopBlocks.push_back(P); - } - } - - assert(!LoopBlocks.empty() && "No edges coming in from outside the loop?"); - BasicBlock *NewExitBB = nullptr; - - NewExitBB = SplitBlockPredecessors(Exit, LoopBlocks, ".loopexit", DT, LI, - PreserveLCSSA); - if (!NewExitBB) - return nullptr; - - DEBUG(dbgs() << "LoopSimplify: Creating dedicated exit block " - << NewExitBB->getName() << "\n"); - return NewExitBB; -} - /// Add the specified block, and all of its predecessors, to the specified set, /// if it's not already in there. Stop predecessor traversal when we reach /// StopBlock. @@ -346,16 +314,7 @@ static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader, // Split edges to exit blocks from the inner loop, if they emerged in the // process of separating the outer one. - SmallVector<BasicBlock *, 8> ExitBlocks; - L->getExitBlocks(ExitBlocks); - SmallSetVector<BasicBlock *, 8> ExitBlockSet(ExitBlocks.begin(), - ExitBlocks.end()); - for (BasicBlock *ExitBlock : ExitBlockSet) { - if (any_of(predecessors(ExitBlock), - [L](BasicBlock *BB) { return !L->contains(BB); })) { - rewriteLoopExitBlock(L, ExitBlock, DT, LI, PreserveLCSSA); - } - } + formDedicatedExitBlocks(L, DT, LI, PreserveLCSSA); if (PreserveLCSSA) { // Fix LCSSA form for L. Some values, which previously were only used inside @@ -563,29 +522,16 @@ ReprocessLoop: BasicBlock *Preheader = L->getLoopPreheader(); if (!Preheader) { Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA); - if (Preheader) { - ++NumInserted; + if (Preheader) Changed = true; - } } // Next, check to make sure that all exit nodes of the loop only have // predecessors that are inside of the loop. This check guarantees that the // loop preheader/header will dominate the exit blocks. If the exit block has // predecessors from outside of the loop, split the edge now. 
- SmallVector<BasicBlock*, 8> ExitBlocks; - L->getExitBlocks(ExitBlocks); - - SmallSetVector<BasicBlock *, 8> ExitBlockSet(ExitBlocks.begin(), - ExitBlocks.end()); - for (BasicBlock *ExitBlock : ExitBlockSet) { - if (any_of(predecessors(ExitBlock), - [L](BasicBlock *BB) { return !L->contains(BB); })) { - rewriteLoopExitBlock(L, ExitBlock, DT, LI, PreserveLCSSA); - ++NumInserted; - Changed = true; - } - } + if (formDedicatedExitBlocks(L, DT, LI, PreserveLCSSA)) + Changed = true; // If the header has more than two predecessors at this point (from the // preheader and from multiple backedges), we must adjust the loop. @@ -614,10 +560,8 @@ ReprocessLoop: // insert a new block that all backedges target, then make it jump to the // loop header. LoopLatch = insertUniqueBackedgeBlock(L, Preheader, DT, LI); - if (LoopLatch) { - ++NumInserted; + if (LoopLatch) Changed = true; - } } const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); @@ -645,7 +589,22 @@ ReprocessLoop: // loop-invariant instructions out of the way to open up more // opportunities, and the disadvantage of having the responsibility // to preserve dominator information. - if (ExitBlockSet.size() == 1) { + auto HasUniqueExitBlock = [&]() { + BasicBlock *UniqueExit = nullptr; + for (auto *ExitingBB : ExitingBlocks) + for (auto *SuccBB : successors(ExitingBB)) { + if (L->contains(SuccBB)) + continue; + + if (!UniqueExit) + UniqueExit = SuccBB; + else if (UniqueExit != SuccBB) + return false; + } + + return true; + }; + if (HasUniqueExitBlock()) { for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) { BasicBlock *ExitingBlock = ExitingBlocks[i]; if (!ExitingBlock->getSinglePredecessor()) continue; diff --git a/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/lib/Transforms/Utils/LoopUnrollRuntime.cpp index a920cd86a26a8..5f85e17927fa2 100644 --- a/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -472,10 +472,22 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, // exit block only. if (!L->isLoopSimplifyForm()) return false; - BasicBlock *Exit = L->getUniqueExitBlock(); // successor out of loop - if (!Exit) - return false; + // Guaranteed by LoopSimplifyForm. + BasicBlock *Latch = L->getLoopLatch(); + + BasicBlock *LatchExit = L->getUniqueExitBlock(); // successor out of loop + if (!LatchExit) + return false; + // Cloning the loop basic blocks (`CloneLoopBlocks`) requires that one of the + // targets of the Latch be the single exit block out of the loop. This needs + // to be guaranteed by the callers of UnrollRuntimeLoopRemainder. + BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator()); + assert((LatchBR->getSuccessor(0) == LatchExit || + LatchBR->getSuccessor(1) == LatchExit) && + "one of the loop latch successors should be " + "the exit block!"); + (void)LatchBR; // Use Scalar Evolution to compute the trip count. This allows more loops to // be unrolled than relying on induction var simplification. if (!SE) @@ -510,25 +522,13 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, if (Log2_32(Count) > BEWidth) return false; - BasicBlock *Latch = L->getLoopLatch(); - - // Cloning the loop basic blocks (`CloneLoopBlocks`) requires that one of the - // targets of the Latch be the single exit block out of the loop. This needs - // to be guaranteed by the callers of UnrollRuntimeLoopRemainder. 
- BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator()); - assert( - (LatchBR->getSuccessor(0) == Exit || LatchBR->getSuccessor(1) == Exit) && - "one of the loop latch successors should be " - "the exit block!"); - // Avoid warning of unused `LatchBR` variable in release builds. - (void)LatchBR; // Loop structure is the following: // // PreHeader // Header // ... // Latch - // Exit + // LatchExit BasicBlock *NewPreHeader; BasicBlock *NewExit = nullptr; @@ -541,9 +541,9 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, // Split PreHeader to insert a branch around loop for unrolling. NewPreHeader = SplitBlock(PreHeader, PreHeader->getTerminator(), DT, LI); NewPreHeader->setName(PreHeader->getName() + ".new"); - // Split Exit to create phi nodes from branch above. - SmallVector<BasicBlock*, 4> Preds(predecessors(Exit)); - NewExit = SplitBlockPredecessors(Exit, Preds, ".unr-lcssa", + // Split LatchExit to create phi nodes from branch above. + SmallVector<BasicBlock*, 4> Preds(predecessors(LatchExit)); + NewExit = SplitBlockPredecessors(LatchExit, Preds, ".unr-lcssa", DT, LI, PreserveLCSSA); // Split NewExit to insert epilog remainder loop. EpilogPreHeader = SplitBlock(NewExit, NewExit->getTerminator(), DT, LI); @@ -570,7 +570,7 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, // Latch Header // *NewExit ... // *EpilogPreHeader Latch - // Exit Exit + // LatchExit LatchExit // Calculate conditions for branch around loop for unrolling // in epilog case and around prolog remainder loop in prolog case. @@ -648,7 +648,7 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, // Clone all the basic blocks in the loop. If Count is 2, we don't clone // the loop, otherwise we create a cloned loop to execute the extra // iterations. This function adds the appropriate CFG connections. - BasicBlock *InsertBot = UseEpilogRemainder ? Exit : PrologExit; + BasicBlock *InsertBot = UseEpilogRemainder ? LatchExit : PrologExit; BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader; CloneLoopBlocks(L, ModVal, CreateRemainderLoop, UseEpilogRemainder, InsertTop, InsertBot, NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, LI); @@ -672,7 +672,7 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, // EpilogHeader Header // ... ... // EpilogLatch Latch - // Exit Exit + // LatchExit LatchExit // Rewrite the cloned instruction operands to use the values created when the // clone is created. @@ -686,7 +686,7 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, if (UseEpilogRemainder) { // Connect the epilog code to the original loop and update the // PHI functions. 
- ConnectEpilog(L, ModVal, NewExit, Exit, PreHeader, + ConnectEpilog(L, ModVal, NewExit, LatchExit, PreHeader, EpilogPreHeader, NewPreHeader, VMap, DT, LI, PreserveLCSSA); diff --git a/lib/Transforms/Utils/LoopUtils.cpp b/lib/Transforms/Utils/LoopUtils.cpp index 412f6129407ed..0ed33945ef407 100644 --- a/lib/Transforms/Utils/LoopUtils.cpp +++ b/lib/Transforms/Utils/LoopUtils.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/ADT/ScopeExit.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/GlobalsModRef.h" @@ -29,6 +30,7 @@ #include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" using namespace llvm; using namespace llvm::PatternMatch; @@ -87,8 +89,7 @@ RecurrenceDescriptor::lookThroughAnd(PHINode *Phi, Type *&RT, // Matches either I & 2^x-1 or 2^x-1 & I. If we find a match, we update RT // with a new integer type of the corresponding bit width. - if (match(J, m_CombineOr(m_And(m_Instruction(I), m_APInt(M)), - m_And(m_APInt(M), m_Instruction(I))))) { + if (match(J, m_c_And(m_Instruction(I), m_APInt(M)))) { int32_t Bits = (*M + 1).exactLogBase2(); if (Bits > 0) { RT = IntegerType::get(Phi->getContext(), Bits); @@ -923,6 +924,69 @@ bool InductionDescriptor::isInductionPHI(PHINode *Phi, const Loop *TheLoop, return true; } +bool llvm::formDedicatedExitBlocks(Loop *L, DominatorTree *DT, LoopInfo *LI, + bool PreserveLCSSA) { + bool Changed = false; + + // We re-use a vector for the in-loop predecessors. + SmallVector<BasicBlock *, 4> InLoopPredecessors; + + auto RewriteExit = [&](BasicBlock *BB) { + assert(InLoopPredecessors.empty() && + "Must start with an empty predecessors list!"); + auto Cleanup = make_scope_exit([&] { InLoopPredecessors.clear(); }); + + // See if there are any non-loop predecessors of this exit block and + // keep track of the in-loop predecessors. + bool IsDedicatedExit = true; + for (auto *PredBB : predecessors(BB)) + if (L->contains(PredBB)) { + if (isa<IndirectBrInst>(PredBB->getTerminator())) + // We cannot rewrite exiting edges from an indirectbr. + return false; + + InLoopPredecessors.push_back(PredBB); + } else { + IsDedicatedExit = false; + } + + assert(!InLoopPredecessors.empty() && "Must have *some* loop predecessor!"); + + // Nothing to do if this is already a dedicated exit. + if (IsDedicatedExit) + return false; + + auto *NewExitBB = SplitBlockPredecessors( + BB, InLoopPredecessors, ".loopexit", DT, LI, PreserveLCSSA); + + if (!NewExitBB) + DEBUG(dbgs() << "WARNING: Can't create a dedicated exit block for loop: " + << *L << "\n"); + else + DEBUG(dbgs() << "LoopSimplify: Creating dedicated exit block " + << NewExitBB->getName() << "\n"); + return true; + }; + + // Walk the exit blocks directly rather than building up a data structure for + // them, but only visit each one once. + SmallPtrSet<BasicBlock *, 4> Visited; + for (auto *BB : L->blocks()) + for (auto *SuccBB : successors(BB)) { + // We're looking for exit blocks so skip in-loop successors. + if (L->contains(SuccBB)) + continue; + + // Visit each exit block exactly once. + if (!Visited.insert(SuccBB).second) + continue; + + Changed |= RewriteExit(SuccBB); + } + + return Changed; +} + /// \brief Returns the instructions that use values defined in the loop.
SmallVector<Instruction *, 8> llvm::findDefsUsedOutsideOfLoop(Loop *L) { SmallVector<Instruction *, 8> UsedOutside; diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 1abdb24848506..eac2867233bc0 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5702,14 +5702,14 @@ bool LoopVectorizationLegality::memoryInstructionCanBeWidened(Instruction *I, void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { // We should not collect Uniforms more than once per VF. Right now, - // this function is called from collectUniformsAndScalars(), which + // this function is called from collectUniformsAndScalars(), which // already does this check. Collecting Uniforms for VF=1 does not make any // sense. assert(VF >= 2 && !Uniforms.count(VF) && "This function should not be visited twice for the same VF"); - // Visit the list of Uniforms. If we'll not find any uniform value, we'll + // Visit the list of Uniforms. If we'll not find any uniform value, we'll // not analyze again. Uniforms.count(VF) will return 1. Uniforms[VF].clear(); @@ -5988,10 +5988,10 @@ void InterleavedAccessInfo::collectConstStrideAccesses( continue; Value *Ptr = getPointerOperand(&I); - // We don't check wrapping here because we don't know yet if Ptr will be - // part of a full group or a group with gaps. Checking wrapping for all + // We don't check wrapping here because we don't know yet if Ptr will be + // part of a full group or a group with gaps. Checking wrapping for all // pointers (even those that end up in groups with no gaps) will be overly - // conservative. For full groups, wrapping should be ok since if we would + // conservative. For full groups, wrapping should be ok since if we would // wrap around the address space we would do a memory access at nullptr // even without the transformation. The wrapping checks are therefore // deferred until after we've formed the interleaved groups. @@ -6244,7 +6244,7 @@ void InterleavedAccessInfo::analyzeInterleaving( Instruction *LastMember = Group->getMember(Group->getFactor() - 1); if (LastMember) { Value *LastMemberPtr = getPointerOperand(LastMember); - if (!getPtrStride(PSE, LastMemberPtr, TheLoop, Strides, /*Assume=*/false, + if (!getPtrStride(PSE, LastMemberPtr, TheLoop, Strides, /*Assume=*/false, /*ShouldCheckWrap=*/true)) { DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to " "last group member potentially pointer-wrapping.\n"); @@ -6252,9 +6252,9 @@ void InterleavedAccessInfo::analyzeInterleaving( } } else { // Case 3: A non-reversed interleaved load group with gaps: We need - // to execute at least one scalar epilogue iteration. This will ensure + // to execute at least one scalar epilogue iteration. This will ensure // we don't speculatively access memory out-of-bounds. We only need - // to look for a member at index factor - 1, since every group must have + // to look for a member at index factor - 1, since every group must have // a member at index zero. if (Group->isReverse()) { releaseGroup(Group); @@ -7789,8 +7789,18 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Check the loop for a trip count threshold: // do not vectorize loops with a tiny trip count. 
- const unsigned MaxTC = SE->getSmallConstantMaxTripCount(L); - if (MaxTC > 0u && MaxTC < TinyTripCountVectorThreshold) { + unsigned ExpectedTC = SE->getSmallConstantMaxTripCount(L); + bool HasExpectedTC = (ExpectedTC > 0); + + if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) { + auto EstimatedTC = getLoopEstimatedTripCount(L); + if (EstimatedTC) { + ExpectedTC = *EstimatedTC; + HasExpectedTC = true; + } + } + + if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) { DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " << "This loop is not worth vectorizing."); if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) @@ -7822,18 +7832,6 @@ bool LoopVectorizePass::processLoop(Loop *L) { bool OptForSize = Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize(); - // Compute the weighted frequency of this loop being executed and see if it - // is less than 20% of the function entry baseline frequency. Note that we - // always have a canonical loop here because we think we *can* vectorize. - // FIXME: This is hidden behind a flag due to pervasive problems with - // exactly what block frequency models. - if (LoopVectorizeWithBlockFrequency) { - BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader()); - if (Hints.getForce() != LoopVectorizeHints::FK_Enabled && - LoopEntryFreq < ColdEntryFreq) - OptForSize = true; - } - // Check the function attributes to see if implicit floats are allowed. // FIXME: This check doesn't seem possibly correct -- what if the loop is // an integer loop and the vector instructions selected are purely integer @@ -8015,11 +8013,6 @@ bool LoopVectorizePass::runImpl( DB = &DB_; ORE = &ORE_; - // Compute some weights outside of the loop over the loops. Compute this - // using a BranchProbability to re-use its scaling math. - const BranchProbability ColdProb(1, 5); // 20% - ColdEntryFreq = BlockFrequency(BFI->getEntryFreq()) * ColdProb; - // Don't attempt if // 1. the target claims to have no vector registers, and // 2. interleaving won't help ILP. diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index d1349535f2982..b267230d31859 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -3327,12 +3327,10 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { BundleMember->Dependencies++; ScheduleData *DestBundle = UseSD->FirstInBundle; - if (!DestBundle->IsScheduled) { + if (!DestBundle->IsScheduled) BundleMember->incrementUnscheduledDeps(1); - } - if (!DestBundle->hasValidDependencies()) { + if (!DestBundle->hasValidDependencies()) WorkList.push_back(DestBundle); - } } } else { // I'm not sure if this can ever happen. But we need to be safe.
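
The LoopVectorize hunks above replace the block-frequency cold-loop heuristic with a trip-count estimate: prefer SCEV's constant max trip count, and when that is unknown fall back to getLoopEstimatedTripCount, which derives an estimate from branch_weights profile metadata on the latch branch (returning None when no profile is present). Condensed into a standalone sketch under those assumptions; knownTinyTripCount is an illustrative name, not from the patch:

#include "llvm/ADT/Optional.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Transforms/Utils/LoopUtils.h"

using namespace llvm;

// True if the loop provably (or per profile data) runs fewer than
// Threshold iterations and is therefore not worth vectorizing.
static bool knownTinyTripCount(Loop *L, ScalarEvolution &SE,
                               unsigned Threshold) {
  unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L);
  bool HasExpectedTC = ExpectedTC > 0;

  if (!HasExpectedTC)
    if (Optional<unsigned> EstimatedTC = getLoopEstimatedTripCount(L)) {
      ExpectedTC = *EstimatedTC;
      HasExpectedTC = true;
    }

  return HasExpectedTC && ExpectedTC < Threshold;
}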