author    | Dimitry Andric <dim@FreeBSD.org> | 2017-07-01 13:22:02 +0000
committer | Dimitry Andric <dim@FreeBSD.org> | 2017-07-01 13:22:02 +0000
commit    | 9df3605dea17e84f8183581f6103bd0c79e2a606 (patch)
tree      | 70a2f36ce9eb9bb213603cd7f2f120af53fc176f /lib
parent    | 08bbd35a80bf7765fe0d3043f9eb5a2f2786b649 (diff)
Diffstat (limited to 'lib')
282 files changed, 8010 insertions, 6473 deletions
diff --git a/lib/Analysis/CFLAndersAliasAnalysis.cpp b/lib/Analysis/CFLAndersAliasAnalysis.cpp index ddd5123d0eff..0de7ad98af46 100644 --- a/lib/Analysis/CFLAndersAliasAnalysis.cpp +++ b/lib/Analysis/CFLAndersAliasAnalysis.cpp @@ -68,17 +68,6 @@ CFLAndersAAResult::CFLAndersAAResult(CFLAndersAAResult &&RHS) : AAResultBase(std::move(RHS)), TLI(RHS.TLI) {} CFLAndersAAResult::~CFLAndersAAResult() {} -static const Function *parentFunctionOfValue(const Value *Val) { - if (auto *Inst = dyn_cast<Instruction>(Val)) { - auto *Bb = Inst->getParent(); - return Bb->getParent(); - } - - if (auto *Arg = dyn_cast<Argument>(Val)) - return Arg->getParent(); - return nullptr; -} - namespace { enum class MatchState : uint8_t { @@ -789,10 +778,10 @@ void CFLAndersAAResult::scan(const Function &Fn) { // resize and invalidating the reference returned by operator[] auto FunInfo = buildInfoFrom(Fn); Cache[&Fn] = std::move(FunInfo); - Handles.push_front(FunctionHandle(const_cast<Function *>(&Fn), this)); + Handles.emplace_front(const_cast<Function *>(&Fn), this); } -void CFLAndersAAResult::evict(const Function &Fn) { Cache.erase(&Fn); } +void CFLAndersAAResult::evict(const Function *Fn) { Cache.erase(Fn); } const Optional<CFLAndersAAResult::FunctionInfo> & CFLAndersAAResult::ensureCached(const Function &Fn) { diff --git a/lib/Analysis/CFLSteensAliasAnalysis.cpp b/lib/Analysis/CFLSteensAliasAnalysis.cpp index 6e4263920e58..adbdd82012a3 100644 --- a/lib/Analysis/CFLSteensAliasAnalysis.cpp +++ b/lib/Analysis/CFLSteensAliasAnalysis.cpp @@ -88,19 +88,6 @@ const StratifiedIndex StratifiedLink::SetSentinel = //===----------------------------------------------------------------------===// /// Determines whether it would be pointless to add the given Value to our sets. -static bool canSkipAddingToSets(Value *Val); - -static Optional<Function *> parentFunctionOfValue(Value *Val) { - if (auto *Inst = dyn_cast<Instruction>(Val)) { - auto *Bb = Inst->getParent(); - return Bb->getParent(); - } - - if (auto *Arg = dyn_cast<Argument>(Val)) - return Arg->getParent(); - return None; -} - static bool canSkipAddingToSets(Value *Val) { // Constants can share instances, which may falsely unify multiple // sets, e.g. 
in @@ -245,7 +232,7 @@ void CFLSteensAAResult::scan(Function *Fn) { auto FunInfo = buildSetsFrom(Fn); Cache[Fn] = std::move(FunInfo); - Handles.push_front(FunctionHandle(Fn, this)); + Handles.emplace_front(Fn, this); } void CFLSteensAAResult::evict(Function *Fn) { Cache.erase(Fn); } @@ -281,9 +268,9 @@ AliasResult CFLSteensAAResult::query(const MemoryLocation &LocA, return NoAlias; Function *Fn = nullptr; - auto MaybeFnA = parentFunctionOfValue(ValA); - auto MaybeFnB = parentFunctionOfValue(ValB); - if (!MaybeFnA.hasValue() && !MaybeFnB.hasValue()) { + Function *MaybeFnA = const_cast<Function *>(parentFunctionOfValue(ValA)); + Function *MaybeFnB = const_cast<Function *>(parentFunctionOfValue(ValB)); + if (!MaybeFnA && !MaybeFnB) { // The only times this is known to happen are when globals + InlineAsm are // involved DEBUG(dbgs() @@ -291,12 +278,12 @@ AliasResult CFLSteensAAResult::query(const MemoryLocation &LocA, return MayAlias; } - if (MaybeFnA.hasValue()) { - Fn = *MaybeFnA; - assert((!MaybeFnB.hasValue() || *MaybeFnB == *MaybeFnA) && + if (MaybeFnA) { + Fn = MaybeFnA; + assert((!MaybeFnB || MaybeFnB == MaybeFnA) && "Interprocedural queries not supported"); } else { - Fn = *MaybeFnB; + Fn = MaybeFnB; } assert(Fn != nullptr); diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp index 77ad6f1e166f..35693666aa03 100644 --- a/lib/Analysis/InlineCost.cpp +++ b/lib/Analysis/InlineCost.cpp @@ -66,6 +66,12 @@ static cl::opt<int> cl::ZeroOrMore, cl::desc("Threshold for hot callsites ")); +static cl::opt<int> ColdCallSiteRelFreq( + "cold-callsite-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore, + cl::desc("Maxmimum block frequency, expressed as a percentage of caller's " + "entry frequency, for a callsite to be cold in the absence of " + "profile information.")); + namespace { class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> { @@ -172,6 +178,9 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> { /// Return true if size growth is allowed when inlining the callee at CS. bool allowSizeGrowth(CallSite CS); + /// Return true if \p CS is a cold callsite. + bool isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI); + // Custom analysis routines. bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues); @@ -631,6 +640,26 @@ bool CallAnalyzer::allowSizeGrowth(CallSite CS) { return true; } +bool CallAnalyzer::isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI) { + // If global profile summary is available, then callsite's coldness is + // determined based on that. + if (PSI->hasProfileSummary()) + return PSI->isColdCallSite(CS, CallerBFI); + if (!CallerBFI) + return false; + + // In the absence of global profile summary, determine if the callsite is cold + // relative to caller's entry. We could potentially cache the computation of + // scaled entry frequency, but the added complexity is not worth it unless + // this scaling shows up high in the profiles. + const BranchProbability ColdProb(ColdCallSiteRelFreq, 100); + auto CallSiteBB = CS.getInstruction()->getParent(); + auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB); + auto CallerEntryFreq = + CallerBFI->getBlockFreq(&(CS.getCaller()->getEntryBlock())); + return CallSiteFreq < CallerEntryFreq * ColdProb; +} + void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) { // If no size growth is allowed for this inlining, set Threshold to 0. 
if (!allowSizeGrowth(CS)) { @@ -676,7 +705,7 @@ void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) { if (PSI->isHotCallSite(CS, CallerBFI)) { DEBUG(dbgs() << "Hot callsite.\n"); Threshold = Params.HotCallSiteThreshold.getValue(); - } else if (PSI->isColdCallSite(CS, CallerBFI)) { + } else if (isColdCallSite(CS, CallerBFI)) { DEBUG(dbgs() << "Cold callsite.\n"); Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold); } @@ -1010,7 +1039,7 @@ bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) { if (isa<ConstantInt>(V)) return true; - // Assume the most general case where the swith is lowered into + // Assume the most general case where the switch is lowered into // either a jump table, bit test, or a balanced binary tree consisting of // case clusters without merging adjacent clusters with the same // destination. We do not consider the switches that are lowered with a mix diff --git a/lib/Analysis/IteratedDominanceFrontier.cpp b/lib/Analysis/IteratedDominanceFrontier.cpp index 2a736ec0379c..0e02850df349 100644 --- a/lib/Analysis/IteratedDominanceFrontier.cpp +++ b/lib/Analysis/IteratedDominanceFrontier.cpp @@ -20,14 +20,6 @@ namespace llvm { template <class NodeTy> void IDFCalculator<NodeTy>::calculate( SmallVectorImpl<BasicBlock *> &PHIBlocks) { - // If we haven't computed dominator tree levels, do so now. - if (DomLevels.empty()) { - for (auto DFI = df_begin(DT.getRootNode()), DFE = df_end(DT.getRootNode()); - DFI != DFE; ++DFI) { - DomLevels[*DFI] = DFI.getPathLength() - 1; - } - } - // Use a priority queue keyed on dominator tree level so that inserted nodes // are handled from the bottom of the dominator tree upwards. typedef std::pair<DomTreeNode *, unsigned> DomTreeNodePair; @@ -37,7 +29,7 @@ void IDFCalculator<NodeTy>::calculate( for (BasicBlock *BB : *DefBlocks) { if (DomTreeNode *Node = DT.getNode(BB)) - PQ.push(std::make_pair(Node, DomLevels.lookup(Node))); + PQ.push({Node, Node->getLevel()}); } SmallVector<DomTreeNode *, 32> Worklist; @@ -72,7 +64,7 @@ void IDFCalculator<NodeTy>::calculate( if (SuccNode->getIDom() == Node) continue; - unsigned SuccLevel = DomLevels.lookup(SuccNode); + const unsigned SuccLevel = SuccNode->getLevel(); if (SuccLevel > RootLevel) continue; diff --git a/lib/Analysis/MemoryBuiltins.cpp b/lib/Analysis/MemoryBuiltins.cpp index 7983d62c2f7a..f88d54b21e1e 100644 --- a/lib/Analysis/MemoryBuiltins.cpp +++ b/lib/Analysis/MemoryBuiltins.cpp @@ -400,8 +400,8 @@ static APInt getSizeWithOverflow(const SizeOffsetType &Data) { /// \brief Compute the size of the object pointed by Ptr. Returns true and the /// object size in Size if successful, and false otherwise. -/// If RoundToAlign is true, then Size is rounded up to the aligment of allocas, -/// byval arguments, and global variables. +/// If RoundToAlign is true, then Size is rounded up to the alignment of +/// allocas, byval arguments, and global variables. 
bool llvm::getObjectSize(const Value *Ptr, uint64_t &Size, const DataLayout &DL, const TargetLibraryInfo *TLI, ObjectSizeOpts Opts) { ObjectSizeOffsetVisitor Visitor(DL, TLI, Ptr->getContext(), Opts); diff --git a/lib/Analysis/OptimizationDiagnosticInfo.cpp b/lib/Analysis/OptimizationDiagnosticInfo.cpp index e38e530c052d..eb259fd7a384 100644 --- a/lib/Analysis/OptimizationDiagnosticInfo.cpp +++ b/lib/Analysis/OptimizationDiagnosticInfo.cpp @@ -25,7 +25,7 @@ using namespace llvm; OptimizationRemarkEmitter::OptimizationRemarkEmitter(const Function *F) : F(F), BFI(nullptr) { - if (!F->getContext().getDiagnosticHotnessRequested()) + if (!F->getContext().getDiagnosticsHotnessRequested()) return; // First create a dominator tree. @@ -155,6 +155,13 @@ void OptimizationRemarkEmitter::emit( DiagnosticInfoOptimizationBase &OptDiagBase) { auto &OptDiag = cast<DiagnosticInfoIROptimization>(OptDiagBase); computeHotness(OptDiag); + // If a diagnostic has a hotness value, then only emit it if its hotness + // meets the threshold. + if (OptDiag.getHotness() && + *OptDiag.getHotness() < + F->getContext().getDiagnosticsHotnessThreshold()) { + return; + } yaml::Output *Out = F->getContext().getDiagnosticsOutputFile(); if (Out) { @@ -176,7 +183,7 @@ OptimizationRemarkEmitterWrapperPass::OptimizationRemarkEmitterWrapperPass() bool OptimizationRemarkEmitterWrapperPass::runOnFunction(Function &Fn) { BlockFrequencyInfo *BFI; - if (Fn.getContext().getDiagnosticHotnessRequested()) + if (Fn.getContext().getDiagnosticsHotnessRequested()) BFI = &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI(); else BFI = nullptr; @@ -198,7 +205,7 @@ OptimizationRemarkEmitterAnalysis::run(Function &F, FunctionAnalysisManager &AM) { BlockFrequencyInfo *BFI; - if (F.getContext().getDiagnosticHotnessRequested()) + if (F.getContext().getDiagnosticsHotnessRequested()) BFI = &AM.getResult<BlockFrequencyAnalysis>(F); else BFI = nullptr; diff --git a/lib/Analysis/RegionInfo.cpp b/lib/Analysis/RegionInfo.cpp index 63ef8d28d44a..900487323005 100644 --- a/lib/Analysis/RegionInfo.cpp +++ b/lib/Analysis/RegionInfo.cpp @@ -10,28 +10,29 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/RegionInfo.h" -#include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/RegionInfoImpl.h" -#include "llvm/Analysis/RegionIterator.h" -#include "llvm/IR/PassManager.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" #ifndef NDEBUG #include "llvm/Analysis/RegionPrinter.h" #endif +#include "llvm/Analysis/RegionInfoImpl.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; #define DEBUG_TYPE "region" namespace llvm { + template class RegionBase<RegionTraits<Function>>; template class RegionNodeBase<RegionTraits<Function>>; template class RegionInfoBase<RegionTraits<Function>>; -} + +} // end namespace llvm STATISTIC(numRegions, "The # of regions"); STATISTIC(numSimpleRegions, "The # of simple regions"); @@ -44,7 +45,6 @@ VerifyRegionInfoX( cl::location(RegionInfoBase<RegionTraits<Function>>::VerifyRegionInfo), cl::desc("Verify region info (time consuming)")); - static cl::opt<Region::PrintStyle, true> printStyleX("print-region-style", cl::location(RegionInfo::printStyle), cl::Hidden, @@ -56,7 +56,6 @@ 
static cl::opt<Region::PrintStyle, true> printStyleX("print-region-style", clEnumValN(Region::PrintRN, "rn", "print regions in detail with element_iterator"))); - //===----------------------------------------------------------------------===// // Region implementation // @@ -68,20 +67,15 @@ Region::Region(BasicBlock *Entry, BasicBlock *Exit, } -Region::~Region() { } +Region::~Region() = default; //===----------------------------------------------------------------------===// // RegionInfo implementation // -RegionInfo::RegionInfo() : - RegionInfoBase<RegionTraits<Function>>() { - -} +RegionInfo::RegionInfo() = default; -RegionInfo::~RegionInfo() { - -} +RegionInfo::~RegionInfo() = default; bool RegionInfo::invalidate(Function &F, const PreservedAnalyses &PA, FunctionAnalysisManager::Invalidator &) { @@ -126,9 +120,7 @@ RegionInfoPass::RegionInfoPass() : FunctionPass(ID) { initializeRegionInfoPassPass(*PassRegistry::getPassRegistry()); } -RegionInfoPass::~RegionInfoPass() { - -} +RegionInfoPass::~RegionInfoPass() = default; bool RegionInfoPass::runOnFunction(Function &F) { releaseMemory(); @@ -181,10 +173,12 @@ INITIALIZE_PASS_END(RegionInfoPass, "regions", // the link time optimization. namespace llvm { + FunctionPass *createRegionInfoPass() { return new RegionInfoPass(); } -} + +} // end namespace llvm //===----------------------------------------------------------------------===// // RegionInfoAnalysis implementation diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index 73a95ec405c7..678ad3af5e85 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -157,6 +157,11 @@ static cl::opt<unsigned> MaxConstantEvolvingDepth( "scalar-evolution-max-constant-evolving-depth", cl::Hidden, cl::desc("Maximum depth of recursive constant evolving"), cl::init(32)); +static cl::opt<unsigned> + MaxExtDepth("scalar-evolution-max-ext-depth", cl::Hidden, + cl::desc("Maximum depth of recursive SExt/ZExt"), + cl::init(8)); + //===----------------------------------------------------------------------===// // SCEV class definitions //===----------------------------------------------------------------------===// @@ -1285,8 +1290,8 @@ static const SCEV *getUnsignedOverflowLimitForStep(const SCEV *Step, namespace { struct ExtendOpTraitsBase { - typedef const SCEV *(ScalarEvolution::*GetExtendExprTy)( - const SCEV *, Type *, ScalarEvolution::ExtendCacheTy &Cache); + typedef const SCEV *(ScalarEvolution::*GetExtendExprTy)(const SCEV *, Type *, + unsigned); }; // Used to make code generic over signed and unsigned overflow. 
@@ -1315,9 +1320,8 @@ struct ExtendOpTraits<SCEVSignExtendExpr> : public ExtendOpTraitsBase { } }; -const ExtendOpTraitsBase::GetExtendExprTy - ExtendOpTraits<SCEVSignExtendExpr>::GetExtendExpr = - &ScalarEvolution::getSignExtendExprCached; +const ExtendOpTraitsBase::GetExtendExprTy ExtendOpTraits< + SCEVSignExtendExpr>::GetExtendExpr = &ScalarEvolution::getSignExtendExpr; template <> struct ExtendOpTraits<SCEVZeroExtendExpr> : public ExtendOpTraitsBase { @@ -1332,9 +1336,8 @@ struct ExtendOpTraits<SCEVZeroExtendExpr> : public ExtendOpTraitsBase { } }; -const ExtendOpTraitsBase::GetExtendExprTy - ExtendOpTraits<SCEVZeroExtendExpr>::GetExtendExpr = - &ScalarEvolution::getZeroExtendExprCached; +const ExtendOpTraitsBase::GetExtendExprTy ExtendOpTraits< + SCEVZeroExtendExpr>::GetExtendExpr = &ScalarEvolution::getZeroExtendExpr; } // The recurrence AR has been shown to have no signed/unsigned wrap or something @@ -1346,8 +1349,7 @@ const ExtendOpTraitsBase::GetExtendExprTy // "sext/zext(PostIncAR)" template <typename ExtendOpTy> static const SCEV *getPreStartForExtend(const SCEVAddRecExpr *AR, Type *Ty, - ScalarEvolution *SE, - ScalarEvolution::ExtendCacheTy &Cache) { + ScalarEvolution *SE, unsigned Depth) { auto WrapType = ExtendOpTraits<ExtendOpTy>::WrapType; auto GetExtendExpr = ExtendOpTraits<ExtendOpTy>::GetExtendExpr; @@ -1394,9 +1396,9 @@ static const SCEV *getPreStartForExtend(const SCEVAddRecExpr *AR, Type *Ty, unsigned BitWidth = SE->getTypeSizeInBits(AR->getType()); Type *WideTy = IntegerType::get(SE->getContext(), BitWidth * 2); const SCEV *OperandExtendedStart = - SE->getAddExpr((SE->*GetExtendExpr)(PreStart, WideTy, Cache), - (SE->*GetExtendExpr)(Step, WideTy, Cache)); - if ((SE->*GetExtendExpr)(Start, WideTy, Cache) == OperandExtendedStart) { + SE->getAddExpr((SE->*GetExtendExpr)(PreStart, WideTy, Depth), + (SE->*GetExtendExpr)(Step, WideTy, Depth)); + if ((SE->*GetExtendExpr)(Start, WideTy, Depth) == OperandExtendedStart) { if (PreAR && AR->getNoWrapFlags(WrapType)) { // If we know `AR` == {`PreStart`+`Step`,+,`Step`} is `WrapType` (FlagNSW // or FlagNUW) and that `PreStart` + `Step` is `WrapType` too, then @@ -1422,16 +1424,16 @@ static const SCEV *getPreStartForExtend(const SCEVAddRecExpr *AR, Type *Ty, template <typename ExtendOpTy> static const SCEV *getExtendAddRecStart(const SCEVAddRecExpr *AR, Type *Ty, ScalarEvolution *SE, - ScalarEvolution::ExtendCacheTy &Cache) { + unsigned Depth) { auto GetExtendExpr = ExtendOpTraits<ExtendOpTy>::GetExtendExpr; - const SCEV *PreStart = getPreStartForExtend<ExtendOpTy>(AR, Ty, SE, Cache); + const SCEV *PreStart = getPreStartForExtend<ExtendOpTy>(AR, Ty, SE, Depth); if (!PreStart) - return (SE->*GetExtendExpr)(AR->getStart(), Ty, Cache); + return (SE->*GetExtendExpr)(AR->getStart(), Ty, Depth); - return SE->getAddExpr( - (SE->*GetExtendExpr)(AR->getStepRecurrence(*SE), Ty, Cache), - (SE->*GetExtendExpr)(PreStart, Ty, Cache)); + return SE->getAddExpr((SE->*GetExtendExpr)(AR->getStepRecurrence(*SE), Ty, + Depth), + (SE->*GetExtendExpr)(PreStart, Ty, Depth)); } // Try to prove away overflow by looking at "nearby" add recurrences. A @@ -1511,31 +1513,8 @@ bool ScalarEvolution::proveNoWrapByVaryingStart(const SCEV *Start, return false; } -const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op, Type *Ty) { - // Use the local cache to prevent exponential behavior of - // getZeroExtendExprImpl. - ExtendCacheTy Cache; - return getZeroExtendExprCached(Op, Ty, Cache); -} - -/// Query \p Cache before calling getZeroExtendExprImpl. 
If there is no -/// related entry in the \p Cache, call getZeroExtendExprImpl and save -/// the result in the \p Cache. -const SCEV *ScalarEvolution::getZeroExtendExprCached(const SCEV *Op, Type *Ty, - ExtendCacheTy &Cache) { - auto It = Cache.find({Op, Ty}); - if (It != Cache.end()) - return It->second; - const SCEV *ZExt = getZeroExtendExprImpl(Op, Ty, Cache); - auto InsertResult = Cache.insert({{Op, Ty}, ZExt}); - assert(InsertResult.second && "Expect the key was not in the cache"); - (void)InsertResult; - return ZExt; -} - -/// The real implementation of getZeroExtendExpr. -const SCEV *ScalarEvolution::getZeroExtendExprImpl(const SCEV *Op, Type *Ty, - ExtendCacheTy &Cache) { +const SCEV * +ScalarEvolution::getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth) { assert(getTypeSizeInBits(Op->getType()) < getTypeSizeInBits(Ty) && "This is not an extending conversion!"); assert(isSCEVable(Ty) && @@ -1545,11 +1524,11 @@ const SCEV *ScalarEvolution::getZeroExtendExprImpl(const SCEV *Op, Type *Ty, // Fold if the operand is constant. if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Op)) return getConstant( - cast<ConstantInt>(ConstantExpr::getZExt(SC->getValue(), Ty))); + cast<ConstantInt>(ConstantExpr::getZExt(SC->getValue(), Ty))); // zext(zext(x)) --> zext(x) if (const SCEVZeroExtendExpr *SZ = dyn_cast<SCEVZeroExtendExpr>(Op)) - return getZeroExtendExprCached(SZ->getOperand(), Ty, Cache); + return getZeroExtendExpr(SZ->getOperand(), Ty, Depth + 1); // Before doing any expensive analysis, check to see if we've already // computed a SCEV for this Op and Ty. @@ -1559,6 +1538,12 @@ const SCEV *ScalarEvolution::getZeroExtendExprImpl(const SCEV *Op, Type *Ty, ID.AddPointer(Ty); void *IP = nullptr; if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S; + if (Depth > MaxExtDepth) { + SCEV *S = new (SCEVAllocator) SCEVZeroExtendExpr(ID.Intern(SCEVAllocator), + Op, Ty); + UniqueSCEVs.InsertNode(S, IP); + return S; + } // zext(trunc(x)) --> zext(x) or x or trunc(x) if (const SCEVTruncateExpr *ST = dyn_cast<SCEVTruncateExpr>(Op)) { @@ -1593,8 +1578,8 @@ const SCEV *ScalarEvolution::getZeroExtendExprImpl(const SCEV *Op, Type *Ty, // we don't need to do any further analysis. if (AR->hasNoUnsignedWrap()) return getAddRecExpr( - getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this, Cache), - getZeroExtendExprCached(Step, Ty, Cache), L, AR->getNoWrapFlags()); + getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this, Depth + 1), + getZeroExtendExpr(Step, Ty, Depth + 1), L, AR->getNoWrapFlags()); // Check whether the backedge-taken count is SCEVCouldNotCompute. // Note that this serves two purposes: It filters out loops that are @@ -1618,22 +1603,29 @@ const SCEV *ScalarEvolution::getZeroExtendExprImpl(const SCEV *Op, Type *Ty, if (MaxBECount == RecastedMaxBECount) { Type *WideTy = IntegerType::get(getContext(), BitWidth * 2); // Check whether Start+Step*MaxBECount has no unsigned overflow. 
- const SCEV *ZMul = getMulExpr(CastedMaxBECount, Step); - const SCEV *ZAdd = - getZeroExtendExprCached(getAddExpr(Start, ZMul), WideTy, Cache); - const SCEV *WideStart = getZeroExtendExprCached(Start, WideTy, Cache); + const SCEV *ZMul = getMulExpr(CastedMaxBECount, Step, + SCEV::FlagAnyWrap, Depth + 1); + const SCEV *ZAdd = getZeroExtendExpr(getAddExpr(Start, ZMul, + SCEV::FlagAnyWrap, + Depth + 1), + WideTy, Depth + 1); + const SCEV *WideStart = getZeroExtendExpr(Start, WideTy, Depth + 1); const SCEV *WideMaxBECount = - getZeroExtendExprCached(CastedMaxBECount, WideTy, Cache); - const SCEV *OperandExtendedAdd = getAddExpr( - WideStart, getMulExpr(WideMaxBECount, getZeroExtendExprCached( - Step, WideTy, Cache))); + getZeroExtendExpr(CastedMaxBECount, WideTy, Depth + 1); + const SCEV *OperandExtendedAdd = + getAddExpr(WideStart, + getMulExpr(WideMaxBECount, + getZeroExtendExpr(Step, WideTy, Depth + 1), + SCEV::FlagAnyWrap, Depth + 1), + SCEV::FlagAnyWrap, Depth + 1); if (ZAdd == OperandExtendedAdd) { // Cache knowledge of AR NUW, which is propagated to this AddRec. const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNUW); // Return the expression with the addrec on the outside. return getAddRecExpr( - getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this, Cache), - getZeroExtendExprCached(Step, Ty, Cache), L, + getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this, + Depth + 1), + getZeroExtendExpr(Step, Ty, Depth + 1), L, AR->getNoWrapFlags()); } // Similar to above, only this time treat the step value as signed. @@ -1641,15 +1633,19 @@ const SCEV *ScalarEvolution::getZeroExtendExprImpl(const SCEV *Op, Type *Ty, OperandExtendedAdd = getAddExpr(WideStart, getMulExpr(WideMaxBECount, - getSignExtendExpr(Step, WideTy))); + getSignExtendExpr(Step, WideTy, Depth + 1), + SCEV::FlagAnyWrap, Depth + 1), + SCEV::FlagAnyWrap, Depth + 1); if (ZAdd == OperandExtendedAdd) { // Cache knowledge of AR NW, which is propagated to this AddRec. // Negative step causes unsigned wrap, but it still can't self-wrap. const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNW); // Return the expression with the addrec on the outside. return getAddRecExpr( - getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this, Cache), - getSignExtendExpr(Step, Ty), L, AR->getNoWrapFlags()); + getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this, + Depth + 1), + getSignExtendExpr(Step, Ty, Depth + 1), L, + AR->getNoWrapFlags()); } } } @@ -1680,8 +1676,9 @@ const SCEV *ScalarEvolution::getZeroExtendExprImpl(const SCEV *Op, Type *Ty, const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNUW); // Return the expression with the addrec on the outside. return getAddRecExpr( - getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this, Cache), - getZeroExtendExprCached(Step, Ty, Cache), L, + getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this, + Depth + 1), + getZeroExtendExpr(Step, Ty, Depth + 1), L, AR->getNoWrapFlags()); } } else if (isKnownNegative(Step)) { @@ -1697,8 +1694,10 @@ const SCEV *ScalarEvolution::getZeroExtendExprImpl(const SCEV *Op, Type *Ty, const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNW); // Return the expression with the addrec on the outside. 
return getAddRecExpr( - getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this, Cache), - getSignExtendExpr(Step, Ty), L, AR->getNoWrapFlags()); + getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this, + Depth + 1), + getSignExtendExpr(Step, Ty, Depth + 1), L, + AR->getNoWrapFlags()); } } } @@ -1706,8 +1705,8 @@ const SCEV *ScalarEvolution::getZeroExtendExprImpl(const SCEV *Op, Type *Ty, if (proveNoWrapByVaryingStart<SCEVZeroExtendExpr>(Start, Step, L)) { const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNUW); return getAddRecExpr( - getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this, Cache), - getZeroExtendExprCached(Step, Ty, Cache), L, AR->getNoWrapFlags()); + getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this, Depth + 1), + getZeroExtendExpr(Step, Ty, Depth + 1), L, AR->getNoWrapFlags()); } } @@ -1718,8 +1717,8 @@ const SCEV *ScalarEvolution::getZeroExtendExprImpl(const SCEV *Op, Type *Ty, // commute the zero extension with the addition operation. SmallVector<const SCEV *, 4> Ops; for (const auto *Op : SA->operands()) - Ops.push_back(getZeroExtendExprCached(Op, Ty, Cache)); - return getAddExpr(Ops, SCEV::FlagNUW); + Ops.push_back(getZeroExtendExpr(Op, Ty, Depth + 1)); + return getAddExpr(Ops, SCEV::FlagNUW, Depth + 1); } } @@ -1732,31 +1731,8 @@ const SCEV *ScalarEvolution::getZeroExtendExprImpl(const SCEV *Op, Type *Ty, return S; } -const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op, Type *Ty) { - // Use the local cache to prevent exponential behavior of - // getSignExtendExprImpl. - ExtendCacheTy Cache; - return getSignExtendExprCached(Op, Ty, Cache); -} - -/// Query \p Cache before calling getSignExtendExprImpl. If there is no -/// related entry in the \p Cache, call getSignExtendExprImpl and save -/// the result in the \p Cache. -const SCEV *ScalarEvolution::getSignExtendExprCached(const SCEV *Op, Type *Ty, - ExtendCacheTy &Cache) { - auto It = Cache.find({Op, Ty}); - if (It != Cache.end()) - return It->second; - const SCEV *SExt = getSignExtendExprImpl(Op, Ty, Cache); - auto InsertResult = Cache.insert({{Op, Ty}, SExt}); - assert(InsertResult.second && "Expect the key was not in the cache"); - (void)InsertResult; - return SExt; -} - -/// The real implementation of getSignExtendExpr. -const SCEV *ScalarEvolution::getSignExtendExprImpl(const SCEV *Op, Type *Ty, - ExtendCacheTy &Cache) { +const SCEV * +ScalarEvolution::getSignExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth) { assert(getTypeSizeInBits(Op->getType()) < getTypeSizeInBits(Ty) && "This is not an extending conversion!"); assert(isSCEVable(Ty) && @@ -1766,15 +1742,15 @@ const SCEV *ScalarEvolution::getSignExtendExprImpl(const SCEV *Op, Type *Ty, // Fold if the operand is constant. if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Op)) return getConstant( - cast<ConstantInt>(ConstantExpr::getSExt(SC->getValue(), Ty))); + cast<ConstantInt>(ConstantExpr::getSExt(SC->getValue(), Ty))); // sext(sext(x)) --> sext(x) if (const SCEVSignExtendExpr *SS = dyn_cast<SCEVSignExtendExpr>(Op)) - return getSignExtendExprCached(SS->getOperand(), Ty, Cache); + return getSignExtendExpr(SS->getOperand(), Ty, Depth + 1); // sext(zext(x)) --> zext(x) if (const SCEVZeroExtendExpr *SZ = dyn_cast<SCEVZeroExtendExpr>(Op)) - return getZeroExtendExpr(SZ->getOperand(), Ty); + return getZeroExtendExpr(SZ->getOperand(), Ty, Depth + 1); // Before doing any expensive analysis, check to see if we've already // computed a SCEV for this Op and Ty. 
@@ -1784,6 +1760,13 @@ const SCEV *ScalarEvolution::getSignExtendExprImpl(const SCEV *Op, Type *Ty, ID.AddPointer(Ty); void *IP = nullptr; if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S; + // Limit recursion depth. + if (Depth > MaxExtDepth) { + SCEV *S = new (SCEVAllocator) SCEVSignExtendExpr(ID.Intern(SCEVAllocator), + Op, Ty); + UniqueSCEVs.InsertNode(S, IP); + return S; + } // sext(trunc(x)) --> sext(x) or x or trunc(x) if (const SCEVTruncateExpr *ST = dyn_cast<SCEVTruncateExpr>(Op)) { @@ -1809,8 +1792,9 @@ const SCEV *ScalarEvolution::getSignExtendExprImpl(const SCEV *Op, Type *Ty, const APInt &C2 = SC2->getAPInt(); if (C1.isStrictlyPositive() && C2.isStrictlyPositive() && C2.ugt(C1) && C2.isPowerOf2()) - return getAddExpr(getSignExtendExprCached(SC1, Ty, Cache), - getSignExtendExprCached(SMul, Ty, Cache)); + return getAddExpr(getSignExtendExpr(SC1, Ty, Depth + 1), + getSignExtendExpr(SMul, Ty, Depth + 1), + SCEV::FlagAnyWrap, Depth + 1); } } } @@ -1821,8 +1805,8 @@ const SCEV *ScalarEvolution::getSignExtendExprImpl(const SCEV *Op, Type *Ty, // commute the sign extension with the addition operation. SmallVector<const SCEV *, 4> Ops; for (const auto *Op : SA->operands()) - Ops.push_back(getSignExtendExprCached(Op, Ty, Cache)); - return getAddExpr(Ops, SCEV::FlagNSW); + Ops.push_back(getSignExtendExpr(Op, Ty, Depth + 1)); + return getAddExpr(Ops, SCEV::FlagNSW, Depth + 1); } } // If the input value is a chrec scev, and we can prove that the value @@ -1845,8 +1829,8 @@ const SCEV *ScalarEvolution::getSignExtendExprImpl(const SCEV *Op, Type *Ty, // we don't need to do any further analysis. if (AR->hasNoSignedWrap()) return getAddRecExpr( - getExtendAddRecStart<SCEVSignExtendExpr>(AR, Ty, this, Cache), - getSignExtendExprCached(Step, Ty, Cache), L, SCEV::FlagNSW); + getExtendAddRecStart<SCEVSignExtendExpr>(AR, Ty, this, Depth + 1), + getSignExtendExpr(Step, Ty, Depth + 1), L, SCEV::FlagNSW); // Check whether the backedge-taken count is SCEVCouldNotCompute. // Note that this serves two purposes: It filters out loops that are @@ -1870,22 +1854,29 @@ const SCEV *ScalarEvolution::getSignExtendExprImpl(const SCEV *Op, Type *Ty, if (MaxBECount == RecastedMaxBECount) { Type *WideTy = IntegerType::get(getContext(), BitWidth * 2); // Check whether Start+Step*MaxBECount has no signed overflow. - const SCEV *SMul = getMulExpr(CastedMaxBECount, Step); - const SCEV *SAdd = - getSignExtendExprCached(getAddExpr(Start, SMul), WideTy, Cache); - const SCEV *WideStart = getSignExtendExprCached(Start, WideTy, Cache); + const SCEV *SMul = getMulExpr(CastedMaxBECount, Step, + SCEV::FlagAnyWrap, Depth + 1); + const SCEV *SAdd = getSignExtendExpr(getAddExpr(Start, SMul, + SCEV::FlagAnyWrap, + Depth + 1), + WideTy, Depth + 1); + const SCEV *WideStart = getSignExtendExpr(Start, WideTy, Depth + 1); const SCEV *WideMaxBECount = - getZeroExtendExpr(CastedMaxBECount, WideTy); - const SCEV *OperandExtendedAdd = getAddExpr( - WideStart, getMulExpr(WideMaxBECount, getSignExtendExprCached( - Step, WideTy, Cache))); + getZeroExtendExpr(CastedMaxBECount, WideTy, Depth + 1); + const SCEV *OperandExtendedAdd = + getAddExpr(WideStart, + getMulExpr(WideMaxBECount, + getSignExtendExpr(Step, WideTy, Depth + 1), + SCEV::FlagAnyWrap, Depth + 1), + SCEV::FlagAnyWrap, Depth + 1); if (SAdd == OperandExtendedAdd) { // Cache knowledge of AR NSW, which is propagated to this AddRec. const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNSW); // Return the expression with the addrec on the outside. 
return getAddRecExpr( - getExtendAddRecStart<SCEVSignExtendExpr>(AR, Ty, this, Cache), - getSignExtendExprCached(Step, Ty, Cache), L, + getExtendAddRecStart<SCEVSignExtendExpr>(AR, Ty, this, + Depth + 1), + getSignExtendExpr(Step, Ty, Depth + 1), L, AR->getNoWrapFlags()); } // Similar to above, only this time treat the step value as unsigned. @@ -1893,7 +1884,9 @@ const SCEV *ScalarEvolution::getSignExtendExprImpl(const SCEV *Op, Type *Ty, OperandExtendedAdd = getAddExpr(WideStart, getMulExpr(WideMaxBECount, - getZeroExtendExpr(Step, WideTy))); + getZeroExtendExpr(Step, WideTy, Depth + 1), + SCEV::FlagAnyWrap, Depth + 1), + SCEV::FlagAnyWrap, Depth + 1); if (SAdd == OperandExtendedAdd) { // If AR wraps around then // @@ -1907,8 +1900,10 @@ const SCEV *ScalarEvolution::getSignExtendExprImpl(const SCEV *Op, Type *Ty, // Return the expression with the addrec on the outside. return getAddRecExpr( - getExtendAddRecStart<SCEVSignExtendExpr>(AR, Ty, this, Cache), - getZeroExtendExpr(Step, Ty), L, AR->getNoWrapFlags()); + getExtendAddRecStart<SCEVSignExtendExpr>(AR, Ty, this, + Depth + 1), + getZeroExtendExpr(Step, Ty, Depth + 1), L, + AR->getNoWrapFlags()); } } } @@ -1939,9 +1934,8 @@ const SCEV *ScalarEvolution::getSignExtendExprImpl(const SCEV *Op, Type *Ty, // Cache knowledge of AR NSW, then propagate NSW to the wide AddRec. const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNSW); return getAddRecExpr( - getExtendAddRecStart<SCEVSignExtendExpr>(AR, Ty, this, Cache), - getSignExtendExprCached(Step, Ty, Cache), L, - AR->getNoWrapFlags()); + getExtendAddRecStart<SCEVSignExtendExpr>(AR, Ty, this, Depth + 1), + getSignExtendExpr(Step, Ty, Depth + 1), L, AR->getNoWrapFlags()); } } @@ -1955,25 +1949,26 @@ const SCEV *ScalarEvolution::getSignExtendExprImpl(const SCEV *Op, Type *Ty, const APInt &C2 = SC2->getAPInt(); if (C1.isStrictlyPositive() && C2.isStrictlyPositive() && C2.ugt(C1) && C2.isPowerOf2()) { - Start = getSignExtendExprCached(Start, Ty, Cache); + Start = getSignExtendExpr(Start, Ty, Depth + 1); const SCEV *NewAR = getAddRecExpr(getZero(AR->getType()), Step, L, AR->getNoWrapFlags()); - return getAddExpr(Start, getSignExtendExprCached(NewAR, Ty, Cache)); + return getAddExpr(Start, getSignExtendExpr(NewAR, Ty, Depth + 1), + SCEV::FlagAnyWrap, Depth + 1); } } if (proveNoWrapByVaryingStart<SCEVSignExtendExpr>(Start, Step, L)) { const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNSW); return getAddRecExpr( - getExtendAddRecStart<SCEVSignExtendExpr>(AR, Ty, this, Cache), - getSignExtendExprCached(Step, Ty, Cache), L, AR->getNoWrapFlags()); + getExtendAddRecStart<SCEVSignExtendExpr>(AR, Ty, this, Depth + 1), + getSignExtendExpr(Step, Ty, Depth + 1), L, AR->getNoWrapFlags()); } } // If the input value is provably positive and we could not simplify // away the sext build a zext instead. if (isKnownNonNegative(Op)) - return getZeroExtendExpr(Op, Ty); + return getZeroExtendExpr(Op, Ty, Depth + 1); // The cast wasn't folded; create an explicit cast node. // Recompute the insert position, as it may have been invalidated. 
diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp index 92328f6e5efd..f938a9a52065 100644 --- a/lib/Analysis/TargetTransformInfo.cpp +++ b/lib/Analysis/TargetTransformInfo.cpp @@ -89,8 +89,9 @@ TargetTransformInfo::getEstimatedNumberOfCaseClusters(const SwitchInst &SI, return TTIImpl->getEstimatedNumberOfCaseClusters(SI, JTSize); } -int TargetTransformInfo::getUserCost(const User *U) const { - int Cost = TTIImpl->getUserCost(U); +int TargetTransformInfo::getUserCost(const User *U, + ArrayRef<const Value *> Operands) const { + int Cost = TTIImpl->getUserCost(U, Operands); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } @@ -116,8 +117,8 @@ bool TargetTransformInfo::isLoweredToCall(const Function *F) const { } void TargetTransformInfo::getUnrollingPreferences( - Loop *L, UnrollingPreferences &UP) const { - return TTIImpl->getUnrollingPreferences(L, UP); + Loop *L, ScalarEvolution &SE, UnrollingPreferences &UP) const { + return TTIImpl->getUnrollingPreferences(L, SE, UP); } bool TargetTransformInfo::isLegalAddImmediate(int64_t Imm) const { diff --git a/lib/Analysis/TypeBasedAliasAnalysis.cpp b/lib/Analysis/TypeBasedAliasAnalysis.cpp index cd9972ab56a6..86c528de267a 100644 --- a/lib/Analysis/TypeBasedAliasAnalysis.cpp +++ b/lib/Analysis/TypeBasedAliasAnalysis.cpp @@ -23,10 +23,10 @@ // // The scalar TBAA metadata format is very simple. TBAA MDNodes have up to // three fields, e.g.: -// !0 = metadata !{ metadata !"an example type tree" } -// !1 = metadata !{ metadata !"int", metadata !0 } -// !2 = metadata !{ metadata !"float", metadata !0 } -// !3 = metadata !{ metadata !"const float", metadata !2, i64 1 } +// !0 = !{ !"an example type tree" } +// !1 = !{ !"int", !0 } +// !2 = !{ !"float", !0 } +// !3 = !{ !"const float", !2, i64 1 } // // The first field is an identity field. It can be any value, usually // an MDString, which uniquely identifies the type. The most important @@ -74,13 +74,13 @@ // instruction. The base type is !4 (struct B), the access type is !2 (scalar // type short) and the offset is 4. // -// !0 = metadata !{metadata !"Simple C/C++ TBAA"} -// !1 = metadata !{metadata !"omnipotent char", metadata !0} // Scalar type node -// !2 = metadata !{metadata !"short", metadata !1} // Scalar type node -// !3 = metadata !{metadata !"A", metadata !2, i64 0} // Struct type node -// !4 = metadata !{metadata !"B", metadata !2, i64 0, metadata !3, i64 4} +// !0 = !{!"Simple C/C++ TBAA"} +// !1 = !{!"omnipotent char", !0} // Scalar type node +// !2 = !{!"short", !1} // Scalar type node +// !3 = !{!"A", !2, i64 0} // Struct type node +// !4 = !{!"B", !2, i64 0, !3, i64 4} // // Struct type node -// !5 = metadata !{metadata !4, metadata !2, i64 4} // Path tag node +// !5 = !{!4, !2, i64 4} // Path tag node // // The struct type nodes and the scalar type nodes form a type DAG. // Root (!0) diff --git a/lib/BinaryFormat/Magic.cpp b/lib/BinaryFormat/Magic.cpp index f24f22c88a8a..b19a07a9066b 100644 --- a/lib/BinaryFormat/Magic.cpp +++ b/lib/BinaryFormat/Magic.cpp @@ -191,8 +191,8 @@ file_magic llvm::identify_magic(StringRef Magic) { } break; - case 0x64: // x86-64 Windows. - if (Magic[1] == char(0x86)) + case 0x64: // x86-64 or ARM64 Windows. 
+ if (Magic[1] == char(0x86) || Magic[1] == char(0xaa)) return file_magic::coff_object; break; diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp index 0629c2d326ae..1ebef3173135 100644 --- a/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/lib/Bitcode/Reader/BitcodeReader.cpp @@ -5360,8 +5360,9 @@ const std::error_category &llvm::BitcodeErrorCategory() { return *ErrorCategory; } -static Expected<StringRef> readStrtab(BitstreamCursor &Stream) { - if (Stream.EnterSubBlock(bitc::STRTAB_BLOCK_ID)) +static Expected<StringRef> readBlobInRecord(BitstreamCursor &Stream, + unsigned Block, unsigned RecordID) { + if (Stream.EnterSubBlock(Block)) return error("Invalid record"); StringRef Strtab; @@ -5382,7 +5383,7 @@ static Expected<StringRef> readStrtab(BitstreamCursor &Stream) { case BitstreamEntry::Record: StringRef Blob; SmallVector<uint64_t, 1> Record; - if (Stream.readRecord(Entry.ID, Record, &Blob) == bitc::STRTAB_BLOB) + if (Stream.readRecord(Entry.ID, Record, &Blob) == RecordID) Strtab = Blob; break; } @@ -5450,7 +5451,8 @@ llvm::getBitcodeFileContents(MemoryBufferRef Buffer) { } if (Entry.ID == bitc::STRTAB_BLOCK_ID) { - Expected<StringRef> Strtab = readStrtab(Stream); + Expected<StringRef> Strtab = + readBlobInRecord(Stream, bitc::STRTAB_BLOCK_ID, bitc::STRTAB_BLOB); if (!Strtab) return Strtab.takeError(); // This string table is used by every preceding bitcode module that does @@ -5462,6 +5464,28 @@ llvm::getBitcodeFileContents(MemoryBufferRef Buffer) { break; I->Strtab = *Strtab; } + // Similarly, the string table is used by every preceding symbol table; + // normally there will be just one unless the bitcode file was created + // by binary concatenation. + if (!F.Symtab.empty() && F.StrtabForSymtab.empty()) + F.StrtabForSymtab = *Strtab; + continue; + } + + if (Entry.ID == bitc::SYMTAB_BLOCK_ID) { + Expected<StringRef> SymtabOrErr = + readBlobInRecord(Stream, bitc::SYMTAB_BLOCK_ID, bitc::SYMTAB_BLOB); + if (!SymtabOrErr) + return SymtabOrErr.takeError(); + + // We can expect the bitcode file to have multiple symbol tables if it + // was created by binary concatenation. In that case we silently + // ignore any subsequent symbol tables, which is fine because this is a + // low level function. The client is expected to notice that the number + // of modules in the symbol table does not match the number of modules + // in the input file and regenerate the symbol table. + if (F.Symtab.empty()) + F.Symtab = *SymtabOrErr; continue; } diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp index feeba31908ae..b2b1ea6de374 100644 --- a/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -29,10 +29,12 @@ #include "llvm/IR/UseListOrder.h" #include "llvm/IR/ValueSymbolTable.h" #include "llvm/MC/StringTableBuilder.h" +#include "llvm/Object/IRSymtab.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/Program.h" #include "llvm/Support/SHA1.h" +#include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" #include <cctype> #include <map> @@ -3820,6 +3822,38 @@ void BitcodeWriter::writeBlob(unsigned Block, unsigned Record, StringRef Blob) { Stream->ExitBlock(); } +void BitcodeWriter::writeSymtab() { + assert(!WroteStrtab && !WroteSymtab); + + // If any module has module-level inline asm, we will require a registered asm + // parser for the target so that we can create an accurate symbol table for + // the module. 
+ for (Module *M : Mods) { + if (M->getModuleInlineAsm().empty()) + continue; + + std::string Err; + const Triple TT(M->getTargetTriple()); + const Target *T = TargetRegistry::lookupTarget(TT.str(), Err); + if (!T || !T->hasMCAsmParser()) + return; + } + + WroteSymtab = true; + SmallVector<char, 0> Symtab; + // The irsymtab::build function may be unable to create a symbol table if the + // module is malformed (e.g. it contains an invalid alias). Writing a symbol + // table is not required for correctness, but we still want to be able to + // write malformed modules to bitcode files, so swallow the error. + if (Error E = irsymtab::build(Mods, Symtab, StrtabBuilder, Alloc)) { + consumeError(std::move(E)); + return; + } + + writeBlob(bitc::SYMTAB_BLOCK_ID, bitc::SYMTAB_BLOB, + {Symtab.data(), Symtab.size()}); +} + void BitcodeWriter::writeStrtab() { assert(!WroteStrtab); @@ -3843,6 +3877,15 @@ void BitcodeWriter::writeModule(const Module *M, bool ShouldPreserveUseListOrder, const ModuleSummaryIndex *Index, bool GenerateHash, ModuleHash *ModHash) { + assert(!WroteStrtab); + + // The Mods vector is used by irsymtab::build, which requires non-const + // Modules in case it needs to materialize metadata. But the bitcode writer + // requires that the module is materialized, so we can cast to non-const here, + // after checking that it is in fact materialized. + assert(M->isMaterialized()); + Mods.push_back(const_cast<Module *>(M)); + ModuleBitcodeWriter ModuleWriter(M, Buffer, StrtabBuilder, *Stream, ShouldPreserveUseListOrder, Index, GenerateHash, ModHash); @@ -3875,6 +3918,7 @@ void llvm::WriteBitcodeToFile(const Module *M, raw_ostream &Out, BitcodeWriter Writer(Buffer); Writer.writeModule(M, ShouldPreserveUseListOrder, Index, GenerateHash, ModHash); + Writer.writeSymtab(); Writer.writeStrtab(); if (TT.isOSDarwin() || TT.isOSBinFormatMachO()) diff --git a/lib/Bitcode/Writer/LLVMBuild.txt b/lib/Bitcode/Writer/LLVMBuild.txt index a07c280fa9e3..ef6dc9f901e2 100644 --- a/lib/Bitcode/Writer/LLVMBuild.txt +++ b/lib/Bitcode/Writer/LLVMBuild.txt @@ -19,4 +19,4 @@ type = Library name = BitWriter parent = Bitcode -required_libraries = Analysis Core MC Support +required_libraries = Analysis Core MC Object Support diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index c48fcaa7b0d1..ff427c9a0d75 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -631,7 +631,9 @@ void AsmPrinter::EmitFunctionHeader() { const Function *F = MF->getFunction(); if (isVerbose()) - OutStreamer->GetCommentOS() << "-- Begin function " << F->getName() << '\n'; + OutStreamer->GetCommentOS() + << "-- Begin function " + << GlobalValue::dropLLVMManglingEscape(F->getName()) << '\n'; // Print out constants referenced by the function EmitConstantPool(); diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp index e94616fd5900..a81d56e9618b 100644 --- a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp +++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp @@ -365,7 +365,7 @@ static void addLocIfNotPresent(SmallVectorImpl<const DILocation *> &Locs, void CodeViewDebug::maybeRecordLocation(const DebugLoc &DL, const MachineFunction *MF) { // Skip this instruction if it has the same location as the previous one. 
- if (DL == CurFn->LastLoc) + if (!DL || DL == PrevInstLoc) return; const DIScope *Scope = DL.get()->getScope(); @@ -385,11 +385,11 @@ void CodeViewDebug::maybeRecordLocation(const DebugLoc &DL, if (!CurFn->HaveLineInfo) CurFn->HaveLineInfo = true; unsigned FileId = 0; - if (CurFn->LastLoc.get() && CurFn->LastLoc->getFile() == DL->getFile()) + if (PrevInstLoc.get() && PrevInstLoc->getFile() == DL->getFile()) FileId = CurFn->LastFileId; else FileId = CurFn->LastFileId = maybeRecordFile(DL->getFile()); - CurFn->LastLoc = DL; + PrevInstLoc = DL; unsigned FuncId = CurFn->FuncId; if (const DILocation *SiteLoc = DL->getInlinedAt()) { @@ -2150,9 +2150,23 @@ void CodeViewDebug::beginInstruction(const MachineInstr *MI) { if (!Asm || !CurFn || MI->isDebugValue() || MI->getFlag(MachineInstr::FrameSetup)) return; + + // If the first instruction of a new MBB has no location, find the first + // instruction with a location and use that. DebugLoc DL = MI->getDebugLoc(); - if (DL == PrevInstLoc || !DL) + if (!DL && MI->getParent() != PrevInstBB) { + for (const auto &NextMI : *MI->getParent()) { + DL = NextMI.getDebugLoc(); + if (DL) + break; + } + } + PrevInstBB = MI->getParent(); + + // If we still don't have a debug location, don't record a location. + if (!DL) return; + maybeRecordLocation(DL, Asm->MF); } diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.h b/lib/CodeGen/AsmPrinter/CodeViewDebug.h index 2cd495aec6dc..fd8f60425c24 100644 --- a/lib/CodeGen/AsmPrinter/CodeViewDebug.h +++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.h @@ -118,7 +118,6 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase { SmallVector<LocalVariable, 1> Locals; - DebugLoc LastLoc; const MCSymbol *Begin = nullptr; const MCSymbol *End = nullptr; unsigned FuncId = 0; diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index dc39d1e6cb52..d4a90eeabe15 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -245,17 +245,6 @@ void DwarfCompileUnit::addRange(RangeSpan Range) { CURanges.back().setEnd(Range.getEnd()); } -DIE::value_iterator -DwarfCompileUnit::addSectionLabel(DIE &Die, dwarf::Attribute Attribute, - const MCSymbol *Label, const MCSymbol *Sec) { - if (Asm->MAI->doesDwarfUseRelocationsAcrossSections()) - return addLabel(Die, Attribute, - DD->getDwarfVersion() >= 4 ? dwarf::DW_FORM_sec_offset - : dwarf::DW_FORM_data4, - Label); - return addSectionDelta(Die, Attribute, Label, Sec); -} - void DwarfCompileUnit::initStmtList() { // Define start line table label for each Compile Unit. MCSymbol *LineTableStartSym = @@ -380,15 +369,6 @@ void DwarfCompileUnit::constructScopeDIE( FinalChildren.push_back(std::move(ScopeDIE)); } -DIE::value_iterator -DwarfCompileUnit::addSectionDelta(DIE &Die, dwarf::Attribute Attribute, - const MCSymbol *Hi, const MCSymbol *Lo) { - return Die.addValue(DIEValueAllocator, Attribute, - DD->getDwarfVersion() >= 4 ? 
dwarf::DW_FORM_sec_offset - : dwarf::DW_FORM_data4, - new (DIEValueAllocator) DIEDelta(Hi, Lo)); -} - void DwarfCompileUnit::addScopeRangeList(DIE &ScopeDIE, SmallVector<RangeSpan, 2> Range) { const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering(); diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h index 3c2fb8d99db7..e38672792867 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h +++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h @@ -127,10 +127,6 @@ public: void addLocalLabelAddress(DIE &Die, dwarf::Attribute Attribute, const MCSymbol *Label); - /// addSectionDelta - Add a label delta attribute data and value. - DIE::value_iterator addSectionDelta(DIE &Die, dwarf::Attribute Attribute, - const MCSymbol *Hi, const MCSymbol *Lo); - DwarfCompileUnit &getCU() override { return *this; } unsigned getOrCreateSourceID(StringRef FileName, StringRef DirName) override; @@ -151,12 +147,6 @@ public: void attachLowHighPC(DIE &D, const MCSymbol *Begin, const MCSymbol *End); - /// addSectionLabel - Add a Dwarf section label attribute data and value. - /// - DIE::value_iterator addSectionLabel(DIE &Die, dwarf::Attribute Attribute, - const MCSymbol *Label, - const MCSymbol *Sec); - /// \brief Find DIE for the given subprogram and attach appropriate /// DW_AT_low_pc and DW_AT_high_pc attributes. If there are global /// variables in this scope then create and insert DIEs for these diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index 708f5f7536ff..4f4ebfc56297 100644 --- a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -1587,6 +1587,26 @@ void DwarfTypeUnit::emitHeader(bool UseOffsets) { sizeof(Ty->getOffset())); } +DIE::value_iterator +DwarfUnit::addSectionDelta(DIE &Die, dwarf::Attribute Attribute, + const MCSymbol *Hi, const MCSymbol *Lo) { + return Die.addValue(DIEValueAllocator, Attribute, + DD->getDwarfVersion() >= 4 ? dwarf::DW_FORM_sec_offset + : dwarf::DW_FORM_data4, + new (DIEValueAllocator) DIEDelta(Hi, Lo)); +} + +DIE::value_iterator +DwarfUnit::addSectionLabel(DIE &Die, dwarf::Attribute Attribute, + const MCSymbol *Label, const MCSymbol *Sec) { + if (Asm->MAI->doesDwarfUseRelocationsAcrossSections()) + return addLabel(Die, Attribute, + DD->getDwarfVersion() >= 4 ? dwarf::DW_FORM_sec_offset + : dwarf::DW_FORM_data4, + Label); + return addSectionDelta(Die, Attribute, Label, Sec); +} + bool DwarfTypeUnit::isDwoUnit() const { // Since there are no skeleton type units, all type units are dwo type units // when split DWARF is being used. diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.h b/lib/CodeGen/AsmPrinter/DwarfUnit.h index 7acad2cbd89f..4cc01b3298d4 100644 --- a/lib/CodeGen/AsmPrinter/DwarfUnit.h +++ b/lib/CodeGen/AsmPrinter/DwarfUnit.h @@ -291,6 +291,15 @@ public: void constructTypeDIE(DIE &Buffer, const DICompositeType *CTy); + /// addSectionDelta - Add a label delta attribute data and value. + DIE::value_iterator addSectionDelta(DIE &Die, dwarf::Attribute Attribute, + const MCSymbol *Hi, const MCSymbol *Lo); + + /// Add a Dwarf section label attribute data and value. 
+ DIE::value_iterator addSectionLabel(DIE &Die, dwarf::Attribute Attribute, + const MCSymbol *Label, + const MCSymbol *Sec); + protected: ~DwarfUnit(); diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp index cb31c21293f4..b50e76f2e3ba 100644 --- a/lib/CodeGen/CodeGenPrepare.cpp +++ b/lib/CodeGen/CodeGenPrepare.cpp @@ -1662,6 +1662,7 @@ class MemCmpExpansion { PHINode *PhiRes; bool IsUsedForZeroCmp; const DataLayout &DL; + IRBuilder<> Builder; unsigned calculateNumBlocks(unsigned Size); void createLoadCmpBlocks(); @@ -1671,13 +1672,14 @@ class MemCmpExpansion { void emitLoadCompareBlock(unsigned Index, unsigned LoadSize, unsigned GEPIndex); Value *getCompareLoadPairs(unsigned Index, unsigned Size, - unsigned &NumBytesProcessed, IRBuilder<> &Builder); + unsigned &NumBytesProcessed); void emitLoadCompareBlockMultipleLoads(unsigned Index, unsigned Size, unsigned &NumBytesProcessed); void emitLoadCompareByteBlock(unsigned Index, unsigned GEPIndex); void emitMemCmpResultBlock(); Value *getMemCmpExpansionZeroCase(unsigned Size); Value *getMemCmpEqZeroOneBlock(unsigned Size); + Value *getMemCmpOneBlock(unsigned Size); unsigned getLoadSize(unsigned Size); unsigned getNumLoads(unsigned Size); @@ -1702,7 +1704,7 @@ MemCmpExpansion::MemCmpExpansion(CallInst *CI, uint64_t Size, unsigned MaxLoadSize, unsigned LoadsPerBlock, const DataLayout &TheDataLayout) : CI(CI), MaxLoadSize(MaxLoadSize), NumLoadsPerBlock(LoadsPerBlock), - DL(TheDataLayout) { + DL(TheDataLayout), Builder(CI) { // A memcmp with zero-comparison with only one block of load and compare does // not need to set up any extra blocks. This case could be handled in the DAG, @@ -1710,7 +1712,7 @@ MemCmpExpansion::MemCmpExpansion(CallInst *CI, uint64_t Size, // we choose to handle this case too to avoid fragmented lowering. IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI); NumBlocks = calculateNumBlocks(Size); - if (!IsUsedForZeroCmp || NumBlocks != 1) { + if ((!IsUsedForZeroCmp && NumLoadsPerBlock != 1) || NumBlocks != 1) { BasicBlock *StartBlock = CI->getParent(); EndBlock = StartBlock->splitBasicBlock(CI, "endblock"); setupEndBlockPHINodes(); @@ -1731,7 +1733,6 @@ MemCmpExpansion::MemCmpExpansion(CallInst *CI, uint64_t Size, StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]); } - IRBuilder<> Builder(CI->getContext()); Builder.SetCurrentDebugLocation(CI->getDebugLoc()); } @@ -1754,8 +1755,6 @@ void MemCmpExpansion::createResultBlock() { // final phi node for selecting the memcmp result. void MemCmpExpansion::emitLoadCompareByteBlock(unsigned Index, unsigned GEPIndex) { - IRBuilder<> Builder(CI->getContext()); - Value *Source1 = CI->getArgOperand(0); Value *Source2 = CI->getArgOperand(1); @@ -1811,8 +1810,7 @@ unsigned MemCmpExpansion::getLoadSize(unsigned Size) { /// This is used in the case where the memcmp() call is compared equal or not /// equal to zero. 
Value *MemCmpExpansion::getCompareLoadPairs(unsigned Index, unsigned Size, - unsigned &NumBytesProcessed, - IRBuilder<> &Builder) { + unsigned &NumBytesProcessed) { std::vector<Value *> XorList, OrList; Value *Diff; @@ -1910,8 +1908,7 @@ Value *MemCmpExpansion::getCompareLoadPairs(unsigned Index, unsigned Size, void MemCmpExpansion::emitLoadCompareBlockMultipleLoads( unsigned Index, unsigned Size, unsigned &NumBytesProcessed) { - IRBuilder<> Builder(CI->getContext()); - Value *Cmp = getCompareLoadPairs(Index, Size, NumBytesProcessed, Builder); + Value *Cmp = getCompareLoadPairs(Index, Size, NumBytesProcessed); BasicBlock *NextBB = (Index == (LoadCmpBlocks.size() - 1)) ? EndBlock @@ -1946,8 +1943,6 @@ void MemCmpExpansion::emitLoadCompareBlock(unsigned Index, unsigned LoadSize, return; } - IRBuilder<> Builder(CI->getContext()); - Type *LoadSizeType = IntegerType::get(CI->getContext(), LoadSize * 8); Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); assert(LoadSize <= MaxLoadSize && "Unexpected load type"); @@ -1975,9 +1970,7 @@ void MemCmpExpansion::emitLoadCompareBlock(unsigned Index, unsigned LoadSize, Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); if (DL.isLittleEndian()) { - Function *F = LoadCmpBlocks[Index]->getParent(); - - Function *Bswap = Intrinsic::getDeclaration(F->getParent(), + Function *Bswap = Intrinsic::getDeclaration(CI->getModule(), Intrinsic::bswap, LoadSizeType); LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1); LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2); @@ -1995,16 +1988,13 @@ void MemCmpExpansion::emitLoadCompareBlock(unsigned Index, unsigned LoadSize, ResBlock.PhiSrc2->addIncoming(LoadSrc2, LoadCmpBlocks[Index]); } - Value *Diff = Builder.CreateSub(LoadSrc1, LoadSrc2); - - Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff, - ConstantInt::get(Diff->getType(), 0)); + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, LoadSrc1, LoadSrc2); BasicBlock *NextBB = (Index == (LoadCmpBlocks.size() - 1)) ? EndBlock : LoadCmpBlocks[Index + 1]; // Early exit branch if difference found to ResultBlock. Otherwise, continue // to next LoadCmpBlock or EndBlock. - BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp); + BranchInst *CmpBr = BranchInst::Create(NextBB, ResBlock.BB, Cmp); Builder.Insert(CmpBr); // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0 @@ -2020,8 +2010,6 @@ void MemCmpExpansion::emitLoadCompareBlock(unsigned Index, unsigned LoadSize, // memcmp result. It compares the two loaded source values and returns -1 if // src1 < src2 and 1 if src1 > src2. void MemCmpExpansion::emitMemCmpResultBlock() { - IRBuilder<> Builder(CI->getContext()); - // Special case: if memcmp result is used in a zero equality, result does not // need to be calculated and can simply return 1. if (IsUsedForZeroCmp) { @@ -2070,7 +2058,6 @@ unsigned MemCmpExpansion::calculateNumBlocks(unsigned Size) { } void MemCmpExpansion::setupResultBlockPHINodes() { - IRBuilder<> Builder(CI->getContext()); Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); Builder.SetInsertPoint(ResBlock.BB); ResBlock.PhiSrc1 = @@ -2080,8 +2067,6 @@ void MemCmpExpansion::setupResultBlockPHINodes() { } void MemCmpExpansion::setupEndBlockPHINodes() { - IRBuilder<> Builder(CI->getContext()); - Builder.SetInsertPoint(&EndBlock->front()); PhiRes = Builder.CreatePHI(Type::getInt32Ty(CI->getContext()), 2, "phi.res"); } @@ -2102,11 +2087,45 @@ Value *MemCmpExpansion::getMemCmpExpansionZeroCase(unsigned Size) { /// in the general case. 
Value *MemCmpExpansion::getMemCmpEqZeroOneBlock(unsigned Size) { unsigned NumBytesProcessed = 0; - IRBuilder<> Builder(CI->getContext()); - Value *Cmp = getCompareLoadPairs(0, Size, NumBytesProcessed, Builder); + Value *Cmp = getCompareLoadPairs(0, Size, NumBytesProcessed); return Builder.CreateZExt(Cmp, Type::getInt32Ty(CI->getContext())); } +/// A memcmp expansion that only has one block of load and compare can bypass +/// the compare, branch, and phi IR that is required in the general case. +Value *MemCmpExpansion::getMemCmpOneBlock(unsigned Size) { + assert(NumLoadsPerBlock == 1 && "Only handles one load pair per block"); + + Type *LoadSizeType = IntegerType::get(CI->getContext(), Size * 8); + Value *Source1 = CI->getArgOperand(0); + Value *Source2 = CI->getArgOperand(1); + + // Cast source to LoadSizeType*. + if (Source1->getType() != LoadSizeType) + Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); + if (Source2->getType() != LoadSizeType) + Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); + + // Load LoadSizeType from the base address. + Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); + Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); + + if (DL.isLittleEndian() && Size != 1) { + Function *Bswap = Intrinsic::getDeclaration(CI->getModule(), + Intrinsic::bswap, LoadSizeType); + LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1); + LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2); + } + + // TODO: Instead of comparing ULT, just subtract and return the difference? + Value *CmpNE = Builder.CreateICmpNE(LoadSrc1, LoadSrc2); + Value *CmpULT = Builder.CreateICmpULT(LoadSrc1, LoadSrc2); + Type *I32 = Builder.getInt32Ty(); + Value *Sel1 = Builder.CreateSelect(CmpULT, ConstantInt::get(I32, -1), + ConstantInt::get(I32, 1)); + return Builder.CreateSelect(CmpNE, Sel1, ConstantInt::get(I32, 0)); +} + // This function expands the memcmp call into an inline expansion and returns // the memcmp result. Value *MemCmpExpansion::getMemCmpExpansion(uint64_t Size) { @@ -2114,6 +2133,10 @@ Value *MemCmpExpansion::getMemCmpExpansion(uint64_t Size) { return NumBlocks == 1 ? getMemCmpEqZeroOneBlock(Size) : getMemCmpExpansionZeroCase(Size); + // TODO: Handle more than one load pair per block in getMemCmpOneBlock(). + if (NumBlocks == 1 && NumLoadsPerBlock == 1) + return getMemCmpOneBlock(Size); + // This loop calls emitLoadCompareBlock for comparing Size bytes of the two // memcmp sources. It starts with loading using the maximum load size set by // the target. It processes any remaining bytes using a load size which is the @@ -2218,7 +2241,6 @@ Value *MemCmpExpansion::getMemCmpExpansion(uint64_t Size) { static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI, const TargetLowering *TLI, const DataLayout *DL) { NumMemCmpCalls++; - IRBuilder<> Builder(CI->getContext()); // TTI call to check if target would like to expand memcmp. Also, get the // MaxLoadSize. @@ -4378,14 +4400,16 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // If the real base value actually came from an inttoptr, then the matcher // will look through it and provide only the integer value. In that case, // use it here. 
- if (!ResultPtr && AddrMode.BaseReg) { - ResultPtr = - Builder.CreateIntToPtr(AddrMode.BaseReg, Addr->getType(), "sunkaddr"); - AddrMode.BaseReg = nullptr; - } else if (!ResultPtr && AddrMode.Scale == 1) { - ResultPtr = - Builder.CreateIntToPtr(AddrMode.ScaledReg, Addr->getType(), "sunkaddr"); - AddrMode.Scale = 0; + if (!DL->isNonIntegralPointerType(Addr->getType())) { + if (!ResultPtr && AddrMode.BaseReg) { + ResultPtr = Builder.CreateIntToPtr(AddrMode.BaseReg, Addr->getType(), + "sunkaddr"); + AddrMode.BaseReg = nullptr; + } else if (!ResultPtr && AddrMode.Scale == 1) { + ResultPtr = Builder.CreateIntToPtr(AddrMode.ScaledReg, Addr->getType(), + "sunkaddr"); + AddrMode.Scale = 0; + } } if (!ResultPtr && @@ -4466,6 +4490,19 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType()); } } else { + // We'd require a ptrtoint/inttoptr down the line, which we can't do for + // non-integral pointers, so in that case bail out now. + Type *BaseTy = AddrMode.BaseReg ? AddrMode.BaseReg->getType() : nullptr; + Type *ScaleTy = AddrMode.Scale ? AddrMode.ScaledReg->getType() : nullptr; + PointerType *BasePtrTy = dyn_cast_or_null<PointerType>(BaseTy); + PointerType *ScalePtrTy = dyn_cast_or_null<PointerType>(ScaleTy); + if (DL->isNonIntegralPointerType(Addr->getType()) || + (BasePtrTy && DL->isNonIntegralPointerType(BasePtrTy)) || + (ScalePtrTy && DL->isNonIntegralPointerType(ScalePtrTy)) || + (AddrMode.BaseGV && + DL->isNonIntegralPointerType(AddrMode.BaseGV->getType()))) + return false; + DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for " << *MemoryInst << "\n"); Type *IntPtrTy = DL->getIntPtrType(Addr->getType()); @@ -6367,7 +6404,7 @@ bool CodeGenPrepare::splitBranchCondition(Function &F) { } // Update PHI nodes in both successors. The original BB needs to be - // replaced in one succesor's PHI nodes, because the branch comes now from + // replaced in one successor's PHI nodes, because the branch comes now from // the newly generated BB (NewBB). In the other successor we need to add one // incoming edge to the PHI nodes, because both branch instructions target // now the same successor. 
Depending on the original branch condition diff --git a/lib/CodeGen/GlobalISel/IRTranslator.cpp b/lib/CodeGen/GlobalISel/IRTranslator.cpp index 239bad2f5355..521037f9d206 100644 --- a/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -1,4 +1,4 @@ -//===-- llvm/CodeGen/GlobalISel/IRTranslator.cpp - IRTranslator --*- C++ -*-==// +//===- llvm/CodeGen/GlobalISel/IRTranslator.cpp - IRTranslator ---*- C++ -*-==// // // The LLVM Compiler Infrastructure // @@ -11,34 +11,69 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/GlobalISel/IRTranslator.h" - +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/OptimizationDiagnosticInfo.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/GlobalISel/CallLowering.h" +#include "llvm/CodeGen/LowLevelType.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GetElementPtrTypeIterator.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" #include "llvm/IR/Type.h" +#include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/MC/MCContext.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CodeGen.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/LowLevelTypeImpl.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetIntrinsicInfo.h" #include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <iterator> +#include <string> +#include <utility> +#include <vector> #define DEBUG_TYPE "irtranslator" using namespace llvm; char IRTranslator::ID = 0; + INITIALIZE_PASS_BEGIN(IRTranslator, DEBUG_TYPE, "IRTranslator LLVM IR -> MI", false, false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) @@ -62,7 +97,7 @@ static void reportTranslationError(MachineFunction &MF, ORE.emit(R); } -IRTranslator::IRTranslator() : MachineFunctionPass(ID), MRI(nullptr) { +IRTranslator::IRTranslator() : MachineFunctionPass(ID) { initializeIRTranslatorPass(*PassRegistry::getPassRegistry()); } @@ -71,7 +106,6 @@ void IRTranslator::getAnalysisUsage(AnalysisUsage &AU) const { MachineFunctionPass::getAnalysisUsage(AU); } - unsigned IRTranslator::getOrCreateVReg(const Value &Val) { unsigned &ValReg = ValToVReg[&Val]; @@ -686,6 +720,26 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, .addUse(getOrCreateVReg(*CI.getArgOperand(0))) 
.addUse(getOrCreateVReg(*CI.getArgOperand(1))); return true; + case Intrinsic::exp: + MIRBuilder.buildInstr(TargetOpcode::G_FEXP) + .addDef(getOrCreateVReg(CI)) + .addUse(getOrCreateVReg(*CI.getArgOperand(0))); + return true; + case Intrinsic::exp2: + MIRBuilder.buildInstr(TargetOpcode::G_FEXP2) + .addDef(getOrCreateVReg(CI)) + .addUse(getOrCreateVReg(*CI.getArgOperand(0))); + return true; + case Intrinsic::log: + MIRBuilder.buildInstr(TargetOpcode::G_FLOG) + .addDef(getOrCreateVReg(CI)) + .addUse(getOrCreateVReg(*CI.getArgOperand(0))); + return true; + case Intrinsic::log2: + MIRBuilder.buildInstr(TargetOpcode::G_FLOG2) + .addDef(getOrCreateVReg(CI)) + .addUse(getOrCreateVReg(*CI.getArgOperand(0))); + return true; case Intrinsic::fma: MIRBuilder.buildInstr(TargetOpcode::G_FMA) .addDef(getOrCreateVReg(CI)) @@ -834,7 +888,6 @@ bool IRTranslator::translateInvoke(const User &U, if (!isa<LandingPadInst>(EHPadBB->front())) return false; - // Emit the actual call, bracketed by EH_LABELs so that the MF knows about // the region covered by the try. MCSymbol *BeginSymbol = Context.createTempSymbol(); @@ -1195,7 +1248,7 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) { MRI = &MF->getRegInfo(); DL = &F.getParent()->getDataLayout(); TPC = &getAnalysis<TargetPassConfig>(); - ORE = make_unique<OptimizationRemarkEmitter>(&F); + ORE = llvm::make_unique<OptimizationRemarkEmitter>(&F); assert(PendingPHIs.empty() && "stale PHIs"); diff --git a/lib/CodeGen/GlobalISel/InstructionSelector.cpp b/lib/CodeGen/GlobalISel/InstructionSelector.cpp index 5466efd7e90f..860fc9a4f8b6 100644 --- a/lib/CodeGen/GlobalISel/InstructionSelector.cpp +++ b/lib/CodeGen/GlobalISel/InstructionSelector.cpp @@ -1,4 +1,4 @@ -//===- llvm/CodeGen/GlobalISel/InstructionSelector.cpp -----------*- C++ -*-==// +//===- llvm/CodeGen/GlobalISel/InstructionSelector.cpp --------------------===// // // The LLVM Compiler Infrastructure // @@ -11,19 +11,22 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" -#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/Constants.h" -#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetRegisterInfo.h" +#include <cassert> #define DEBUG_TYPE "instructionselector" using namespace llvm; -InstructionSelector::InstructionSelector() {} +InstructionSelector::InstructionSelector() = default; bool InstructionSelector::constrainOperandRegToRegClass( MachineInstr &I, unsigned OpIdx, const TargetRegisterClass &RC, @@ -33,8 +36,8 @@ bool InstructionSelector::constrainOperandRegToRegClass( MachineFunction &MF = *MBB.getParent(); MachineRegisterInfo &MRI = MF.getRegInfo(); - return llvm::constrainRegToClass(MRI, TII, RBI, I, - I.getOperand(OpIdx).getReg(), RC); + return + constrainRegToClass(MRI, TII, RBI, I, I.getOperand(OpIdx).getReg(), RC); } bool InstructionSelector::constrainSelectedInstRegOperands( @@ -84,7 +87,6 @@ bool InstructionSelector::constrainSelectedInstRegOperands( bool InstructionSelector::isOperandImmEqual( const MachineOperand &MO, int64_t Value, const MachineRegisterInfo &MRI) const { - if (MO.isReg() && 
MO.getReg()) if (auto VRegVal = getConstantVRegVal(MO.getReg(), MRI)) return *VRegVal == Value; diff --git a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 1d0d3dffa4c5..84b0a0ac4157 100644 --- a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -158,7 +158,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI, unsigned TypeIdx, LLT NarrowTy) { // FIXME: Don't know how to handle secondary types yet. - if (TypeIdx != 0) + if (TypeIdx != 0 && MI.getOpcode() != TargetOpcode::G_EXTRACT) return UnableToLegalize; MIRBuilder.setInstr(MI); @@ -166,6 +166,20 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI, switch (MI.getOpcode()) { default: return UnableToLegalize; + case TargetOpcode::G_IMPLICIT_DEF: { + int NumParts = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / + NarrowTy.getSizeInBits(); + + SmallVector<unsigned, 2> DstRegs; + for (int i = 0; i < NumParts; ++i) { + unsigned Dst = MRI.createGenericVirtualRegister(NarrowTy); + MIRBuilder.buildUndef(Dst); + DstRegs.push_back(Dst); + } + MIRBuilder.buildMerge(MI.getOperand(0).getReg(), DstRegs); + MI.eraseFromParent(); + return Legalized; + } case TargetOpcode::G_ADD: { // Expand in terms of carry-setting/consuming G_ADDE instructions. int NumParts = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / @@ -193,6 +207,58 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI, MI.eraseFromParent(); return Legalized; } + case TargetOpcode::G_EXTRACT: { + if (TypeIdx != 1) + return UnableToLegalize; + + int64_t NarrowSize = NarrowTy.getSizeInBits(); + int NumParts = + MRI.getType(MI.getOperand(1).getReg()).getSizeInBits() / NarrowSize; + + SmallVector<unsigned, 2> SrcRegs, DstRegs; + SmallVector<uint64_t, 2> Indexes; + extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs); + + unsigned OpReg = MI.getOperand(0).getReg(); + int64_t OpStart = MI.getOperand(2).getImm(); + int64_t OpSize = MRI.getType(OpReg).getSizeInBits(); + for (int i = 0; i < NumParts; ++i) { + unsigned SrcStart = i * NarrowSize; + + if (SrcStart + NarrowSize <= OpStart || SrcStart >= OpStart + OpSize) { + // No part of the extract uses this subregister, ignore it. + continue; + } else if (SrcStart == OpStart && NarrowTy == MRI.getType(OpReg)) { + // The entire subregister is extracted, forward the value. + DstRegs.push_back(SrcRegs[i]); + continue; + } + + // OpSegStart is where this destination segment would start in OpReg if it + // extended infinitely in both directions. + int64_t ExtractOffset, SegSize; + if (OpStart < SrcStart) { + ExtractOffset = 0; + SegSize = std::min(NarrowSize, OpStart + OpSize - SrcStart); + } else { + ExtractOffset = OpStart - SrcStart; + SegSize = std::min(SrcStart + NarrowSize - OpStart, OpSize); + } + + unsigned SegReg = SrcRegs[i]; + if (ExtractOffset != 0 || SegSize != NarrowSize) { + // A genuine extract is needed. 
+ SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize)); + MIRBuilder.buildExtract(SegReg, SrcRegs[i], ExtractOffset); + } + + DstRegs.push_back(SegReg); + } + + MIRBuilder.buildMerge(MI.getOperand(0).getReg(), DstRegs); + MI.eraseFromParent(); + return Legalized; + } case TargetOpcode::G_INSERT: { if (TypeIdx != 0) return UnableToLegalize; diff --git a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp index 595802f2228b..76917aa9660d 100644 --- a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp +++ b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp @@ -1,4 +1,4 @@ -//===---- lib/CodeGen/GlobalISel/LegalizerInfo.cpp - Legalizer -------==// +//===- lib/CodeGen/GlobalISel/LegalizerInfo.cpp - Legalizer ---------------===// // // The LLVM Compiler Infrastructure // @@ -18,16 +18,25 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" - #include "llvm/ADT/SmallBitVector.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/ValueTypes.h" -#include "llvm/IR/Type.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/LowLevelTypeImpl.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetOpcodes.h" +#include <algorithm> +#include <cassert> +#include <tuple> +#include <utility> + using namespace llvm; -LegalizerInfo::LegalizerInfo() : TablesInitialized(false) { +LegalizerInfo::LegalizerInfo() { + DefaultActions[TargetOpcode::G_IMPLICIT_DEF] = NarrowScalar; + // FIXME: these two can be legalized to the fundamental load/store Jakob // proposed. Once loads & stores are supported. DefaultActions[TargetOpcode::G_ANYEXT] = Legal; @@ -42,6 +51,7 @@ LegalizerInfo::LegalizerInfo() : TablesInitialized(false) { DefaultActions[TargetOpcode::G_BRCOND] = WidenScalar; DefaultActions[TargetOpcode::G_INSERT] = NarrowScalar; + DefaultActions[TargetOpcode::G_EXTRACT] = NarrowScalar; DefaultActions[TargetOpcode::G_FNEG] = Lower; } @@ -75,8 +85,7 @@ LegalizerInfo::getAction(const InstrAspect &Aspect) const { // FIXME: the long-term plan calls for expansion in terms of load/store (if // they're not legal). - if (Aspect.Opcode == TargetOpcode::G_EXTRACT || - Aspect.Opcode == TargetOpcode::G_MERGE_VALUES || + if (Aspect.Opcode == TargetOpcode::G_MERGE_VALUES || Aspect.Opcode == TargetOpcode::G_UNMERGE_VALUES) return std::make_pair(Legal, Aspect.Type); @@ -172,21 +181,21 @@ Optional<LLT> LegalizerInfo::findLegalType(const InstrAspect &Aspect, case Custom: return Aspect.Type; case NarrowScalar: { - return findLegalType(Aspect, - [](LLT Ty) -> LLT { return Ty.halfScalarSize(); }); + return findLegalizableSize( + Aspect, [&](LLT Ty) -> LLT { return Ty.halfScalarSize(); }); } case WidenScalar: { - return findLegalType(Aspect, [](LLT Ty) -> LLT { + return findLegalizableSize(Aspect, [&](LLT Ty) -> LLT { return Ty.getSizeInBits() < 8 ? 
LLT::scalar(8) : Ty.doubleScalarSize(); }); } case FewerElements: { - return findLegalType(Aspect, - [](LLT Ty) -> LLT { return Ty.halfElements(); }); + return findLegalizableSize( + Aspect, [&](LLT Ty) -> LLT { return Ty.halfElements(); }); } case MoreElements: { - return findLegalType(Aspect, - [](LLT Ty) -> LLT { return Ty.doubleElements(); }); + return findLegalizableSize( + Aspect, [&](LLT Ty) -> LLT { return Ty.doubleElements(); }); } } } diff --git a/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp index 3c70013ea296..47c6214c0552 100644 --- a/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp +++ b/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp @@ -264,10 +264,13 @@ MachineInstrBuilder MachineIRBuilder::buildBr(MachineBasicBlock &Dest) { } MachineInstrBuilder MachineIRBuilder::buildBrIndirect(unsigned Tgt) { + assert(MRI->getType(Tgt).isPointer() && "invalid branch destination"); return buildInstr(TargetOpcode::G_BRINDIRECT).addUse(Tgt); } MachineInstrBuilder MachineIRBuilder::buildCopy(unsigned Res, unsigned Op) { + assert(MRI->getType(Res) == LLT() || MRI->getType(Op) == LLT() || + MRI->getType(Res) == MRI->getType(Op)); return buildInstr(TargetOpcode::COPY).addDef(Res).addUse(Op); } @@ -364,27 +367,36 @@ MachineInstrBuilder MachineIRBuilder::buildZExt(unsigned Res, unsigned Op) { MachineInstrBuilder MachineIRBuilder::buildSExtOrTrunc(unsigned Res, unsigned Op) { + assert(MRI->getType(Res).isScalar() || MRI->getType(Res).isVector()); + assert(MRI->getType(Res).isScalar() == MRI->getType(Op).isScalar()); + unsigned Opcode = TargetOpcode::COPY; if (MRI->getType(Res).getSizeInBits() > MRI->getType(Op).getSizeInBits()) Opcode = TargetOpcode::G_SEXT; else if (MRI->getType(Res).getSizeInBits() < MRI->getType(Op).getSizeInBits()) Opcode = TargetOpcode::G_TRUNC; + else + assert(MRI->getType(Res) == MRI->getType(Op)); return buildInstr(Opcode).addDef(Res).addUse(Op); } MachineInstrBuilder MachineIRBuilder::buildZExtOrTrunc(unsigned Res, unsigned Op) { + assert(MRI->getType(Res).isScalar() || MRI->getType(Res).isVector()); + assert(MRI->getType(Res).isScalar() == MRI->getType(Op).isScalar()); + unsigned Opcode = TargetOpcode::COPY; if (MRI->getType(Res).getSizeInBits() > MRI->getType(Op).getSizeInBits()) Opcode = TargetOpcode::G_ZEXT; else if (MRI->getType(Res).getSizeInBits() < MRI->getType(Op).getSizeInBits()) Opcode = TargetOpcode::G_TRUNC; + else + assert(MRI->getType(Res) == MRI->getType(Op)); return buildInstr(Opcode).addDef(Res).addUse(Op); } - MachineInstrBuilder MachineIRBuilder::buildCast(unsigned Dst, unsigned Src) { LLT SrcTy = MRI->getType(Src); LLT DstTy = MRI->getType(Dst); @@ -466,7 +478,7 @@ void MachineIRBuilder::buildSequence(unsigned Res, ArrayRef<unsigned> Ops, } MachineInstrBuilder MachineIRBuilder::buildUndef(unsigned Res) { - return buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(Res); + return buildInstr(TargetOpcode::G_IMPLICIT_DEF).addDef(Res); } MachineInstrBuilder MachineIRBuilder::buildMerge(unsigned Res, @@ -482,6 +494,9 @@ MachineInstrBuilder MachineIRBuilder::buildMerge(unsigned Res, "input operands do not cover output register"); #endif + if (Ops.size() == 1) + return buildCast(Res, Ops[0]); + MachineInstrBuilder MIB = buildInstr(TargetOpcode::G_MERGE_VALUES); MIB.addDef(Res); for (unsigned i = 0; i < Ops.size(); ++i) @@ -511,8 +526,11 @@ MachineInstrBuilder MachineIRBuilder::buildUnmerge(ArrayRef<unsigned> Res, MachineInstrBuilder MachineIRBuilder::buildInsert(unsigned Res, unsigned Src, unsigned Op, unsigned Index) { + 
assert(Index + MRI->getType(Op).getSizeInBits() <= + MRI->getType(Res).getSizeInBits() && + "insertion past the end of a register"); + if (MRI->getType(Res).getSizeInBits() == MRI->getType(Op).getSizeInBits()) { - assert(Index == 0 && "insertion past the end of a register"); return buildCast(Res, Op); } diff --git a/lib/CodeGen/GlobalISel/RegBankSelect.cpp b/lib/CodeGen/GlobalISel/RegBankSelect.cpp index 2eb3cdee694d..677941dbbf6d 100644 --- a/lib/CodeGen/GlobalISel/RegBankSelect.cpp +++ b/lib/CodeGen/GlobalISel/RegBankSelect.cpp @@ -1,4 +1,4 @@ -//===- llvm/CodeGen/GlobalISel/RegBankSelect.cpp - RegBankSelect -*- C++ -*-==// +//==- llvm/CodeGen/GlobalISel/RegBankSelect.cpp - RegBankSelect --*- C++ -*-==// // // The LLVM Compiler Infrastructure // @@ -12,18 +12,39 @@ #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" #include "llvm/CodeGen/GlobalISel/RegisterBank.h" +#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Function.h" +#include "llvm/IR/Attributes.h" +#include "llvm/Pass.h" #include "llvm/Support/BlockFrequency.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetOpcodes.h" +#include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <limits> +#include <memory> +#include <utility> #define DEBUG_TYPE "regbankselect" @@ -37,6 +58,7 @@ static cl::opt<RegBankSelect::Mode> RegBankSelectMode( "Use the Greedy mode (best local mapping)"))); char RegBankSelect::ID = 0; + INITIALIZE_PASS_BEGIN(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false); @@ -48,8 +70,7 @@ INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, false) RegBankSelect::RegBankSelect(Mode RunningMode) - : MachineFunctionPass(ID), RBI(nullptr), MRI(nullptr), TRI(nullptr), - MBFI(nullptr), MBPI(nullptr), OptMode(RunningMode) { + : MachineFunctionPass(ID), OptMode(RunningMode) { initializeRegBankSelectPass(*PassRegistry::getPassRegistry()); if (RegBankSelectMode.getNumOccurrences() != 0) { OptMode = RegBankSelectMode; @@ -72,7 +93,7 @@ void RegBankSelect::init(MachineFunction &MF) { MBPI = nullptr; } MIRBuilder.setMF(MF); - MORE = make_unique<MachineOptimizationRemarkEmitter>(MF, MBFI); + MORE = llvm::make_unique<MachineOptimizationRemarkEmitter>(MF, MBFI); } void RegBankSelect::getAnalysisUsage(AnalysisUsage &AU) const { @@ -133,9 +154,11 @@ bool RegBankSelect::repairReg( TargetRegisterInfo::isPhysicalRegister(Dst)) && "We are about to create several defs for Dst"); - // Build the instruction used to repair, then clone it at the right places. 
- MachineInstr *MI = MIRBuilder.buildCopy(Dst, Src); - MI->removeFromParent(); + // Build the instruction used to repair, then clone it at the right + // places. Avoiding buildCopy bypasses the check that Src and Dst have the + // same types because the type is a placeholder when this function is called. + MachineInstr *MI = + MIRBuilder.buildInstrNoInsert(TargetOpcode::COPY).addDef(Dst).addUse(Src); DEBUG(dbgs() << "Copy: " << PrintReg(Src) << " to: " << PrintReg(Dst) << '\n'); // TODO: @@ -202,11 +225,11 @@ uint64_t RegBankSelect::getRepairCost( RBI->copyCost(*DesiredRegBrank, *CurRegBank, RegisterBankInfo::getSizeInBits(MO.getReg(), *MRI, *TRI)); // TODO: use a dedicated constant for ImpossibleCost. - if (Cost != UINT_MAX) + if (Cost != std::numeric_limits<unsigned>::max()) return Cost; // Return the legalization cost of that repairing. } - return UINT_MAX; + return std::numeric_limits<unsigned>::max(); } const RegisterBankInfo::InstructionMapping &RegBankSelect::findBestMapping( @@ -352,7 +375,7 @@ void RegBankSelect::tryAvoidingSplit( // the repairing cost because of the PHIs already proceeded // as already stated. // Though the code will be correct. - assert(0 && "Repairing cost may not be accurate"); + assert(false && "Repairing cost may not be accurate"); } else { // We need to do non-local repairing. Basically, patch all // the uses (i.e., phis) that we already proceeded. @@ -450,7 +473,7 @@ RegBankSelect::MappingCost RegBankSelect::computeMapping( uint64_t RepairCost = getRepairCost(MO, ValMapping); // This is an impossible to repair cost. - if (RepairCost == UINT_MAX) + if (RepairCost == std::numeric_limits<unsigned>::max()) continue; // Bias used for splitting: 5%. @@ -535,9 +558,11 @@ bool RegBankSelect::applyMapping( llvm_unreachable("Other kind should not happen"); } } + // Second, rewrite the instruction. DEBUG(dbgs() << "Actual mapping of the operands: " << OpdMapper << '\n'); RBI->applyMapping(OpdMapper); + return true; } @@ -638,11 +663,8 @@ RegBankSelect::RepairingPlacement::RepairingPlacement( MachineInstr &MI, unsigned OpIdx, const TargetRegisterInfo &TRI, Pass &P, RepairingPlacement::RepairingKind Kind) // Default is, we are going to insert code to repair OpIdx. - : Kind(Kind), - OpIdx(OpIdx), - CanMaterialize(Kind != RepairingKind::Impossible), - HasSplit(false), - P(P) { + : Kind(Kind), OpIdx(OpIdx), + CanMaterialize(Kind != RepairingKind::Impossible), P(P) { const MachineOperand &MO = MI.getOperand(OpIdx); assert(MO.isReg() && "Trying to repair a non-reg operand"); @@ -847,7 +869,7 @@ bool RegBankSelect::EdgeInsertPoint::canMaterialize() const { } RegBankSelect::MappingCost::MappingCost(const BlockFrequency &LocalFreq) - : LocalCost(0), NonLocalCost(0), LocalFreq(LocalFreq.getFrequency()) {} + : LocalFreq(LocalFreq.getFrequency()) {} bool RegBankSelect::MappingCost::addLocalCost(uint64_t Cost) { // Check if this overflows. @@ -920,7 +942,6 @@ bool RegBankSelect::MappingCost::operator<(const MappingCost &Cost) const { OtherLocalAdjust = Cost.LocalCost - LocalCost; else ThisLocalAdjust = LocalCost - Cost.LocalCost; - } else { ThisLocalAdjust = LocalCost; OtherLocalAdjust = Cost.LocalCost; diff --git a/lib/CodeGen/LiveRangeCalc.cpp b/lib/CodeGen/LiveRangeCalc.cpp index 398066bf8903..8c43c9f3f884 100644 --- a/lib/CodeGen/LiveRangeCalc.cpp +++ b/lib/CodeGen/LiveRangeCalc.cpp @@ -20,11 +20,14 @@ using namespace llvm; #define DEBUG_TYPE "regalloc" +// Reserve an address that indicates a value that is known to be "undef". 
+static VNInfo UndefVNI(0xbad, SlotIndex()); + void LiveRangeCalc::resetLiveOutMap() { unsigned NumBlocks = MF->getNumBlockIDs(); Seen.clear(); Seen.resize(NumBlocks); - EntryInfoMap.clear(); + EntryInfos.clear(); Map.resize(NumBlocks); } @@ -283,8 +286,11 @@ bool LiveRangeCalc::isDefOnEntry(LiveRange &LR, ArrayRef<SlotIndex> Undefs, // Determine if the exit from the block is reached by some def. unsigned N = WorkList[i]; MachineBasicBlock &B = *MF->getBlockNumbered(N); - if (Seen[N] && Map[&B].first != nullptr) - return MarkDefined(B); + if (Seen[N]) { + const LiveOutPair &LOB = Map[&B]; + if (LOB.first != nullptr && LOB.first != &UndefVNI) + return MarkDefined(B); + } SlotIndex Begin, End; std::tie(Begin, End) = Indexes->getMBBRange(&B); // Treat End as not belonging to B. @@ -365,10 +371,7 @@ bool LiveRangeCalc::findReachingDefs(LiveRange &LR, MachineBasicBlock &UseMBB, #endif FoundUndef |= MBB->pred_empty(); - for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(), - PE = MBB->pred_end(); PI != PE; ++PI) { - MachineBasicBlock *Pred = *PI; - + for (MachineBasicBlock *Pred : MBB->predecessors()) { // Is this a known live-out block? if (Seen.test(Pred->getNumber())) { if (VNInfo *VNI = Map[Pred].first) { @@ -387,7 +390,7 @@ bool LiveRangeCalc::findReachingDefs(LiveRange &LR, MachineBasicBlock &UseMBB, auto EP = LR.extendInBlock(Undefs, Start, End); VNInfo *VNI = EP.first; FoundUndef |= EP.second; - setLiveOutValue(Pred, VNI); + setLiveOutValue(Pred, EP.second ? &UndefVNI : VNI); if (VNI) { if (TheVNI && TheVNI != VNI) UniqueVNI = false; @@ -406,7 +409,7 @@ bool LiveRangeCalc::findReachingDefs(LiveRange &LR, MachineBasicBlock &UseMBB, } LiveIn.clear(); - FoundUndef |= (TheVNI == nullptr); + FoundUndef |= (TheVNI == nullptr || TheVNI == &UndefVNI); if (Undefs.size() > 0 && FoundUndef) UniqueVNI = false; @@ -417,7 +420,7 @@ bool LiveRangeCalc::findReachingDefs(LiveRange &LR, MachineBasicBlock &UseMBB, // If a unique reaching def was found, blit in the live ranges immediately. if (UniqueVNI) { - assert(TheVNI != nullptr); + assert(TheVNI != nullptr && TheVNI != &UndefVNI); LiveRangeUpdater Updater(&LR); for (unsigned BN : WorkList) { SlotIndex Start, End; @@ -433,22 +436,26 @@ bool LiveRangeCalc::findReachingDefs(LiveRange &LR, MachineBasicBlock &UseMBB, } // Prepare the defined/undefined bit vectors. - auto EF = EntryInfoMap.find(&LR); - if (EF == EntryInfoMap.end()) { + EntryInfoMap::iterator Entry; + bool DidInsert; + std::tie(Entry, DidInsert) = EntryInfos.insert( + std::make_pair(&LR, std::make_pair(BitVector(), BitVector()))); + if (DidInsert) { + // Initialize newly inserted entries. unsigned N = MF->getNumBlockIDs(); - EF = EntryInfoMap.insert({&LR, {BitVector(), BitVector()}}).first; - EF->second.first.resize(N); - EF->second.second.resize(N); + Entry->second.first.resize(N); + Entry->second.second.resize(N); } - BitVector &DefOnEntry = EF->second.first; - BitVector &UndefOnEntry = EF->second.second; + BitVector &DefOnEntry = Entry->second.first; + BitVector &UndefOnEntry = Entry->second.second; // Multiple values were found, so transfer the work list to the LiveIn array // where UpdateSSA will use it as a work list. 
LiveIn.reserve(WorkList.size()); for (unsigned BN : WorkList) { MachineBasicBlock *MBB = MF->getBlockNumbered(BN); - if (Undefs.size() > 0 && !isDefOnEntry(LR, Undefs, *MBB, DefOnEntry, UndefOnEntry)) + if (Undefs.size() > 0 && + !isDefOnEntry(LR, Undefs, *MBB, DefOnEntry, UndefOnEntry)) continue; addLiveInBlock(LR, DomTree->getNode(MBB)); if (MBB == &UseMBB) @@ -466,9 +473,9 @@ void LiveRangeCalc::updateSSA() { assert(DomTree && "Missing dominator tree"); // Interate until convergence. - unsigned Changes; + bool Changed; do { - Changes = 0; + Changed = false; // Propagate live-out values down the dominator tree, inserting phi-defs // when necessary. for (LiveInBlock &I : LiveIn) { @@ -491,15 +498,20 @@ void LiveRangeCalc::updateSSA() { IDomValue = Map[IDom->getBlock()]; // Cache the DomTree node that defined the value. - if (IDomValue.first && !IDomValue.second) + if (IDomValue.first && IDomValue.first != &UndefVNI && + !IDomValue.second) { Map[IDom->getBlock()].second = IDomValue.second = DomTree->getNode(Indexes->getMBBFromIndex(IDomValue.first->def)); + } - for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(), - PE = MBB->pred_end(); PI != PE; ++PI) { - LiveOutPair &Value = Map[*PI]; + for (MachineBasicBlock *Pred : MBB->predecessors()) { + LiveOutPair &Value = Map[Pred]; if (!Value.first || Value.first == IDomValue.first) continue; + if (Value.first == &UndefVNI) { + needPHI = true; + break; + } // Cache the DomTree node that defined the value. if (!Value.second) @@ -523,7 +535,7 @@ void LiveRangeCalc::updateSSA() { // Create a phi-def if required. if (needPHI) { - ++Changes; + Changed = true; assert(Alloc && "Need VNInfo allocator to create PHI-defs"); SlotIndex Start, End; std::tie(Start, End) = Indexes->getMBBRange(MBB); @@ -542,7 +554,7 @@ void LiveRangeCalc::updateSSA() { LR.addSegment(LiveInterval::Segment(Start, End, VNI)); LOP = LiveOutPair(VNI, Node); } - } else if (IDomValue.first) { + } else if (IDomValue.first && IDomValue.first != &UndefVNI) { // No phi-def here. Remember incoming value. I.Value = IDomValue.first; @@ -554,9 +566,9 @@ void LiveRangeCalc::updateSSA() { // MBB is live-out and doesn't define its own value. if (LOP.first == IDomValue.first) continue; - ++Changes; + Changed = true; LOP = IDomValue; } } - } while (Changes); + } while (Changed); } diff --git a/lib/CodeGen/LiveRangeCalc.h b/lib/CodeGen/LiveRangeCalc.h index 1a7598f8044a..d41b782d9bdf 100644 --- a/lib/CodeGen/LiveRangeCalc.h +++ b/lib/CodeGen/LiveRangeCalc.h @@ -24,6 +24,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/IndexedMap.h" #include "llvm/CodeGen/LiveInterval.h" @@ -65,7 +66,8 @@ class LiveRangeCalc { /// registers do not overlap), but the defined/undefined information must /// be kept separate for each individual range. /// By convention, EntryInfoMap[&LR] = { Defined, Undefined }. - std::map<LiveRange*,std::pair<BitVector,BitVector>> EntryInfoMap; + typedef DenseMap<LiveRange*,std::pair<BitVector,BitVector>> EntryInfoMap; + EntryInfoMap EntryInfos; /// Map each basic block where a live range is live out to the live-out value /// and its defining block. 
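The LiveRangeCalc change above distinguishes three states for a block's live-out value: nothing known yet, a real defining value, and "known undef", where the last state is encoded as the address of a reserved static VNInfo. What follows is a minimal, hedged sketch of that sentinel-address pattern; it is standalone plain C++ with illustrative names (Value, UndefSentinel, LiveOut), not code from this patch.

#include <cstdio>
#include <map>

struct Value { int Id; };

// A static object whose address can never collide with a real value, so a
// pointer map can distinguish "unknown" (null/absent), "known undef"
// (&UndefSentinel), and "defined" (anything else) without an extra flag.
static Value UndefSentinel{-1};

static const char *classify(const Value *V) {
  if (!V)
    return "unknown";
  if (V == &UndefSentinel)
    return "undef";
  return "defined";
}

int main() {
  std::map<int, Value *> LiveOut; // block number -> live-out value
  Value V0{0};
  LiveOut[1] = &V0;              // block 1 has a real live-out value
  LiveOut[2] = &UndefSentinel;   // block 2 is known to be undef on exit
                                 // block 3 is absent: nothing is known yet
  for (int BB : {1, 2, 3}) {
    Value *V = LiveOut.count(BB) ? LiveOut[BB] : nullptr;
    std::printf("BB%d: %s\n", BB, classify(V));
  }
  return 0;
}

The benefit of the sentinel over a separate boolean is that every existing pointer comparison keeps working; only the places that must treat "undef" specially need to test against the reserved address.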
diff --git a/lib/CodeGen/MIRParser/MIParser.cpp b/lib/CodeGen/MIRParser/MIParser.cpp index f58d1f8b83ae..c58d192284dd 100644 --- a/lib/CodeGen/MIRParser/MIParser.cpp +++ b/lib/CodeGen/MIRParser/MIParser.cpp @@ -579,12 +579,12 @@ bool MIParser::parseBasicBlock(MachineBasicBlock &MBB, // // is equivalent to // liveins: %edi, %esi - bool ExplicitSuccesors = false; + bool ExplicitSuccessors = false; while (true) { if (Token.is(MIToken::kw_successors)) { if (parseBasicBlockSuccessors(MBB)) return true; - ExplicitSuccesors = true; + ExplicitSuccessors = true; } else if (Token.is(MIToken::kw_liveins)) { if (parseBasicBlockLiveins(MBB)) return true; @@ -636,7 +636,7 @@ bool MIParser::parseBasicBlock(MachineBasicBlock &MBB, } // Construct successor list by searching for basic block machine operands. - if (!ExplicitSuccesors) { + if (!ExplicitSuccessors) { SmallVector<MachineBasicBlock*,4> Successors; bool IsFallthrough; guessSuccessors(MBB, Successors, IsFallthrough); diff --git a/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp b/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp index 6b6b5f2814a9..73c3428a6e53 100644 --- a/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp +++ b/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp @@ -52,6 +52,14 @@ void MachineOptimizationRemarkEmitter::emit( computeHotness(OptDiag); LLVMContext &Ctx = MF.getFunction()->getContext(); + + // If a diagnostic has a hotness value, then only emit it if its hotness + // meets the threshold. + if (OptDiag.getHotness() && + *OptDiag.getHotness() < Ctx.getDiagnosticsHotnessThreshold()) { + return; + } + yaml::Output *Out = Ctx.getDiagnosticsOutputFile(); if (Out) { auto *P = &const_cast<DiagnosticInfoOptimizationBase &>(OptDiagCommon); @@ -73,7 +81,7 @@ bool MachineOptimizationRemarkEmitterPass::runOnMachineFunction( MachineFunction &MF) { MachineBlockFrequencyInfo *MBFI; - if (MF.getFunction()->getContext().getDiagnosticHotnessRequested()) + if (MF.getFunction()->getContext().getDiagnosticsHotnessRequested()) MBFI = &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI(); else MBFI = nullptr; diff --git a/lib/CodeGen/MacroFusion.cpp b/lib/CodeGen/MacroFusion.cpp index 45ea0e4c39ab..5e279b065bbd 100644 --- a/lib/CodeGen/MacroFusion.cpp +++ b/lib/CodeGen/MacroFusion.cpp @@ -1,4 +1,4 @@ -//===- MacroFusion.cpp - Macro Fusion ----------------------===// +//===- MacroFusion.cpp - Macro Fusion -------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -13,8 +13,15 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/MacroFusion.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/CodeGen/ScheduleDAGMutation.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #define DEBUG_TYPE "misched" @@ -26,8 +33,6 @@ using namespace llvm; static cl::opt<bool> EnableMacroFusion("misched-fusion", cl::Hidden, cl::desc("Enable scheduling for macro fusion."), cl::init(true)); -namespace { - static void fuseInstructionPair(ScheduleDAGMI &DAG, SUnit &FirstSU, SUnit &SecondSU) { // Create a single weak edge between the adjacent instrs. 
The only effect is @@ -66,6 +71,7 @@ static void fuseInstructionPair(ScheduleDAGMI &DAG, SUnit &FirstSU, ++NumFused; } +namespace { /// \brief Post-process the DAG to create cluster edges between instrs that may /// be fused by the processor into a single operation. @@ -81,6 +87,8 @@ public: void apply(ScheduleDAGInstrs *DAGInstrs) override; }; +} // end anonymous namespace + void MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) { ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs); @@ -128,23 +136,18 @@ bool MacroFusion::scheduleAdjacentImpl(ScheduleDAGMI &DAG, SUnit &AnchorSU) { return false; } -} // end anonymous namespace - - -namespace llvm { - std::unique_ptr<ScheduleDAGMutation> -createMacroFusionDAGMutation(ShouldSchedulePredTy shouldScheduleAdjacent) { +llvm::createMacroFusionDAGMutation( + ShouldSchedulePredTy shouldScheduleAdjacent) { if(EnableMacroFusion) return llvm::make_unique<MacroFusion>(shouldScheduleAdjacent, true); return nullptr; } std::unique_ptr<ScheduleDAGMutation> -createBranchMacroFusionDAGMutation(ShouldSchedulePredTy shouldScheduleAdjacent) { +llvm::createBranchMacroFusionDAGMutation( + ShouldSchedulePredTy shouldScheduleAdjacent) { if(EnableMacroFusion) return llvm::make_unique<MacroFusion>(shouldScheduleAdjacent, false); return nullptr; } - -} // end namespace llvm diff --git a/lib/CodeGen/PeepholeOptimizer.cpp b/lib/CodeGen/PeepholeOptimizer.cpp index da8fac6d3834..b13f6b68c420 100644 --- a/lib/CodeGen/PeepholeOptimizer.cpp +++ b/lib/CodeGen/PeepholeOptimizer.cpp @@ -76,6 +76,7 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" @@ -119,6 +120,14 @@ static cl::opt<unsigned> RewritePHILimit( "rewrite-phi-limit", cl::Hidden, cl::init(10), cl::desc("Limit the length of PHI chains to lookup")); +// Limit the length of recurrence chain when evaluating the benefit of +// commuting operands. +static cl::opt<unsigned> MaxRecurrenceChain( + "recurrence-chain-limit", cl::Hidden, cl::init(3), + cl::desc("Maximum length of recurrence chain when evaluating the benefit " + "of commuting operands")); + + STATISTIC(NumReuse, "Number of extension results reused"); STATISTIC(NumCmps, "Number of compares eliminated"); STATISTIC(NumImmFold, "Number of move immediate folded"); @@ -131,12 +140,14 @@ STATISTIC(NumNAPhysCopies, "Number of non-allocatable physical copies removed"); namespace { class ValueTrackerResult; + class RecurrenceInstr; class PeepholeOptimizer : public MachineFunctionPass { const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; MachineRegisterInfo *MRI; MachineDominatorTree *DT; // Machine dominator tree + MachineLoopInfo *MLI; public: static char ID; // Pass identification @@ -150,6 +161,8 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); + AU.addRequired<MachineLoopInfo>(); + AU.addPreserved<MachineLoopInfo>(); if (Aggressive) { AU.addRequired<MachineDominatorTree>(); AU.addPreserved<MachineDominatorTree>(); @@ -160,6 +173,9 @@ namespace { typedef SmallDenseMap<TargetInstrInfo::RegSubRegPair, ValueTrackerResult> RewriteMapTy; + /// \brief Sequence of instructions that formulate recurrence cycle. 
+  typedef SmallVector<RecurrenceInstr, 4> RecurrenceCycle;
+
 private:
   bool optimizeCmpInstr(MachineInstr *MI, MachineBasicBlock *MBB);
   bool optimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB,
@@ -170,6 +186,7 @@ namespace {
     bool optimizeCoalescableCopy(MachineInstr *MI);
     bool optimizeUncoalescableCopy(MachineInstr *MI,
                                    SmallPtrSetImpl<MachineInstr *> &LocalMIs);
+    bool optimizeRecurrence(MachineInstr &PHI);
     bool findNextSource(unsigned Reg, unsigned SubReg,
                         RewriteMapTy &RewriteMap);
     bool isMoveImmediate(MachineInstr *MI,
@@ -178,6 +195,13 @@ namespace {
     bool foldImmediate(MachineInstr *MI, MachineBasicBlock *MBB,
                        SmallSet<unsigned, 4> &ImmDefRegs,
                        DenseMap<unsigned, MachineInstr*> &ImmDefMIs);
+    /// \brief Finds recurrence cycles, but only ones that are formulated around
+    /// a def operand and a use operand that are tied. If there is a use
+    /// operand commutable with the tied use operand, find the recurrence cycle
+    /// along that operand as well.
+    bool findTargetRecurrence(unsigned Reg,
+                              const SmallSet<unsigned, 2> &TargetReg,
+                              RecurrenceCycle &RC);

    /// \brief If copy instruction \p MI is a virtual register copy, track it in
    /// the set \p CopySrcRegs and \p CopyMIs. If this virtual register was
@@ -222,6 +246,28 @@ namespace {
    }
  };

+  /// \brief Helper class to hold instructions that are inside recurrence
+  /// cycles. The recurrence cycle is formulated around 1) a def operand and its
+  /// tied use operand, or 2) a def operand and a use operand that is commutable
+  /// with another use operand which is tied to the def operand. In the latter
+  /// case, the indices of the tied use operand and the commutable use operand
+  /// are maintained with CommutePair.
+  class RecurrenceInstr {
+  public:
+    typedef std::pair<unsigned, unsigned> IndexPair;
+
+    RecurrenceInstr(MachineInstr *MI) : MI(MI) {}
+    RecurrenceInstr(MachineInstr *MI, unsigned Idx1, unsigned Idx2)
+      : MI(MI), CommutePair(std::make_pair(Idx1, Idx2)) {}
+
+    MachineInstr *getMI() const { return MI; }
+    Optional<IndexPair> getCommutePair() const { return CommutePair; }
+
+  private:
+    MachineInstr *MI;
+    Optional<IndexPair> CommutePair;
+  };
+
  /// \brief Helper class to hold a reply for ValueTracker queries. Contains the
  /// returned sources for a given search and the instructions where the sources
  /// were tracked from.
@@ -412,6 +458,7 @@ char &llvm::PeepholeOptimizerID = PeepholeOptimizer::ID;
 INITIALIZE_PASS_BEGIN(PeepholeOptimizer, DEBUG_TYPE,
                       "Peephole Optimizations", false, false)
 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
 INITIALIZE_PASS_END(PeepholeOptimizer, DEBUG_TYPE,
                     "Peephole Optimizations", false, false)

@@ -1487,6 +1534,113 @@ bool PeepholeOptimizer::foldRedundantNAPhysCopy(
   return false;
 }

+/// \brief Returns true if \p MO is a virtual register operand.
+static bool isVirtualRegisterOperand(MachineOperand &MO) {
+  if (!MO.isReg())
+    return false;
+  return TargetRegisterInfo::isVirtualRegister(MO.getReg());
+}
+
+bool PeepholeOptimizer::findTargetRecurrence(
+    unsigned Reg, const SmallSet<unsigned, 2> &TargetRegs,
+    RecurrenceCycle &RC) {
+  // Recurrence found if Reg is in TargetRegs.
+  if (TargetRegs.count(Reg))
+    return true;
+
+  // TODO: Currently, we only allow the last instruction of the recurrence
+  // cycle (the instruction that feeds the PHI instruction) to have more than
+  // one use to guarantee that commuting operands does not tie registers
+  // with overlapping live range.
Once we have actual live range info of
+  // each register, this constraint can be relaxed.
+  if (!MRI->hasOneNonDBGUse(Reg))
+    return false;
+
+  // Give up if the recurrence chain length is longer than the limit.
+  if (RC.size() >= MaxRecurrenceChain)
+    return false;
+
+  MachineInstr &MI = *(MRI->use_instr_nodbg_begin(Reg));
+  unsigned Idx = MI.findRegisterUseOperandIdx(Reg);
+
+  // Only interested in recurrences whose instructions have only one def, which
+  // is a virtual register.
+  if (MI.getDesc().getNumDefs() != 1)
+    return false;
+
+  MachineOperand &DefOp = MI.getOperand(0);
+  if (!isVirtualRegisterOperand(DefOp))
+    return false;
+
+  // Check if def operand of MI is tied to any use operand. We are only
+  // interested in the case that all the instructions in the recurrence chain
+  // have their def operand tied to one of the use operands.
+  unsigned TiedUseIdx;
+  if (!MI.isRegTiedToUseOperand(0, &TiedUseIdx))
+    return false;
+
+  if (Idx == TiedUseIdx) {
+    RC.push_back(RecurrenceInstr(&MI));
+    return findTargetRecurrence(DefOp.getReg(), TargetRegs, RC);
+  } else {
+    // If Idx is not TiedUseIdx, check if Idx is commutable with TiedUseIdx.
+    unsigned CommIdx = TargetInstrInfo::CommuteAnyOperandIndex;
+    if (TII->findCommutedOpIndices(MI, Idx, CommIdx) && CommIdx == TiedUseIdx) {
+      RC.push_back(RecurrenceInstr(&MI, Idx, CommIdx));
+      return findTargetRecurrence(DefOp.getReg(), TargetRegs, RC);
+    }
+  }
+
+  return false;
+}
+
+/// \brief Phi instructions will eventually be lowered to copy instructions. If
+/// a phi is in a loop header, a recurrence may be formulated around the source
+/// and destination of the phi. In such a case, commuting operands of the
+/// instructions in the recurrence may enable coalescing of the copy instruction
+/// generated from the phi. For example, if there is a recurrence of
+///
+/// LoopHeader:
+///   %vreg1 = phi(%vreg0, %vreg100)
+/// LoopLatch:
+///   %vreg0<def, tied1> = ADD %vreg2<def, tied0>, %vreg1
+///
+/// , the fact that vreg0 and vreg2 are in the same tied operands set makes
+/// the coalescing of the copy instruction generated from the phi in
+/// LoopHeader (i.e. %vreg1 = COPY %vreg0) impossible, because %vreg1 and
+/// %vreg2 have overlapping live ranges. This introduces an additional move
+/// instruction into the final assembly. However, if we commute %vreg2 and
+/// %vreg1 of the ADD instruction, the redundant move instruction can be
+/// avoided.
+bool PeepholeOptimizer::optimizeRecurrence(MachineInstr &PHI) {
+  SmallSet<unsigned, 2> TargetRegs;
+  for (unsigned Idx = 1; Idx < PHI.getNumOperands(); Idx += 2) {
+    MachineOperand &MO = PHI.getOperand(Idx);
+    assert(isVirtualRegisterOperand(MO) && "Invalid PHI instruction");
+    TargetRegs.insert(MO.getReg());
+  }
+
+  bool Changed = false;
+  RecurrenceCycle RC;
+  if (findTargetRecurrence(PHI.getOperand(0).getReg(), TargetRegs, RC)) {
+    // Commutes operands of instructions in RC if necessary so that the copy to
+    // be generated from PHI can be coalesced.
+ DEBUG(dbgs() << "Optimize recurrence chain from " << PHI); + for (auto &RI : RC) { + DEBUG(dbgs() << "\tInst: " << *(RI.getMI())); + auto CP = RI.getCommutePair(); + if (CP) { + Changed = true; + TII->commuteInstruction(*(RI.getMI()), false, (*CP).first, + (*CP).second); + DEBUG(dbgs() << "\t\tCommuted: " << *(RI.getMI())); + } + } + } + + return Changed; +} + bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(*MF.getFunction())) return false; @@ -1501,6 +1655,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { TRI = MF.getSubtarget().getRegisterInfo(); MRI = &MF.getRegInfo(); DT = Aggressive ? &getAnalysis<MachineDominatorTree>() : nullptr; + MLI = &getAnalysis<MachineLoopInfo>(); bool Changed = false; @@ -1529,6 +1684,8 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { SmallSet<unsigned, 4> CopySrcRegs; DenseMap<unsigned, MachineInstr *> CopySrcMIs; + bool IsLoopHeader = MLI->isLoopHeader(&MBB); + for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end(); MII != MIE; ) { MachineInstr *MI = &*MII; @@ -1540,9 +1697,16 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { if (MI->isDebugValue()) continue; - if (MI->isPosition() || MI->isPHI()) + if (MI->isPosition()) continue; + if (IsLoopHeader && MI->isPHI()) { + if (optimizeRecurrence(*MI)) { + Changed = true; + continue; + } + } + if (!MI->isCopy()) { for (const auto &Op : MI->operands()) { // Visit all operands: definitions can be implicit or explicit. @@ -1667,7 +1831,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { MRI->markUsesInDebugValueAsUndef(FoldedReg); FoldAsLoadDefCandidates.erase(FoldedReg); ++NumLoadFold; - + // MI is replaced with FoldMI so we can continue trying to fold Changed = true; MI = FoldMI; @@ -1675,7 +1839,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { } } } - + // If we run into an instruction we can't fold across, discard // the load candidates. Note: We might be able to fold *into* this // instruction, so this needs to be after the folding logic. diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp index 50d241bff23d..9562652556ac 100644 --- a/lib/CodeGen/RegAllocGreedy.cpp +++ b/lib/CodeGen/RegAllocGreedy.cpp @@ -2622,7 +2622,7 @@ unsigned RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg, } // If we couldn't allocate a register from spilling, there is probably some - // invalid inline assembly. The base class wil report it. + // invalid inline assembly. The base class will report it. if (Stage >= RS_Done || !VirtReg.isSpillable()) return tryLastChanceRecoloring(VirtReg, Order, NewVRegs, FixedRegisters, Depth); diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp index 7b3a5d5c5ff7..ff9bca092dbe 100644 --- a/lib/CodeGen/RegisterCoalescer.cpp +++ b/lib/CodeGen/RegisterCoalescer.cpp @@ -979,6 +979,11 @@ bool RegisterCoalescer::removePartialRedundancy(const CoalescerPair &CP, IntB.createDeadDef(NewCopyIdx, LIS->getVNInfoAllocator()); for (LiveInterval::SubRange &SR : IntB.subranges()) SR.createDeadDef(NewCopyIdx, LIS->getVNInfoAllocator()); + + // If the newly created Instruction has an address of an instruction that was + // deleted before (object recycled by the allocator) it needs to be removed from + // the deleted list. 
+ ErasedInstrs.erase(NewCopyMI); } else { DEBUG(dbgs() << "\tremovePartialRedundancy: Remove the copy from BB#" << MBB.getNumber() << '\t' << CopyMI); @@ -989,6 +994,8 @@ bool RegisterCoalescer::removePartialRedundancy(const CoalescerPair &CP, // While updating the live-ranges, we only look at slot indices and // never go back to the instruction. LIS->RemoveMachineInstrFromMaps(CopyMI); + // Mark instructions as deleted. + ErasedInstrs.insert(&CopyMI); CopyMI.eraseFromParent(); // Update the liveness. @@ -3095,7 +3102,7 @@ copyCoalesceWorkList(MutableArrayRef<MachineInstr*> CurrList) { continue; // Skip instruction pointers that have already been erased, for example by // dead code elimination. - if (ErasedInstrs.erase(CurrList[i])) { + if (ErasedInstrs.count(CurrList[i])) { CurrList[i] = nullptr; continue; } diff --git a/lib/CodeGen/RenameIndependentSubregs.cpp b/lib/CodeGen/RenameIndependentSubregs.cpp index d2eff950d861..bd5ecbd28f29 100644 --- a/lib/CodeGen/RenameIndependentSubregs.cpp +++ b/lib/CodeGen/RenameIndependentSubregs.cpp @@ -243,10 +243,14 @@ void RenameIndependentSubregs::rewriteOperands(const IntEqClasses &Classes, unsigned VReg = Intervals[ID]->reg; MO.setReg(VReg); - if (MO.isTied()) { + + if (MO.isTied() && Reg != VReg) { /// Undef use operands are not tracked in the equivalence class but need /// to be update if they are tied. MO.getParent()->substituteRegister(Reg, VReg, 0, TRI); + + // substituteRegister breaks the iterator, so restart. + I = MRI->reg_nodbg_begin(Reg); } } // TODO: We could attempt to recompute new register classes while visiting diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp index 7dd66d799be4..0f70b0e9ca07 100644 --- a/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -1089,7 +1089,7 @@ static void toggleKills(const MachineRegisterInfo &MRI, LivePhysRegs &LiveRegs, // Things that are available after the instruction are killed by it. 
bool IsKill = LiveRegs.available(MRI, Reg); MO.setIsKill(IsKill); - if (IsKill && addToLiveRegs) + if (addToLiveRegs) LiveRegs.addReg(Reg); } } diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index d02dcb6f4439..d901af727686 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -4915,7 +4915,7 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) { return SDValue(); // Loads must share the same base address - BaseIndexOffset Ptr = BaseIndexOffset::match(L->getBasePtr()); + BaseIndexOffset Ptr = BaseIndexOffset::match(L->getBasePtr(), DAG); int64_t ByteOffsetFromBase = 0; if (!Base) Base = Ptr; @@ -8210,18 +8210,20 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { if (N0.getOpcode() == ISD::SHL && N0.hasOneUse() && (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::SHL, VT)) && TLI.isTypeDesirableForOp(ISD::SHL, VT)) { - if (const ConstantSDNode *CAmt = isConstOrConstSplat(N0.getOperand(1))) { - uint64_t Amt = CAmt->getZExtValue(); - unsigned Size = VT.getScalarSizeInBits(); - - if (Amt < Size) { - SDLoc SL(N); - EVT AmtVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout()); + SDValue Amt = N0.getOperand(1); + KnownBits Known; + DAG.computeKnownBits(Amt, Known); + unsigned Size = VT.getScalarSizeInBits(); + if (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size)) { + SDLoc SL(N); + EVT AmtVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout()); - SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0)); - return DAG.getNode(ISD::SHL, SL, VT, Trunc, - DAG.getConstant(Amt, SL, AmtVT)); + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0)); + if (AmtVT != Amt.getValueType()) { + Amt = DAG.getZExtOrTrunc(Amt, SL, AmtVT); + AddToWorklist(Amt.getNode()); } + return DAG.getNode(ISD::SHL, SL, VT, Trunc, Amt); } } @@ -9751,6 +9753,52 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) { } } + // fold (fmul X, (select (fcmp X > 0.0), -1.0, 1.0)) -> (fneg (fabs X)) + // fold (fmul X, (select (fcmp X > 0.0), 1.0, -1.0)) -> (fabs X) + if (Flags.hasNoNaNs() && Flags.hasNoSignedZeros() && + (N0.getOpcode() == ISD::SELECT || N1.getOpcode() == ISD::SELECT) && + TLI.isOperationLegal(ISD::FABS, VT)) { + SDValue Select = N0, X = N1; + if (Select.getOpcode() != ISD::SELECT) + std::swap(Select, X); + + SDValue Cond = Select.getOperand(0); + auto TrueOpnd = dyn_cast<ConstantFPSDNode>(Select.getOperand(1)); + auto FalseOpnd = dyn_cast<ConstantFPSDNode>(Select.getOperand(2)); + + if (TrueOpnd && FalseOpnd && + Cond.getOpcode() == ISD::SETCC && Cond.getOperand(0) == X && + isa<ConstantFPSDNode>(Cond.getOperand(1)) && + cast<ConstantFPSDNode>(Cond.getOperand(1))->isExactlyValue(0.0)) { + ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + switch (CC) { + default: break; + case ISD::SETOLT: + case ISD::SETULT: + case ISD::SETOLE: + case ISD::SETULE: + case ISD::SETLT: + case ISD::SETLE: + std::swap(TrueOpnd, FalseOpnd); + // Fall through + case ISD::SETOGT: + case ISD::SETUGT: + case ISD::SETOGE: + case ISD::SETUGE: + case ISD::SETGT: + case ISD::SETGE: + if (TrueOpnd->isExactlyValue(-1.0) && FalseOpnd->isExactlyValue(1.0) && + TLI.isOperationLegal(ISD::FNEG, VT)) + return DAG.getNode(ISD::FNEG, DL, VT, + DAG.getNode(ISD::FABS, DL, VT, X)); + if (TrueOpnd->isExactlyValue(1.0) && FalseOpnd->isExactlyValue(-1.0)) + return DAG.getNode(ISD::FABS, DL, VT, X); + + break; + } + } + } + // FMUL -> FMA combines: if (SDValue Fused = visitFMULForFMADistributiveCombine(N)) { 
AddToWorklist(Fused.getNode()); @@ -12394,7 +12442,7 @@ void DAGCombiner::getStoreMergeCandidates( StoreSDNode *St, SmallVectorImpl<MemOpLink> &StoreNodes) { // This holds the base pointer, index, and the offset in bytes from the base // pointer. - BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr()); + BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr(), DAG); EVT MemVT = St->getMemoryVT(); // We must have a base and an offset. @@ -12414,8 +12462,8 @@ void DAGCombiner::getStoreMergeCandidates( BaseIndexOffset LBasePtr; // Match on loadbaseptr if relevant. if (IsLoadSrc) - LBasePtr = - BaseIndexOffset::match(cast<LoadSDNode>(St->getValue())->getBasePtr()); + LBasePtr = BaseIndexOffset::match( + cast<LoadSDNode>(St->getValue())->getBasePtr(), DAG); auto CandidateMatch = [&](StoreSDNode *Other, BaseIndexOffset &Ptr, int64_t &Offset) -> bool { @@ -12429,7 +12477,7 @@ void DAGCombiner::getStoreMergeCandidates( if (IsLoadSrc) { // The Load's Base Ptr must also match if (LoadSDNode *OtherLd = dyn_cast<LoadSDNode>(Other->getValue())) { - auto LPtr = BaseIndexOffset::match(OtherLd->getBasePtr()); + auto LPtr = BaseIndexOffset::match(OtherLd->getBasePtr(), DAG); if (!(LBasePtr.equalBaseIndex(LPtr, DAG))) return false; } else @@ -12443,7 +12491,7 @@ void DAGCombiner::getStoreMergeCandidates( if (!(Other->getValue().getOpcode() == ISD::EXTRACT_VECTOR_ELT || Other->getValue().getOpcode() == ISD::EXTRACT_SUBVECTOR)) return false; - Ptr = BaseIndexOffset::match(Other->getBasePtr()); + Ptr = BaseIndexOffset::match(Other->getBasePtr(), DAG); return (BasePtr.equalBaseIndex(Ptr, DAG, Offset)); }; // We looking for a root node which is an ancestor to all mergable @@ -12786,7 +12834,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { if (Ld->getMemoryVT() != MemVT) break; - BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld->getBasePtr()); + BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld->getBasePtr(), DAG); // If this is not the first ptr that we check. int64_t LdOffset = 0; if (LdBasePtr.getBase().getNode()) { @@ -12829,6 +12877,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { // This variable refers to the size and not index in the array. unsigned LastLegalVectorType = 1; unsigned LastLegalIntegerType = 1; + bool isDereferenceable = true; bool DoIntegerTruncate = false; StartAddress = LoadNodes[0].OffsetFromBase; SDValue FirstChain = FirstLoad->getChain(); @@ -12841,6 +12890,10 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { if (CurrAddress - StartAddress != (ElementSizeBytes * i)) break; LastConsecutiveLoad = i; + + if (isDereferenceable && !LoadNodes[i].MemNode->isDereferenceable()) + isDereferenceable = false; + // Find a legal type for the vector store. EVT StoreTy = EVT::getVectorVT(Context, MemVT, i + 1); bool IsFastSt, IsFastLd; @@ -12926,11 +12979,16 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { SDValue NewStoreChain = getMergeStoreChains(StoreNodes, NumElem); AddToWorklist(NewStoreChain.getNode()); + MachineMemOperand::Flags MMOFlags = isDereferenceable ? 
+ MachineMemOperand::MODereferenceable: + MachineMemOperand::MONone; + SDValue NewLoad, NewStore; if (UseVectorTy || !DoIntegerTruncate) { NewLoad = DAG.getLoad(JointMemOpVT, LoadDL, FirstLoad->getChain(), FirstLoad->getBasePtr(), - FirstLoad->getPointerInfo(), FirstLoadAlign); + FirstLoad->getPointerInfo(), FirstLoadAlign, + MMOFlags); NewStore = DAG.getStore(NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(), FirstInChain->getPointerInfo(), FirstStoreAlign); @@ -12940,7 +12998,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { NewLoad = DAG.getExtLoad(ISD::EXTLOAD, LoadDL, ExtendedTy, FirstLoad->getChain(), FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(), - JointMemOpVT, FirstLoadAlign); + JointMemOpVT, FirstLoadAlign, MMOFlags); NewStore = DAG.getTruncStore(NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(), FirstInChain->getPointerInfo(), JointMemOpVT, @@ -15013,6 +15071,11 @@ static SDValue combineTruncationShuffle(ShuffleVectorSDNode *SVN, unsigned NumElts = VT.getVectorNumElements(); unsigned EltSizeInBits = VT.getScalarSizeInBits(); unsigned ExtSrcSizeInBits = N00.getScalarValueSizeInBits(); + unsigned ExtDstSizeInBits = N0.getScalarValueSizeInBits(); + + if (ExtDstSizeInBits % ExtSrcSizeInBits != 0) + return SDValue(); + unsigned ExtScale = ExtDstSizeInBits / ExtSrcSizeInBits; // (v4i32 truncate_vector_inreg(v2i64)) == shuffle<0,2-1,-1> // (v8i16 truncate_vector_inreg(v4i32)) == shuffle<0,2,4,6,-1,-1,-1,-1> @@ -15034,11 +15097,10 @@ static SDValue combineTruncationShuffle(ShuffleVectorSDNode *SVN, if (EltSizeInBits != ExtSrcSizeInBits) return SDValue(); - // Attempt to match a 'truncate_vector_inreg' shuffle, we just search for - // power-of-2 truncations as they are the most likely. - for (unsigned Scale = 2; Scale < NumElts; Scale *= 2) - if (isTruncate(Scale)) - return DAG.getBitcast(VT, N00); + // We can remove *extend_vector_inreg only if the truncation happens at + // the same scale as the extension. + if (isTruncate(ExtScale)) + return DAG.getBitcast(VT, N00); return SDValue(); } @@ -16540,8 +16602,8 @@ bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const { unsigned NumBytes1 = Op1->getMemoryVT().getSizeInBits() >> 3; // Check for BaseIndexOffset matching. - BaseIndexOffset BasePtr0 = BaseIndexOffset::match(Op0->getBasePtr()); - BaseIndexOffset BasePtr1 = BaseIndexOffset::match(Op1->getBasePtr()); + BaseIndexOffset BasePtr0 = BaseIndexOffset::match(Op0->getBasePtr(), DAG); + BaseIndexOffset BasePtr1 = BaseIndexOffset::match(Op1->getBasePtr(), DAG); int64_t PtrDiff; if (BasePtr0.equalBaseIndex(BasePtr1, DAG, PtrDiff)) return !((NumBytes0 <= PtrDiff) || (PtrDiff + NumBytes1 <= 0)); @@ -16751,7 +16813,7 @@ SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) { bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) { // This holds the base pointer, index, and the offset in bytes from the base // pointer. - BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr()); + BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr(), DAG); // We must have a base and an offset. if (!BasePtr.getBase().getNode()) @@ -16777,7 +16839,7 @@ bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) { break; // Find the base pointer and offset for this memory node. - BaseIndexOffset Ptr = BaseIndexOffset::match(Index->getBasePtr()); + BaseIndexOffset Ptr = BaseIndexOffset::match(Index->getBasePtr(), DAG); // Check that the base pointer is the same as the original one. 
if (!BasePtr.equalBaseIndex(Ptr, DAG)) diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 75fec7bd1d48..ac3247948169 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -1827,11 +1827,10 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N, TLI.isOperationLegalOrCustom(N->getOpcode() == ISD::ADD ? ISD::UADDO : ISD::USUBO, TLI.getTypeToExpandTo(*DAG.getContext(), NVT)); - TargetLoweringBase::BooleanContent BoolType = TLI.getBooleanContents(NVT); - if (hasOVF) { EVT OvfVT = getSetCCResultType(NVT); SDVTList VTList = DAG.getVTList(NVT, OvfVT); + TargetLoweringBase::BooleanContent BoolType = TLI.getBooleanContents(NVT); int RevOpc; if (N->getOpcode() == ISD::ADD) { RevOpc = ISD::SUB; @@ -1864,13 +1863,6 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N, Hi = DAG.getNode(ISD::ADD, dl, NVT, makeArrayRef(HiOps, 2)); SDValue Cmp1 = DAG.getSetCC(dl, getSetCCResultType(NVT), Lo, LoOps[0], ISD::SETULT); - - if (BoolType == TargetLoweringBase::ZeroOrOneBooleanContent) { - SDValue Carry = DAG.getZExtOrTrunc(Cmp1, dl, NVT); - Hi = DAG.getNode(ISD::ADD, dl, NVT, Hi, Carry); - return; - } - SDValue Carry1 = DAG.getSelect(dl, NVT, Cmp1, DAG.getConstant(1, dl, NVT), DAG.getConstant(0, dl, NVT)); @@ -1885,14 +1877,9 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N, SDValue Cmp = DAG.getSetCC(dl, getSetCCResultType(LoOps[0].getValueType()), LoOps[0], LoOps[1], ISD::SETULT); - - SDValue Borrow; - if (BoolType == TargetLoweringBase::ZeroOrOneBooleanContent) - Borrow = DAG.getZExtOrTrunc(Cmp, dl, NVT); - else - Borrow = DAG.getSelect(dl, NVT, Cmp, DAG.getConstant(1, dl, NVT), - DAG.getConstant(0, dl, NVT)); - + SDValue Borrow = DAG.getSelect(dl, NVT, Cmp, + DAG.getConstant(1, dl, NVT), + DAG.getConstant(0, dl, NVT)); Hi = DAG.getNode(ISD::SUB, dl, NVT, Hi, Borrow); } } diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp index d2e0dbbf88ec..4e899ae6668e 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp @@ -11,6 +11,7 @@ #include "llvm/CodeGen/SelectionDAGAddressAnalysis.h" #include "llvm/CodeGen/ISDOpcodes.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" @@ -18,28 +19,41 @@ namespace llvm { bool BaseIndexOffset::equalBaseIndex(BaseIndexOffset &Other, const SelectionDAG &DAG, int64_t &Off) { - // Obvious equivalent + // Initial Offset difference. Off = Other.Offset - Offset; - if (Other.Base == Base && Other.Index == Index && - Other.IsIndexSignExt == IsIndexSignExt) - return true; - // Match GlobalAddresses - if (Index == Other.Index) - if (GlobalAddressSDNode *A = dyn_cast<GlobalAddressSDNode>(Base)) - if (GlobalAddressSDNode *B = dyn_cast<GlobalAddressSDNode>(Other.Base)) + if ((Other.Index == Index) && (Other.IsIndexSignExt == IsIndexSignExt)) { + // Trivial match. + if (Other.Base == Base) + return true; + + // Match GlobalAddresses + if (auto *A = dyn_cast<GlobalAddressSDNode>(Base)) + if (auto *B = dyn_cast<GlobalAddressSDNode>(Other.Base)) if (A->getGlobal() == B->getGlobal()) { Off += B->getOffset() - A->getOffset(); return true; } - // TODO: we should be able to add FrameIndex analysis improvements here. 
+ const MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); + // Match non-equal FrameIndexes - a FrameIndex stemming from an + // alloca will not have it's ObjectOffset set until post-DAG and + // as such we must assume the two framesIndices are incomparable. + if (auto *A = dyn_cast<FrameIndexSDNode>(Base)) + if (auto *B = dyn_cast<FrameIndexSDNode>(Other.Base)) + if (!MFI.getObjectAllocation(A->getIndex()) && + !MFI.getObjectAllocation(B->getIndex())) { + Off += MFI.getObjectOffset(B->getIndex()) - + MFI.getObjectOffset(A->getIndex()); + return true; + } + } return false; } /// Parses tree in Ptr for base, index, offset addresses. -BaseIndexOffset BaseIndexOffset::match(SDValue Ptr) { +BaseIndexOffset BaseIndexOffset::match(SDValue Ptr, const SelectionDAG &DAG) { // (((B + I*M) + c)) + c ... SDValue Base = Ptr; SDValue Index = SDValue(); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index f9f431db55be..acf68fbbdedf 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -3375,7 +3375,7 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) { SDValue IdxN = getValue(Idx); if (!IdxN.getValueType().isVector() && VectorWidth) { - MVT VT = MVT::getVectorVT(IdxN.getValueType().getSimpleVT(), VectorWidth); + EVT VT = EVT::getVectorVT(*Context, IdxN.getValueType(), VectorWidth); IdxN = DAG.getSplatBuildVector(VT, dl, IdxN); } diff --git a/lib/CodeGen/TargetPassConfig.cpp b/lib/CodeGen/TargetPassConfig.cpp index b1918b19e1df..817e58ce59e1 100644 --- a/lib/CodeGen/TargetPassConfig.cpp +++ b/lib/CodeGen/TargetPassConfig.cpp @@ -1,4 +1,4 @@ -//===-- TargetPassConfig.cpp - Target independent code generation passes --===// +//===- TargetPassConfig.cpp - Target independent code generation passes ---===// // // The LLVM Compiler Infrastructure // @@ -13,29 +13,37 @@ //===---------------------------------------------------------------------===// #include "llvm/CodeGen/TargetPassConfig.h" - +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/CFLAndersAliasAnalysis.h" #include "llvm/Analysis/CFLSteensAliasAnalysis.h" #include "llvm/Analysis/CallGraphSCCPass.h" -#include "llvm/Analysis/Passes.h" #include "llvm/Analysis/ScopedNoAliasAA.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/TypeBasedAliasAnalysis.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachinePassRegistry.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/RegAllocRegistry.h" -#include "llvm/CodeGen/RegisterUsageInfo.h" #include "llvm/IR/IRPrintingPasses.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Verifier.h" #include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCTargetOptions.h" +#include "llvm/Pass.h" +#include "llvm/Support/CodeGen.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Threading.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/SymbolRewriter.h" +#include <cassert> +#include <string> using namespace llvm; @@ -225,6 +233,7 @@ char TargetPassConfig::EarlyTailDuplicateID = 0; char TargetPassConfig::PostRAMachineLICMID = 0; 
namespace { + struct InsertedPass { AnalysisID TargetPassID; IdentifyingPassPtr InsertedPassID; @@ -245,9 +254,11 @@ struct InsertedPass { return NP; } }; -} + +} // end anonymous namespace namespace llvm { + class PassConfigImpl { public: // List of passes explicitly substituted by this target. Normally this is @@ -263,7 +274,8 @@ public: /// is inserted after each instance of the first one. SmallVector<InsertedPass, 4> InsertedPasses; }; -} // namespace llvm + +} // end namespace llvm // Out of line virtual method. TargetPassConfig::~TargetPassConfig() { @@ -273,11 +285,7 @@ TargetPassConfig::~TargetPassConfig() { // Out of line constructor provides default values for pass options and // registers all common codegen passes. TargetPassConfig::TargetPassConfig(LLVMTargetMachine &TM, PassManagerBase &pm) - : ImmutablePass(ID), PM(&pm), Started(true), Stopped(false), - AddingMachinePasses(false), TM(&TM), Impl(nullptr), Initialized(false), - DisableVerify(false), EnableTailMerge(true), - RequireCodeGenSCCOrder(false) { - + : ImmutablePass(ID), PM(&pm), TM(&TM) { Impl = new PassConfigImpl(); // Register all target independent codegen passes to activate their PassIDs, @@ -325,7 +333,7 @@ TargetPassConfig *LLVMTargetMachine::createPassConfig(PassManagerBase &PM) { } TargetPassConfig::TargetPassConfig() - : ImmutablePass(ID), PM(nullptr) { + : ImmutablePass(ID) { report_fatal_error("Trying to construct TargetPassConfig without a target " "machine. Scheduling a CodeGen pass without a target " "triple set?"); diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp index 552a89f76ca2..83c00e24d14f 100644 --- a/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -68,6 +68,13 @@ EnableRescheduling("twoaddr-reschedule", cl::desc("Coalesce copies by rescheduling (default=true)"), cl::init(true), cl::Hidden); +// Limit the number of dataflow edges to traverse when evaluating the benefit +// of commuting operands. +static cl::opt<unsigned> MaxDataFlowEdge( + "dataflow-edge-limit", cl::Hidden, cl::init(3), + cl::desc("Maximum number of dataflow edges to traverse when evaluating " + "the benefit of commuting operands")); + namespace { class TwoAddressInstructionPass : public MachineFunctionPass { MachineFunction *MF; @@ -637,10 +644,10 @@ isProfitableToCommute(unsigned regA, unsigned regB, unsigned regC, // To more generally minimize register copies, ideally the logic of two addr // instruction pass should be integrated with register allocation pass where // interference graph is available. 
- if (isRevCopyChain(regC, regA, 3)) + if (isRevCopyChain(regC, regA, MaxDataFlowEdge)) return true; - if (isRevCopyChain(regB, regA, 3)) + if (isRevCopyChain(regB, regA, MaxDataFlowEdge)) return false; // Since there are no intervening uses for both registers, then commute diff --git a/lib/DebugInfo/CodeView/CVSymbolVisitor.cpp b/lib/DebugInfo/CodeView/CVSymbolVisitor.cpp index d058f4864975..e0c7ef58c304 100644 --- a/lib/DebugInfo/CodeView/CVSymbolVisitor.cpp +++ b/lib/DebugInfo/CodeView/CVSymbolVisitor.cpp @@ -29,10 +29,8 @@ static Error visitKnownRecord(CVSymbol &Record, return Error::success(); } -Error CVSymbolVisitor::visitSymbolRecord(CVSymbol &Record) { - if (auto EC = Callbacks.visitSymbolBegin(Record)) - return EC; - +static Error finishVisitation(CVSymbol &Record, + SymbolVisitorCallbacks &Callbacks) { switch (Record.Type) { default: if (auto EC = Callbacks.visitUnknownSymbol(Record)) @@ -55,6 +53,18 @@ Error CVSymbolVisitor::visitSymbolRecord(CVSymbol &Record) { return Error::success(); } +Error CVSymbolVisitor::visitSymbolRecord(CVSymbol &Record) { + if (auto EC = Callbacks.visitSymbolBegin(Record)) + return EC; + return finishVisitation(Record, Callbacks); +} + +Error CVSymbolVisitor::visitSymbolRecord(CVSymbol &Record, uint32_t Offset) { + if (auto EC = Callbacks.visitSymbolBegin(Record, Offset)) + return EC; + return finishVisitation(Record, Callbacks); +} + Error CVSymbolVisitor::visitSymbolStream(const CVSymbolArray &Symbols) { for (auto I : Symbols) { if (auto EC = visitSymbolRecord(I)) @@ -62,3 +72,13 @@ Error CVSymbolVisitor::visitSymbolStream(const CVSymbolArray &Symbols) { } return Error::success(); } + +Error CVSymbolVisitor::visitSymbolStream(const CVSymbolArray &Symbols, + uint32_t InitialOffset) { + for (auto I : Symbols) { + if (auto EC = visitSymbolRecord(I, InitialOffset)) + return EC; + InitialOffset += I.length(); + } + return Error::success(); +} diff --git a/lib/DebugInfo/CodeView/DebugChecksumsSubsection.cpp b/lib/DebugInfo/CodeView/DebugChecksumsSubsection.cpp index c31b8d1c96d5..ccc20eb74887 100644 --- a/lib/DebugInfo/CodeView/DebugChecksumsSubsection.cpp +++ b/lib/DebugInfo/CodeView/DebugChecksumsSubsection.cpp @@ -1,4 +1,4 @@ -//===- DebugChecksumsSubsection.cpp ----------------------*- C++ -*-===// +//===- DebugChecksumsSubsection.cpp ---------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -8,10 +8,17 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h" - -#include "llvm/DebugInfo/CodeView/CodeViewError.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/DebugStringTableSubsection.h" #include "llvm/Support/BinaryStreamReader.h" +#include "llvm/Support/BinaryStreamWriter.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/MathExtras.h" +#include <cassert> +#include <cstdint> +#include <cstring> using namespace llvm; using namespace llvm::codeview; @@ -25,7 +32,7 @@ struct FileChecksumEntryHeader { // Checksum bytes follow. 
}; -Error llvm::VarStreamArrayExtractor<FileChecksumEntry>:: +Error VarStreamArrayExtractor<FileChecksumEntry>:: operator()(BinaryStreamRef Stream, uint32_t &Len, FileChecksumEntry &Item) { BinaryStreamReader Reader(Stream); @@ -48,6 +55,7 @@ Error DebugChecksumsSubsectionRef::initialize(BinaryStreamReader Reader) { return Error::success(); } + Error DebugChecksumsSubsectionRef::initialize(BinaryStreamRef Section) { BinaryStreamReader Reader(Section); return initialize(Reader); diff --git a/lib/DebugInfo/CodeView/DebugCrossExSubsection.cpp b/lib/DebugInfo/CodeView/DebugCrossExSubsection.cpp index 21e2cc56075b..cef27787cfd1 100644 --- a/lib/DebugInfo/CodeView/DebugCrossExSubsection.cpp +++ b/lib/DebugInfo/CodeView/DebugCrossExSubsection.cpp @@ -1,4 +1,4 @@ -//===- DebugCrossExSubsection.cpp -------------------------------*- C++ -*-===// +//===- DebugCrossExSubsection.cpp -----------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -8,8 +8,10 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/CodeView/DebugCrossExSubsection.h" - #include "llvm/DebugInfo/CodeView/CodeViewError.h" +#include "llvm/Support/BinaryStreamWriter.h" +#include "llvm/Support/Error.h" +#include <cstdint> using namespace llvm; using namespace llvm::codeview; diff --git a/lib/DebugInfo/CodeView/DebugCrossImpSubsection.cpp b/lib/DebugInfo/CodeView/DebugCrossImpSubsection.cpp index 2c4a0b779342..88c0076915b5 100644 --- a/lib/DebugInfo/CodeView/DebugCrossImpSubsection.cpp +++ b/lib/DebugInfo/CodeView/DebugCrossImpSubsection.cpp @@ -1,4 +1,4 @@ -//===- DebugCrossImpSubsection.cpp ------------------------------*- C++ -*-===// +//===- DebugCrossImpSubsection.cpp ----------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -8,14 +8,21 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/CodeView/DebugCrossImpSubsection.h" - +#include "llvm/ADT/ArrayRef.h" #include "llvm/DebugInfo/CodeView/CodeViewError.h" #include "llvm/DebugInfo/CodeView/DebugStringTableSubsection.h" +#include "llvm/Support/BinaryStreamReader.h" +#include "llvm/Support/BinaryStreamWriter.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/Error.h" +#include <algorithm> +#include <cstdint> +#include <utility> +#include <vector> using namespace llvm; using namespace llvm::codeview; -namespace llvm { Error VarStreamArrayExtractor<CrossModuleImportItem>:: operator()(BinaryStreamRef Stream, uint32_t &Len, codeview::CrossModuleImportItem &Item) { @@ -34,7 +41,6 @@ operator()(BinaryStreamRef Stream, uint32_t &Len, return EC; return Error::success(); } -} Error DebugCrossModuleImportsSubsectionRef::initialize( BinaryStreamReader Reader) { diff --git a/lib/DebugInfo/CodeView/DebugInlineeLinesSubsection.cpp b/lib/DebugInfo/CodeView/DebugInlineeLinesSubsection.cpp index e7719d05dbdc..077c103a615b 100644 --- a/lib/DebugInfo/CodeView/DebugInlineeLinesSubsection.cpp +++ b/lib/DebugInfo/CodeView/DebugInlineeLinesSubsection.cpp @@ -1,4 +1,4 @@ -//===- DebugInlineeLinesSubsection.cpp ------------------------*- C++-*-===// +//===- DebugInlineeLinesSubsection.cpp ------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -8,11 +8,15 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/CodeView/DebugInlineeLinesSubsection.h" - -#include "llvm/DebugInfo/CodeView/CodeViewError.h" +#include "llvm/ADT/ArrayRef.h" 
+#include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h" -#include "llvm/DebugInfo/CodeView/DebugStringTableSubsection.h" -#include "llvm/DebugInfo/CodeView/DebugSubsectionRecord.h" +#include "llvm/Support/BinaryStreamReader.h" +#include "llvm/Support/BinaryStreamWriter.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/Error.h" +#include <cassert> +#include <cstdint> using namespace llvm; using namespace llvm::codeview; diff --git a/lib/DebugInfo/CodeView/DebugLinesSubsection.cpp b/lib/DebugInfo/CodeView/DebugLinesSubsection.cpp index fbcad61d60a6..57ad40819fbc 100644 --- a/lib/DebugInfo/CodeView/DebugLinesSubsection.cpp +++ b/lib/DebugInfo/CodeView/DebugLinesSubsection.cpp @@ -1,4 +1,4 @@ -//===- DebugLinesSubsection.cpp -------------------------------*- C++-*-===// +//===- DebugLinesSubsection.cpp -------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -8,18 +8,21 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/CodeView/DebugLinesSubsection.h" - +#include "llvm/ADT/ArrayRef.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/CodeViewError.h" #include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h" -#include "llvm/DebugInfo/CodeView/DebugStringTableSubsection.h" -#include "llvm/DebugInfo/CodeView/DebugSubsectionRecord.h" +#include "llvm/Support/BinaryStreamReader.h" +#include "llvm/Support/BinaryStreamWriter.h" +#include "llvm/Support/Error.h" +#include <cassert> +#include <cstdint> using namespace llvm; using namespace llvm::codeview; Error LineColumnExtractor::operator()(BinaryStreamRef Stream, uint32_t &Len, LineColumnEntry &Item) { - using namespace codeview; const LineBlockFragmentHeader *BlockHeader; BinaryStreamReader Reader(Stream); if (auto EC = Reader.readObject(BlockHeader)) diff --git a/lib/DebugInfo/CodeView/DebugStringTableSubsection.cpp b/lib/DebugInfo/CodeView/DebugStringTableSubsection.cpp index de02525270c4..d723282eb715 100644 --- a/lib/DebugInfo/CodeView/DebugStringTableSubsection.cpp +++ b/lib/DebugInfo/CodeView/DebugStringTableSubsection.cpp @@ -1,4 +1,4 @@ -//===- DebugStringTableSubsection.cpp - CodeView String Table ---*- C++ -*-===// +//===- DebugStringTableSubsection.cpp - CodeView String Table -------------===// // // The LLVM Compiler Infrastructure // @@ -8,10 +8,14 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/CodeView/DebugStringTableSubsection.h" - -#include "llvm/Support/BinaryStream.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/Support/BinaryStreamReader.h" #include "llvm/Support/BinaryStreamWriter.h" +#include "llvm/Support/Error.h" +#include <algorithm> +#include <cassert> +#include <cstdint> using namespace llvm; using namespace llvm::codeview; @@ -23,6 +27,7 @@ Error DebugStringTableSubsectionRef::initialize(BinaryStreamRef Contents) { Stream = Contents; return Error::success(); } + Error DebugStringTableSubsectionRef::initialize(BinaryStreamReader &Reader) { return Reader.readStreamRef(Stream); } diff --git a/lib/DebugInfo/CodeView/DebugSubsectionRecord.cpp b/lib/DebugInfo/CodeView/DebugSubsectionRecord.cpp index d69eca018e0c..55f343c11e7f 100644 --- a/lib/DebugInfo/CodeView/DebugSubsectionRecord.cpp +++ b/lib/DebugInfo/CodeView/DebugSubsectionRecord.cpp @@ -1,4 +1,4 @@ -//===- DebugSubsectionRecord.cpp -----------------------------*- 
C++-*-===// +//===- DebugSubsectionRecord.cpp ------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -8,16 +8,20 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/CodeView/DebugSubsectionRecord.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/DebugSubsection.h" - #include "llvm/Support/BinaryStreamReader.h" +#include "llvm/Support/BinaryStreamWriter.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/MathExtras.h" +#include <algorithm> +#include <cassert> +#include <cstdint> using namespace llvm; using namespace llvm::codeview; -DebugSubsectionRecord::DebugSubsectionRecord() - : Container(CodeViewContainer::ObjectFile), - Kind(DebugSubsectionKind::None) {} +DebugSubsectionRecord::DebugSubsectionRecord() = default; DebugSubsectionRecord::DebugSubsectionRecord(DebugSubsectionKind Kind, BinaryStreamRef Data, diff --git a/lib/DebugInfo/CodeView/DebugSymbolRVASubsection.cpp b/lib/DebugInfo/CodeView/DebugSymbolRVASubsection.cpp index 5f91b68f3ad8..60fbf9d747b2 100644 --- a/lib/DebugInfo/CodeView/DebugSymbolRVASubsection.cpp +++ b/lib/DebugInfo/CodeView/DebugSymbolRVASubsection.cpp @@ -1,4 +1,4 @@ -//===- DebugSymbolRVASubsection.cpp ------------------------------*- C++-*-===// +//===- DebugSymbolRVASubsection.cpp ---------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -8,6 +8,11 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/CodeView/DebugSymbolRVASubsection.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" +#include "llvm/Support/BinaryStreamReader.h" +#include "llvm/Support/BinaryStreamWriter.h" +#include <cstdint> using namespace llvm; using namespace llvm::codeview; diff --git a/lib/DebugInfo/CodeView/EnumTables.cpp b/lib/DebugInfo/CodeView/EnumTables.cpp index ec00af28395e..4cfb55a31b35 100644 --- a/lib/DebugInfo/CodeView/EnumTables.cpp +++ b/lib/DebugInfo/CodeView/EnumTables.cpp @@ -1,4 +1,4 @@ -//===- EnumTables.cpp - Enum to string conversion tables --------*- C++ -*-===// +//===- EnumTables.cpp - Enum to string conversion tables ------------------===// // // The LLVM Compiler Infrastructure // @@ -8,6 +8,8 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/CodeView/EnumTables.h" +#include "llvm/Support/ScopedPrinter.h" +#include <type_traits> using namespace llvm; using namespace codeview; @@ -333,6 +335,7 @@ static const EnumEntry<COFF::SectionCharacteristics> namespace llvm { namespace codeview { + ArrayRef<EnumEntry<SymbolKind>> getSymbolTypeNames() { return makeArrayRef(SymbolTypeNames); } @@ -348,48 +351,63 @@ ArrayRef<EnumEntry<uint16_t>> getRegisterNames() { ArrayRef<EnumEntry<uint32_t>> getPublicSymFlagNames() { return makeArrayRef(PublicSymFlagNames); } + ArrayRef<EnumEntry<uint8_t>> getProcSymFlagNames() { return makeArrayRef(ProcSymFlagNames); } + ArrayRef<EnumEntry<uint16_t>> getLocalFlagNames() { return makeArrayRef(LocalFlags); } + ArrayRef<EnumEntry<uint8_t>> getFrameCookieKindNames() { return makeArrayRef(FrameCookieKinds); } + ArrayRef<EnumEntry<SourceLanguage>> getSourceLanguageNames() { return makeArrayRef(SourceLanguages); } + ArrayRef<EnumEntry<uint32_t>> getCompileSym2FlagNames() { return makeArrayRef(CompileSym2FlagNames); } + ArrayRef<EnumEntry<uint32_t>> getCompileSym3FlagNames() { return makeArrayRef(CompileSym3FlagNames); } + 
ArrayRef<EnumEntry<uint32_t>> getFileChecksumNames() { return makeArrayRef(FileChecksumNames); } + ArrayRef<EnumEntry<unsigned>> getCPUTypeNames() { return makeArrayRef(CPUTypeNames); } + ArrayRef<EnumEntry<uint32_t>> getFrameProcSymFlagNames() { return makeArrayRef(FrameProcSymFlagNames); } + ArrayRef<EnumEntry<uint16_t>> getExportSymFlagNames() { return makeArrayRef(ExportSymFlagNames); } + ArrayRef<EnumEntry<uint32_t>> getModuleSubstreamKindNames() { return makeArrayRef(ModuleSubstreamKindNames); } + ArrayRef<EnumEntry<uint8_t>> getThunkOrdinalNames() { return makeArrayRef(ThunkOrdinalNames); } + ArrayRef<EnumEntry<uint16_t>> getTrampolineNames() { return makeArrayRef(TrampolineNames); } + ArrayRef<EnumEntry<COFF::SectionCharacteristics>> getImageSectionCharacteristicNames() { return makeArrayRef(ImageSectionCharacteristicNames); } -} -} + +} // end namespace codeview +} // end namespace llvm diff --git a/lib/DebugInfo/CodeView/Formatters.cpp b/lib/DebugInfo/CodeView/Formatters.cpp index ef00bd8570fa..1fa8d219d6ac 100644 --- a/lib/DebugInfo/CodeView/Formatters.cpp +++ b/lib/DebugInfo/CodeView/Formatters.cpp @@ -1,4 +1,4 @@ -//===- Formatters.cpp -------------------------------------------*- C++ -*-===// +//===- Formatters.cpp -----------------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -8,6 +8,10 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/CodeView/Formatters.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/Support/raw_ostream.h" +#include <algorithm> +#include <cassert> using namespace llvm; using namespace llvm::codeview; @@ -19,7 +23,7 @@ GuidAdapter::GuidAdapter(StringRef Guid) GuidAdapter::GuidAdapter(ArrayRef<uint8_t> Guid) : FormatAdapter(std::move(Guid)) {} -void GuidAdapter::format(llvm::raw_ostream &Stream, StringRef Style) { +void GuidAdapter::format(raw_ostream &Stream, StringRef Style) { static const char *Lookup = "0123456789ABCDEF"; assert(Item.size() == 16 && "Expected 16-byte GUID"); diff --git a/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp b/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp index 20f7e72c3af3..5aaf3f1453a8 100644 --- a/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp +++ b/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp @@ -1,4 +1,4 @@ -//===- LazyRandomTypeCollection.cpp ---------------------------- *- C++--*-===// +//===- LazyRandomTypeCollection.cpp ---------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -8,12 +8,20 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h" - -#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" #include "llvm/DebugInfo/CodeView/CodeViewError.h" #include "llvm/DebugInfo/CodeView/TypeName.h" -#include "llvm/DebugInfo/CodeView/TypeServerHandler.h" -#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h" +#include "llvm/DebugInfo/CodeView/TypeRecord.h" +#include "llvm/Support/BinaryStreamReader.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/Error.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <iterator> using namespace llvm; using namespace llvm::codeview; diff --git a/lib/DebugInfo/CodeView/StringsAndChecksums.cpp b/lib/DebugInfo/CodeView/StringsAndChecksums.cpp index 928bf8c94f73..306af1d1ef6b 100644 --- 
a/lib/DebugInfo/CodeView/StringsAndChecksums.cpp +++ b/lib/DebugInfo/CodeView/StringsAndChecksums.cpp @@ -1,4 +1,4 @@ -//===- StringsAndChecksums.cpp ----------------------------------*- C++ -*-===// +//===- StringsAndChecksums.cpp --------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -8,14 +8,18 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/CodeView/StringsAndChecksums.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h" #include "llvm/DebugInfo/CodeView/DebugStringTableSubsection.h" #include "llvm/DebugInfo/CodeView/DebugSubsectionRecord.h" +#include "llvm/Support/Error.h" +#include <cassert> using namespace llvm; using namespace llvm::codeview; -StringsAndChecksumsRef::StringsAndChecksumsRef() {} +StringsAndChecksumsRef::StringsAndChecksumsRef() = default; StringsAndChecksumsRef::StringsAndChecksumsRef( const DebugStringTableSubsectionRef &Strings) diff --git a/lib/DebugInfo/CodeView/SymbolSerializer.cpp b/lib/DebugInfo/CodeView/SymbolSerializer.cpp index 9f2d619d1a1c..9a2e776feb75 100644 --- a/lib/DebugInfo/CodeView/SymbolSerializer.cpp +++ b/lib/DebugInfo/CodeView/SymbolSerializer.cpp @@ -1,4 +1,4 @@ -//===- SymbolSerializer.cpp -------------------------------------*- C++ -*-===// +//===- SymbolSerializer.cpp -----------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -8,6 +8,13 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/CodeView/SymbolSerializer.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/DebugInfo/CodeView/SymbolRecord.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/Error.h" +#include <cassert> +#include <cstdint> +#include <cstring> using namespace llvm; using namespace llvm::codeview; @@ -15,7 +22,7 @@ using namespace llvm::codeview; SymbolSerializer::SymbolSerializer(BumpPtrAllocator &Allocator, CodeViewContainer Container) : Storage(Allocator), RecordBuffer(MaxRecordLength), - Stream(RecordBuffer, llvm::support::little), Writer(Stream), + Stream(RecordBuffer, support::little), Writer(Stream), Mapping(Writer, Container) {} Error SymbolSerializer::visitSymbolBegin(CVSymbol &Record) { diff --git a/lib/DebugInfo/CodeView/TypeIndexDiscovery.cpp b/lib/DebugInfo/CodeView/TypeIndexDiscovery.cpp index 1226d5be3f3c..72cb9e2e3544 100644 --- a/lib/DebugInfo/CodeView/TypeIndexDiscovery.cpp +++ b/lib/DebugInfo/CodeView/TypeIndexDiscovery.cpp @@ -438,6 +438,25 @@ void llvm::codeview::discoverTypeIndices(const CVType &Type, ::discoverTypeIndices(Type.content(), Type.kind(), Refs); } +void llvm::codeview::discoverTypeIndices(const CVType &Type, + SmallVectorImpl<TypeIndex> &Indices) { + + Indices.clear(); + + SmallVector<TiReference, 4> Refs; + discoverTypeIndices(Type, Refs); + if (Refs.empty()) + return; + + BinaryStreamReader Reader(Type.content(), support::little); + for (const auto &Ref : Refs) { + Reader.setOffset(Ref.Offset); + FixedStreamArray<TypeIndex> Run; + cantFail(Reader.readArray(Run, Ref.Count)); + Indices.append(Run.begin(), Run.end()); + } +} + void llvm::codeview::discoverTypeIndices(ArrayRef<uint8_t> RecordData, SmallVectorImpl<TiReference> &Refs) { const RecordPrefix *P = diff --git a/lib/DebugInfo/CodeView/TypeSerializer.cpp b/lib/DebugInfo/CodeView/TypeSerializer.cpp index 93c1198e36ce..003c13b4a20d 100644 --- a/lib/DebugInfo/CodeView/TypeSerializer.cpp +++ 
b/lib/DebugInfo/CodeView/TypeSerializer.cpp @@ -1,4 +1,4 @@ -//===- TypeSerialzier.cpp ---------------------------------------*- C++ -*-===// +//===- TypeSerialzier.cpp -------------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -8,16 +8,27 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/CodeView/TypeSerializer.h" - +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" +#include "llvm/DebugInfo/CodeView/RecordSerialization.h" +#include "llvm/DebugInfo/CodeView/TypeIndex.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/BinaryByteStream.h" #include "llvm/Support/BinaryStreamWriter.h" - -#include <string.h> +#include "llvm/Support/Endian.h" +#include "llvm/Support/Error.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <cstring> using namespace llvm; using namespace llvm::codeview; namespace { + struct HashedType { uint64_t Hash; const uint8_t *Data; @@ -30,20 +41,26 @@ struct HashedType { struct HashedTypePtr { HashedTypePtr() = default; HashedTypePtr(HashedType *Ptr) : Ptr(Ptr) {} + HashedType *Ptr = nullptr; }; -} // namespace + +} // end anonymous namespace namespace llvm { + template <> struct DenseMapInfo<HashedTypePtr> { static inline HashedTypePtr getEmptyKey() { return HashedTypePtr(nullptr); } + static inline HashedTypePtr getTombstoneKey() { return HashedTypePtr(reinterpret_cast<HashedType *>(1)); } + static unsigned getHashValue(HashedTypePtr Val) { assert(Val.Ptr != getEmptyKey().Ptr && Val.Ptr != getTombstoneKey().Ptr); return Val.Ptr->Hash; } + static bool isEqual(HashedTypePtr LHSP, HashedTypePtr RHSP) { HashedType *LHS = LHSP.Ptr; HashedType *RHS = RHSP.Ptr; @@ -54,7 +71,8 @@ template <> struct DenseMapInfo<HashedTypePtr> { return ::memcmp(LHS->Data, RHS->Data, LHS->Size) == 0; } }; -} + +} // end namespace llvm /// Private implementation so that we don't leak our DenseMap instantiations to /// users. @@ -159,13 +177,13 @@ TypeSerializer::addPadding(MutableArrayRef<uint8_t> Record) { TypeSerializer::TypeSerializer(BumpPtrAllocator &Storage, bool Hash) : RecordStorage(Storage), RecordBuffer(MaxRecordLength * 2), - Stream(RecordBuffer, llvm::support::little), Writer(Stream), + Stream(RecordBuffer, support::little), Writer(Stream), Mapping(Writer) { // RecordBuffer needs to be able to hold enough data so that if we are 1 // byte short of MaxRecordLen, and then we try to write MaxRecordLen bytes, // we won't overflow. 
if (Hash) - Hasher = make_unique<TypeHasher>(Storage); + Hasher = llvm::make_unique<TypeHasher>(Storage); } TypeSerializer::~TypeSerializer() = default; @@ -331,7 +349,7 @@ Error TypeSerializer::visitMemberEnd(CVMemberRecord &Record) { uint8_t *SegmentBytes = RecordStorage.Allocate<uint8_t>(LengthWithSize); auto SavedSegment = MutableArrayRef<uint8_t>(SegmentBytes, LengthWithSize); - MutableBinaryByteStream CS(SavedSegment, llvm::support::little); + MutableBinaryByteStream CS(SavedSegment, support::little); BinaryStreamWriter CW(CS); if (auto EC = CW.writeBytes(CopyData)) return EC; diff --git a/lib/DebugInfo/DWARF/CMakeLists.txt b/lib/DebugInfo/DWARF/CMakeLists.txt index 6ca6e64bd8e6..11f94509e8fa 100644 --- a/lib/DebugInfo/DWARF/CMakeLists.txt +++ b/lib/DebugInfo/DWARF/CMakeLists.txt @@ -3,6 +3,7 @@ add_llvm_library(LLVMDebugInfoDWARF DWARFAcceleratorTable.cpp DWARFCompileUnit.cpp DWARFContext.cpp + DWARFDataExtractor.cpp DWARFDebugAbbrev.cpp DWARFDebugArangeSet.cpp DWARFDebugAranges.cpp diff --git a/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp b/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp index 87009bf1b6a1..9ae7c9a07f76 100644 --- a/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp +++ b/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp @@ -62,6 +62,45 @@ uint32_t DWARFAcceleratorTable::getHeaderDataLength() { return Hdr.HeaderDataLength; } +ArrayRef<std::pair<DWARFAcceleratorTable::HeaderData::AtomType, + DWARFAcceleratorTable::HeaderData::Form>> +DWARFAcceleratorTable::getAtomsDesc() { + return HdrData.Atoms; +} + +bool DWARFAcceleratorTable::validateForms() { + for (auto Atom : getAtomsDesc()) { + DWARFFormValue FormValue(Atom.second); + switch (Atom.first) { + case dwarf::DW_ATOM_die_offset: + if ((!FormValue.isFormClass(DWARFFormValue::FC_Constant) && + !FormValue.isFormClass(DWARFFormValue::FC_Flag)) || + FormValue.getForm() == dwarf::DW_FORM_sdata) + return false; + default: + break; + } + } + return true; +} + +uint32_t DWARFAcceleratorTable::readAtoms(uint32_t &HashDataOffset) { + uint32_t DieOffset = dwarf::DW_INVALID_OFFSET; + + for (auto Atom : getAtomsDesc()) { + DWARFFormValue FormValue(Atom.second); + FormValue.extractValue(AccelSection, &HashDataOffset, NULL); + switch (Atom.first) { + case dwarf::DW_ATOM_die_offset: + DieOffset = *FormValue.getAsUnsignedConstant(); + break; + default: + break; + } + } + return DieOffset; +} + LLVM_DUMP_METHOD void DWARFAcceleratorTable::dump(raw_ostream &OS) const { // Dump the header. 
OS << "Magic = " << format("0x%08x", Hdr.Magic) << '\n' @@ -121,8 +160,7 @@ LLVM_DUMP_METHOD void DWARFAcceleratorTable::dump(raw_ostream &OS) const { continue; } while (AccelSection.isValidOffsetForDataOfSize(DataOffset, 4)) { - unsigned StringOffset = - getRelocatedValue(AccelSection, 4, &DataOffset, &Relocs); + unsigned StringOffset = AccelSection.getRelocatedValue(4, &DataOffset); if (!StringOffset) break; OS << format(" Name: %08x \"%s\"\n", StringOffset, diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp index 381479461750..a18d4efec07a 100644 --- a/lib/DebugInfo/DWARF/DWARFContext.cpp +++ b/lib/DebugInfo/DWARF/DWARFContext.cpp @@ -59,26 +59,13 @@ using DWARFLineTable = DWARFDebugLine::LineTable; using FileLineInfoKind = DILineInfoSpecifier::FileLineInfoKind; using FunctionNameKind = DILineInfoSpecifier::FunctionNameKind; -uint64_t llvm::getRelocatedValue(const DataExtractor &Data, uint32_t Size, - uint32_t *Off, const RelocAddrMap *Relocs, - uint64_t *SectionIndex) { - if (!Relocs) - return Data.getUnsigned(Off, Size); - RelocAddrMap::const_iterator AI = Relocs->find(*Off); - if (AI == Relocs->end()) - return Data.getUnsigned(Off, Size); - if (SectionIndex) - *SectionIndex = AI->second.SectionIndex; - return Data.getUnsigned(Off, Size) + AI->second.Value; -} - static void dumpAccelSection(raw_ostream &OS, StringRef Name, const DWARFSection& Section, StringRef StringSection, bool LittleEndian) { - DataExtractor AccelSection(Section.Data, LittleEndian, 0); + DWARFDataExtractor AccelSection(Section, LittleEndian, 0); DataExtractor StrData(StringSection, LittleEndian, 0); OS << "\n." << Name << " contents:\n"; - DWARFAcceleratorTable Accel(AccelSection, StrData, Section.Relocs); + DWARFAcceleratorTable Accel(AccelSection, StrData); if (!Accel.extract()) return; Accel.dump(OS); @@ -88,7 +75,7 @@ static void dumpDWARFv5StringOffsetsSection(raw_ostream &OS, StringRef SectionName, const DWARFSection &StringOffsetsSection, StringRef StringSection, bool LittleEndian) { - DataExtractor StrOffsetExt(StringOffsetsSection.Data, LittleEndian, 0); + DWARFDataExtractor StrOffsetExt(StringOffsetsSection, LittleEndian, 0); uint32_t Offset = 0; uint64_t SectionSize = StringOffsetsSection.Data.size(); @@ -144,8 +131,8 @@ dumpDWARFv5StringOffsetsSection(raw_ostream &OS, StringRef SectionName, while (Offset - ContributionBase < ContributionSize) { OS << format("0x%8.8x: ", Offset); // FIXME: We can only extract strings in DWARF32 format at the moment. 
- uint64_t StringOffset = getRelocatedValue( - StrOffsetExt, EntrySize, &Offset, &StringOffsetsSection.Relocs); + uint64_t StringOffset = + StrOffsetExt.getRelocatedValue(EntrySize, &Offset); if (Format == DWARF32) { OS << format("%8.8x ", StringOffset); uint32_t StringOffset32 = (uint32_t)StringOffset; @@ -287,11 +274,11 @@ void DWARFContext::dump(raw_ostream &OS, DIDumpOptions DumpOpts) { if (!CUDIE) continue; if (auto StmtOffset = toSectionOffset(CUDIE.find(DW_AT_stmt_list))) { - DataExtractor lineData(getLineSection().Data, isLittleEndian(), - savedAddressByteSize); + DWARFDataExtractor lineData(getLineSection(), isLittleEndian(), + savedAddressByteSize); DWARFDebugLine::LineTable LineTable; uint32_t Offset = *StmtOffset; - LineTable.parse(lineData, &getLineSection().Relocs, &Offset); + LineTable.parse(lineData, &Offset); LineTable.dump(OS); } } @@ -310,8 +297,8 @@ void DWARFContext::dump(raw_ostream &OS, DIDumpOptions DumpOpts) { if (DumpType == DIDT_All || DumpType == DIDT_LineDwo) { OS << "\n.debug_line.dwo contents:\n"; unsigned stmtOffset = 0; - DataExtractor lineData(getLineDWOSection().Data, isLittleEndian(), - savedAddressByteSize); + DWARFDataExtractor lineData(getLineDWOSection(), isLittleEndian(), + savedAddressByteSize); DWARFDebugLine::LineTable LineTable; while (LineTable.Prologue.parse(lineData, &stmtOffset)) { LineTable.dump(OS); @@ -348,11 +335,11 @@ void DWARFContext::dump(raw_ostream &OS, DIDumpOptions DumpOpts) { // sizes, but for simplicity we just use the address byte size of the last // compile unit (there is no easy and fast way to associate address range // list and the compile unit it describes). - DataExtractor rangesData(getRangeSection().Data, isLittleEndian(), - savedAddressByteSize); + DWARFDataExtractor rangesData(getRangeSection(), isLittleEndian(), + savedAddressByteSize); offset = 0; DWARFDebugRangeList rangeList; - while (rangeList.extract(rangesData, &offset, getRangeSection().Relocs)) + while (rangeList.extract(rangesData, &offset)) rangeList.dump(OS); } @@ -499,11 +486,13 @@ const DWARFDebugLoc *DWARFContext::getDebugLoc() { if (Loc) return Loc.get(); - DataExtractor LocData(getLocSection().Data, isLittleEndian(), 0); - Loc.reset(new DWARFDebugLoc(getLocSection().Relocs)); + Loc.reset(new DWARFDebugLoc); // assume all compile units have the same address byte size - if (getNumCompileUnits()) - Loc->parse(LocData, getCompileUnitAtIndex(0)->getAddressByteSize()); + if (getNumCompileUnits()) { + DWARFDataExtractor LocData(getLocSection(), isLittleEndian(), + getCompileUnitAtIndex(0)->getAddressByteSize()); + Loc->parse(LocData); + } return Loc.get(); } @@ -570,7 +559,7 @@ const DWARFDebugMacro *DWARFContext::getDebugMacro() { const DWARFLineTable * DWARFContext::getLineTableForUnit(DWARFUnit *U) { if (!Line) - Line.reset(new DWARFDebugLine(&getLineSection().Relocs)); + Line.reset(new DWARFDebugLine); auto UnitDIE = U->getUnitDIE(); if (!UnitDIE) @@ -586,12 +575,12 @@ DWARFContext::getLineTableForUnit(DWARFUnit *U) { return lt; // Make sure the offset is good before we try to parse. - if (stmtOffset >= U->getLineSection().size()) + if (stmtOffset >= U->getLineSection().Data.size()) return nullptr; // We have to parse it first. 
- DataExtractor lineData(U->getLineSection(), isLittleEndian(), - U->getAddressByteSize()); + DWARFDataExtractor lineData(U->getLineSection(), isLittleEndian(), + U->getAddressByteSize()); return Line->getOrParseLineTable(lineData, stmtOffset); } @@ -870,13 +859,13 @@ static Expected<SymInfo> getSymbolInfo(const object::ObjectFile &Obj, Expected<uint64_t> SymAddrOrErr = Sym->getAddress(); if (!SymAddrOrErr) - return createError("error: failed to compute symbol address: ", + return createError("failed to compute symbol address: ", SymAddrOrErr.takeError()); // Also remember what section this symbol is in for later auto SectOrErr = Sym->getSection(); if (!SectOrErr) - return createError("error: failed to get symbol section: ", + return createError("failed to get symbol section: ", SectOrErr.takeError()); RSec = *SectOrErr; @@ -937,8 +926,14 @@ Error DWARFContextInMemory::maybeDecompress(const SectionRef &Sec, return Error::success(); } -DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, - const LoadedObjectInfo *L) +ErrorPolicy DWARFContextInMemory::defaultErrorHandler(Error E) { + errs() << "error: " + toString(std::move(E)) << '\n'; + return ErrorPolicy::Continue; +} + +DWARFContextInMemory::DWARFContextInMemory( + const object::ObjectFile &Obj, const LoadedObjectInfo *L, + function_ref<ErrorPolicy(Error)> HandleError) : FileName(Obj.getFileName()), IsLittleEndian(Obj.isLittleEndian()), AddressSize(Obj.getBytesInAddress()) { for (const SectionRef &Section : Obj.sections()) { @@ -961,9 +956,10 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, Section.getContents(data); if (auto Err = maybeDecompress(Section, name, data)) { - errs() << "error: failed to decompress '" + name + "', " + - toString(std::move(Err)) - << '\n'; + ErrorPolicy EP = HandleError( + createError("failed to decompress '" + name + "', ", std::move(Err))); + if (EP == ErrorPolicy::Halt) + return; continue; } @@ -1055,7 +1051,8 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, Expected<SymInfo> SymInfoOrErr = getSymbolInfo(Obj, Reloc, L, AddrCache); if (!SymInfoOrErr) { - errs() << toString(SymInfoOrErr.takeError()) << '\n'; + if (HandleError(SymInfoOrErr.takeError()) == ErrorPolicy::Halt) + return; continue; } @@ -1064,7 +1061,11 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, if (V.error()) { SmallString<32> Name; Reloc.getTypeName(Name); - errs() << "error: failed to compute relocation: " << Name << "\n"; + ErrorPolicy EP = HandleError( + createError("failed to compute relocation: " + Name + ", ", + errorCodeToError(object_error::parse_failed))); + if (EP == ErrorPolicy::Halt) + return; continue; } RelocAddrEntry Rel = {SymInfoOrErr->SectionIndex, Val}; diff --git a/lib/DebugInfo/DWARF/DWARFDataExtractor.cpp b/lib/DebugInfo/DWARF/DWARFDataExtractor.cpp new file mode 100644 index 000000000000..001097e56c71 --- /dev/null +++ b/lib/DebugInfo/DWARF/DWARFDataExtractor.cpp @@ -0,0 +1,24 @@ +//===- DWARFDataExtractor.cpp ---------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h" + +using namespace llvm; + +uint64_t DWARFDataExtractor::getRelocatedValue(uint32_t Size, uint32_t *Off, + uint64_t *SecNdx) const { + if (!RelocMap) + return getUnsigned(Off, Size); + RelocAddrMap::const_iterator AI = RelocMap->find(*Off); + if (AI == RelocMap->end()) + return getUnsigned(Off, Size); + if (SecNdx) + *SecNdx = AI->second.SectionIndex; + return getUnsigned(Off, Size) + AI->second.Value; +} diff --git a/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp b/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp index 1551974b822a..976bc4651ae6 100644 --- a/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp +++ b/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp @@ -21,13 +21,13 @@ using namespace dwarf; bool DWARFDebugInfoEntry::extractFast(const DWARFUnit &U, uint32_t *OffsetPtr) { - DataExtractor DebugInfoData = U.getDebugInfoExtractor(); + DWARFDataExtractor DebugInfoData = U.getDebugInfoExtractor(); const uint32_t UEndOffset = U.getNextUnitOffset(); return extractFast(U, OffsetPtr, DebugInfoData, UEndOffset, 0); } bool DWARFDebugInfoEntry::extractFast(const DWARFUnit &U, uint32_t *OffsetPtr, - const DataExtractor &DebugInfoData, + const DWARFDataExtractor &DebugInfoData, uint32_t UEndOffset, uint32_t D) { Offset = *OffsetPtr; Depth = D; diff --git a/lib/DebugInfo/DWARF/DWARFDebugLine.cpp b/lib/DebugInfo/DWARF/DWARFDebugLine.cpp index ad5647f3e03d..7d180564e9f7 100644 --- a/lib/DebugInfo/DWARF/DWARFDebugLine.cpp +++ b/lib/DebugInfo/DWARF/DWARFDebugLine.cpp @@ -94,8 +94,8 @@ void DWARFDebugLine::Prologue::dump(raw_ostream &OS) const { // Parse v2-v4 directory and file tables. static void -parseV2DirFileTables(DataExtractor DebugLineData, uint32_t *OffsetPtr, - uint64_t EndPrologueOffset, +parseV2DirFileTables(const DWARFDataExtractor &DebugLineData, + uint32_t *OffsetPtr, uint64_t EndPrologueOffset, std::vector<StringRef> &IncludeDirectories, std::vector<DWARFDebugLine::FileNameEntry> &FileNames) { while (*OffsetPtr < EndPrologueOffset) { @@ -122,7 +122,7 @@ parseV2DirFileTables(DataExtractor DebugLineData, uint32_t *OffsetPtr, // Returns the descriptors, or an empty vector if we did not find a path or // ran off the end of the prologue. 
static ContentDescriptors -parseV5EntryFormat(DataExtractor DebugLineData, uint32_t *OffsetPtr, +parseV5EntryFormat(const DWARFDataExtractor &DebugLineData, uint32_t *OffsetPtr, uint64_t EndPrologueOffset) { ContentDescriptors Descriptors; int FormatCount = DebugLineData.getU8(OffsetPtr); @@ -142,8 +142,8 @@ parseV5EntryFormat(DataExtractor DebugLineData, uint32_t *OffsetPtr, } static bool -parseV5DirFileTables(DataExtractor DebugLineData, uint32_t *OffsetPtr, - uint64_t EndPrologueOffset, +parseV5DirFileTables(const DWARFDataExtractor &DebugLineData, + uint32_t *OffsetPtr, uint64_t EndPrologueOffset, const DWARFFormParams &FormParams, std::vector<StringRef> &IncludeDirectories, std::vector<DWARFDebugLine::FileNameEntry> &FileNames) { @@ -212,7 +212,7 @@ parseV5DirFileTables(DataExtractor DebugLineData, uint32_t *OffsetPtr, return true; } -bool DWARFDebugLine::Prologue::parse(DataExtractor DebugLineData, +bool DWARFDebugLine::Prologue::parse(const DWARFDataExtractor &DebugLineData, uint32_t *OffsetPtr) { const uint64_t PrologueOffset = *OffsetPtr; @@ -381,20 +381,19 @@ DWARFDebugLine::getLineTable(uint32_t Offset) const { } const DWARFDebugLine::LineTable * -DWARFDebugLine::getOrParseLineTable(DataExtractor DebugLineData, +DWARFDebugLine::getOrParseLineTable(const DWARFDataExtractor &DebugLineData, uint32_t Offset) { std::pair<LineTableIter, bool> Pos = LineTableMap.insert(LineTableMapTy::value_type(Offset, LineTable())); LineTable *LT = &Pos.first->second; if (Pos.second) { - if (!LT->parse(DebugLineData, RelocMap, &Offset)) + if (!LT->parse(DebugLineData, &Offset)) return nullptr; } return LT; } -bool DWARFDebugLine::LineTable::parse(DataExtractor DebugLineData, - const RelocAddrMap *RMap, +bool DWARFDebugLine::LineTable::parse(const DWARFDataExtractor &DebugLineData, uint32_t *OffsetPtr) { const uint32_t DebugLineOffset = *OffsetPtr; @@ -443,8 +442,7 @@ bool DWARFDebugLine::LineTable::parse(DataExtractor DebugLineData, // relocatable address. All of the other statement program opcodes // that affect the address register add a delta to it. This instruction // stores a relocatable value into it instead. - State.Row.Address = getRelocatedValue( - DebugLineData, DebugLineData.getAddressSize(), OffsetPtr, RMap); + State.Row.Address = DebugLineData.getRelocatedAddress(OffsetPtr); break; case DW_LNE_define_file: diff --git a/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp b/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp index 2178bef65d1d..c240dd7406d9 100644 --- a/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp +++ b/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp @@ -40,9 +40,9 @@ void DWARFDebugLoc::dump(raw_ostream &OS) const { } } -void DWARFDebugLoc::parse(DataExtractor data, unsigned AddressSize) { +void DWARFDebugLoc::parse(const DWARFDataExtractor &data) { uint32_t Offset = 0; - while (data.isValidOffset(Offset+AddressSize-1)) { + while (data.isValidOffset(Offset+data.getAddressSize()-1)) { Locations.resize(Locations.size() + 1); LocationList &Loc = Locations.back(); Loc.Offset = Offset; @@ -51,8 +51,8 @@ void DWARFDebugLoc::parse(DataExtractor data, unsigned AddressSize) { while (true) { // A beginning and ending address offsets. 
Entry E; - E.Begin = getRelocatedValue(data, AddressSize, &Offset, &RelocMap); - E.End = getRelocatedValue(data, AddressSize, &Offset, &RelocMap); + E.Begin = data.getRelocatedAddress(&Offset); + E.End = data.getRelocatedAddress(&Offset); // The end of any given location list is marked by an end of list entry, // which consists of a 0 for the beginning address offset and a 0 for the diff --git a/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp b/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp index 43201293fe60..0b6ae86fd94b 100644 --- a/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp +++ b/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp @@ -23,8 +23,8 @@ void DWARFDebugRangeList::clear() { Entries.clear(); } -bool DWARFDebugRangeList::extract(DataExtractor data, uint32_t *offset_ptr, - const RelocAddrMap &Relocs) { +bool DWARFDebugRangeList::extract(const DWARFDataExtractor &data, + uint32_t *offset_ptr) { clear(); if (!data.isValidOffset(*offset_ptr)) return false; @@ -35,10 +35,9 @@ bool DWARFDebugRangeList::extract(DataExtractor data, uint32_t *offset_ptr, while (true) { RangeListEntry entry; uint32_t prev_offset = *offset_ptr; - entry.StartAddress = getRelocatedValue(data, AddressSize, offset_ptr, - &Relocs, &entry.SectionIndex); - entry.EndAddress = - getRelocatedValue(data, AddressSize, offset_ptr, &Relocs); + entry.StartAddress = + data.getRelocatedAddress(offset_ptr, &entry.SectionIndex); + entry.EndAddress = data.getRelocatedAddress(offset_ptr); // Check that both values were extracted correctly. if (*offset_ptr != prev_offset + 2 * AddressSize) { diff --git a/lib/DebugInfo/DWARF/DWARFDie.cpp b/lib/DebugInfo/DWARF/DWARFDie.cpp index b4b682dd11b5..ef416f72ad17 100644 --- a/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -308,7 +308,7 @@ void DWARFDie::dump(raw_ostream &OS, unsigned RecurseDepth, unsigned Indent, DIDumpOptions DumpOpts) const { if (!isValid()) return; - DataExtractor debug_info_data = U->getDebugInfoExtractor(); + DWARFDataExtractor debug_info_data = U->getDebugInfoExtractor(); const uint32_t Offset = getOffset(); uint32_t offset = Offset; diff --git a/lib/DebugInfo/DWARF/DWARFFormValue.cpp b/lib/DebugInfo/DWARF/DWARFFormValue.cpp index 861114bde1f2..83a7792e1244 100644 --- a/lib/DebugInfo/DWARF/DWARFFormValue.cpp +++ b/lib/DebugInfo/DWARF/DWARFFormValue.cpp @@ -275,7 +275,7 @@ bool DWARFFormValue::isFormClass(DWARFFormValue::FormClass FC) const { FC == FC_SectionOffset; } -bool DWARFFormValue::extractValue(const DataExtractor &Data, +bool DWARFFormValue::extractValue(const DWARFDataExtractor &Data, uint32_t *OffsetPtr, const DWARFUnit *CU) { U = CU; bool Indirect = false; @@ -290,10 +290,9 @@ bool DWARFFormValue::extractValue(const DataExtractor &Data, case DW_FORM_ref_addr: { if (!U) return false; - uint16_t AddrSize = (Form == DW_FORM_addr) ? U->getAddressByteSize() - : U->getRefAddrByteSize(); - Value.uval = getRelocatedValue(Data, AddrSize, OffsetPtr, - U->getRelocMap(), &Value.SectionIndex); + uint16_t Size = (Form == DW_FORM_addr) ? U->getAddressByteSize() + : U->getRefAddrByteSize(); + Value.uval = Data.getRelocatedValue(Size, OffsetPtr, &Value.SectionIndex); break; } case DW_FORM_exprloc: @@ -333,11 +332,9 @@ bool DWARFFormValue::extractValue(const DataExtractor &Data, case DW_FORM_ref4: case DW_FORM_ref_sup4: case DW_FORM_strx4: - case DW_FORM_addrx4: { - const RelocAddrMap *RelocMap = U ? 
U->getRelocMap() : nullptr; - Value.uval = getRelocatedValue(Data, 4, OffsetPtr, RelocMap); + case DW_FORM_addrx4: + Value.uval = Data.getRelocatedValue(4, OffsetPtr); break; - } case DW_FORM_data8: case DW_FORM_ref8: case DW_FORM_ref_sup8: @@ -365,8 +362,8 @@ bool DWARFFormValue::extractValue(const DataExtractor &Data, case DW_FORM_strp_sup: { if (!U) return false; - Value.uval = getRelocatedValue(Data, U->getDwarfOffsetByteSize(), - OffsetPtr, U->getRelocMap()); + Value.uval = + Data.getRelocatedValue(U->getDwarfOffsetByteSize(), OffsetPtr); break; } case DW_FORM_flag_present: @@ -576,7 +573,6 @@ Optional<const char *> DWARFFormValue::getAsCString() const { uint64_t StrOffset; if (!U->getStringOffsetSectionItem(Offset, StrOffset)) return None; - StrOffset += U->getStringOffsetSectionRelocation(Offset); Offset = StrOffset; } if (const char *Str = U->getStringExtractor().getCStr(&Offset)) { diff --git a/lib/DebugInfo/DWARF/DWARFUnit.cpp b/lib/DebugInfo/DWARF/DWARFUnit.cpp index fd9c7c2b1d46..043bdb874f43 100644 --- a/lib/DebugInfo/DWARF/DWARFUnit.cpp +++ b/lib/DebugInfo/DWARF/DWARFUnit.cpp @@ -32,8 +32,7 @@ using namespace dwarf; void DWARFUnitSectionBase::parse(DWARFContext &C, const DWARFSection &Section) { parseImpl(C, Section, C.getDebugAbbrev(), &C.getRangeSection(), C.getStringSection(), C.getStringOffsetSection(), - &C.getAddrSection(), C.getLineSection().Data, C.isLittleEndian(), - false); + &C.getAddrSection(), C.getLineSection(), C.isLittleEndian(), false); } void DWARFUnitSectionBase::parseDWO(DWARFContext &C, @@ -41,15 +40,15 @@ void DWARFUnitSectionBase::parseDWO(DWARFContext &C, DWARFUnitIndex *Index) { parseImpl(C, DWOSection, C.getDebugAbbrevDWO(), &C.getRangeDWOSection(), C.getStringDWOSection(), C.getStringOffsetDWOSection(), - &C.getAddrSection(), C.getLineDWOSection().Data, C.isLittleEndian(), + &C.getAddrSection(), C.getLineDWOSection(), C.isLittleEndian(), true); } DWARFUnit::DWARFUnit(DWARFContext &DC, const DWARFSection &Section, const DWARFDebugAbbrev *DA, const DWARFSection *RS, StringRef SS, const DWARFSection &SOS, - const DWARFSection *AOS, StringRef LS, bool LE, bool IsDWO, - const DWARFUnitSectionBase &UnitSection, + const DWARFSection *AOS, const DWARFSection &LS, bool LE, + bool IsDWO, const DWARFUnitSectionBase &UnitSection, const DWARFUnitIndex::Entry *IndexEntry) : Context(DC), InfoSection(Section), Abbrev(DA), RangeSection(RS), LineSection(LS), StringSection(SS), StringOffsetSection(SOS), @@ -65,33 +64,23 @@ bool DWARFUnit::getAddrOffsetSectionItem(uint32_t Index, uint32_t Offset = AddrOffsetSectionBase + Index * getAddressByteSize(); if (AddrOffsetSection->Data.size() < Offset + getAddressByteSize()) return false; - DataExtractor DA(AddrOffsetSection->Data, isLittleEndian, - getAddressByteSize()); - Result = getRelocatedValue(DA, getAddressByteSize(), &Offset, - &AddrOffsetSection->Relocs); + DWARFDataExtractor DA(*AddrOffsetSection, isLittleEndian, + getAddressByteSize()); + Result = DA.getRelocatedAddress(&Offset); return true; } bool DWARFUnit::getStringOffsetSectionItem(uint32_t Index, uint64_t &Result) const { - unsigned ItemSize = getFormat() == DWARF64 ? 8 : 4; + unsigned ItemSize = getDwarfOffsetByteSize(); uint32_t Offset = StringOffsetSectionBase + Index * ItemSize; if (StringOffsetSection.Data.size() < Offset + ItemSize) return false; - DataExtractor DA(StringOffsetSection.Data, isLittleEndian, 0); - Result = ItemSize == 4 ? 
DA.getU32(&Offset) : DA.getU64(&Offset); + DWARFDataExtractor DA(StringOffsetSection, isLittleEndian, 0); + Result = DA.getRelocatedValue(ItemSize, &Offset); return true; } -uint64_t DWARFUnit::getStringOffsetSectionRelocation(uint32_t Index) const { - unsigned ItemSize = getFormat() == DWARF64 ? 8 : 4; - uint64_t ByteOffset = StringOffsetSectionBase + Index * ItemSize; - RelocAddrMap::const_iterator AI = getStringOffsetsRelocMap().find(ByteOffset); - if (AI != getStringOffsetsRelocMap().end()) - return AI->second.Value; - return 0; -} - bool DWARFUnit::extractImpl(DataExtractor debug_info, uint32_t *offset_ptr) { Length = debug_info.getU32(offset_ptr); // FIXME: Support DWARF64. @@ -149,14 +138,13 @@ bool DWARFUnit::extract(DataExtractor debug_info, uint32_t *offset_ptr) { } bool DWARFUnit::extractRangeList(uint32_t RangeListOffset, - DWARFDebugRangeList &RangeList) const { + DWARFDebugRangeList &RangeList) const { // Require that compile unit is extracted. assert(!DieArray.empty()); - DataExtractor RangesData(RangeSection->Data, isLittleEndian, - getAddressByteSize()); + DWARFDataExtractor RangesData(*RangeSection, isLittleEndian, + getAddressByteSize()); uint32_t ActualRangeListOffset = RangeSectionBase + RangeListOffset; - return RangeList.extract(RangesData, &ActualRangeListOffset, - RangeSection->Relocs); + return RangeList.extract(RangesData, &ActualRangeListOffset); } void DWARFUnit::clear() { @@ -190,7 +178,7 @@ void DWARFUnit::extractDIEsToVector( uint32_t DIEOffset = Offset + getHeaderSize(); uint32_t NextCUOffset = getNextUnitOffset(); DWARFDebugInfoEntry DIE; - DataExtractor DebugInfoData = getDebugInfoExtractor(); + DWARFDataExtractor DebugInfoData = getDebugInfoExtractor(); uint32_t Depth = 0; bool IsCUDie = true; diff --git a/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/lib/DebugInfo/DWARF/DWARFVerifier.cpp index 41907e570563..0a10e6b78911 100644 --- a/lib/DebugInfo/DWARF/DWARFVerifier.cpp +++ b/lib/DebugInfo/DWARF/DWARFVerifier.cpp @@ -280,11 +280,10 @@ bool DWARFVerifier::handleDebugLine() { bool DWARFVerifier::handleAppleNames() { NumAppleNamesErrors = 0; - DataExtractor AppleNamesSection(DCtx.getAppleNamesSection().Data, - DCtx.isLittleEndian(), 0); + DWARFDataExtractor AppleNamesSection(DCtx.getAppleNamesSection(), + DCtx.isLittleEndian(), 0); DataExtractor StrData(DCtx.getStringSection(), DCtx.isLittleEndian(), 0); - DWARFAcceleratorTable AppleNames(AppleNamesSection, StrData, - DCtx.getAppleNamesSection().Relocs); + DWARFAcceleratorTable AppleNames(AppleNamesSection, StrData); if (!AppleNames.extract()) { return true; @@ -292,20 +291,80 @@ bool DWARFVerifier::handleAppleNames() { OS << "Verifying .apple_names...\n"; - // Verify that all buckets have a valid hash index or are empty + // Verify that all buckets have a valid hash index or are empty. 
uint32_t NumBuckets = AppleNames.getNumBuckets(); uint32_t NumHashes = AppleNames.getNumHashes(); uint32_t BucketsOffset = AppleNames.getSizeHdr() + AppleNames.getHeaderDataLength(); + uint32_t HashesBase = BucketsOffset + NumBuckets * 4; + uint32_t OffsetsBase = HashesBase + NumHashes * 4; for (uint32_t BucketIdx = 0; BucketIdx < NumBuckets; ++BucketIdx) { uint32_t HashIdx = AppleNamesSection.getU32(&BucketsOffset); if (HashIdx >= NumHashes && HashIdx != UINT32_MAX) { - OS << format("error: Bucket[%d] has invalid hash index: [%d]\n", - BucketIdx, HashIdx); + OS << format("error: Bucket[%d] has invalid hash index: %u\n", BucketIdx, + HashIdx); ++NumAppleNamesErrors; } } + + uint32_t NumAtoms = AppleNames.getAtomsDesc().size(); + if (NumAtoms == 0) { + OS << "error: no atoms; failed to read HashData\n"; + ++NumAppleNamesErrors; + return false; + } + + if (!AppleNames.validateForms()) { + OS << "error: unsupported form; failed to read HashData\n"; + ++NumAppleNamesErrors; + return false; + } + + for (uint32_t HashIdx = 0; HashIdx < NumHashes; ++HashIdx) { + uint32_t HashOffset = HashesBase + 4 * HashIdx; + uint32_t DataOffset = OffsetsBase + 4 * HashIdx; + uint32_t Hash = AppleNamesSection.getU32(&HashOffset); + uint32_t HashDataOffset = AppleNamesSection.getU32(&DataOffset); + if (!AppleNamesSection.isValidOffsetForDataOfSize(HashDataOffset, + sizeof(uint64_t))) { + OS << format("error: Hash[%d] has invalid HashData offset: 0x%08x\n", + HashIdx, HashDataOffset); + ++NumAppleNamesErrors; + } + + uint32_t StrpOffset; + uint32_t StringOffset; + uint32_t StringCount = 0; + uint32_t DieOffset = dwarf::DW_INVALID_OFFSET; + + while ((StrpOffset = AppleNamesSection.getU32(&HashDataOffset)) != 0) { + const uint32_t NumHashDataObjects = + AppleNamesSection.getU32(&HashDataOffset); + for (uint32_t HashDataIdx = 0; HashDataIdx < NumHashDataObjects; + ++HashDataIdx) { + DieOffset = AppleNames.readAtoms(HashDataOffset); + if (!DCtx.getDIEForOffset(DieOffset)) { + const uint32_t BucketIdx = + NumBuckets ? (Hash % NumBuckets) : UINT32_MAX; + StringOffset = StrpOffset; + const char *Name = StrData.getCStr(&StringOffset); + if (!Name) + Name = "<NULL>"; + + OS << format( + "error: .apple_names Bucket[%d] Hash[%d] = 0x%08x " + "Str[%u] = 0x%08x " + "DIE[%d] = 0x%08x is not a valid DIE offset for \"%s\".\n", + BucketIdx, HashIdx, Hash, StringCount, StrpOffset, HashDataIdx, + DieOffset, Name); + + ++NumAppleNamesErrors; + } + } + ++StringCount; + } + } return NumAppleNamesErrors == 0; } diff --git a/lib/DebugInfo/PDB/Native/DbiModuleList.cpp b/lib/DebugInfo/PDB/Native/DbiModuleList.cpp index 434f775097e0..eea70b229c67 100644 --- a/lib/DebugInfo/PDB/Native/DbiModuleList.cpp +++ b/lib/DebugInfo/PDB/Native/DbiModuleList.cpp @@ -1,4 +1,4 @@ -//===- DbiModuleList.cpp - PDB module information list ----------*- C++ -*-===// +//===- DbiModuleList.cpp - PDB module information list --------------------===// // // The LLVM Compiler Infrastructure // @@ -6,10 +6,17 @@ // License. See LICENSE.TXT for details. 
// //===----------------------------------------------------------------------===// -#include "llvm/DebugInfo/PDB/Native/DbiModuleList.h" +#include "llvm/DebugInfo/PDB/Native/DbiModuleList.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/DebugInfo/PDB/Native/RawError.h" +#include "llvm/Support/BinaryStreamReader.h" #include "llvm/Support/Error.h" +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> using namespace llvm; using namespace llvm::pdb; diff --git a/lib/DebugInfo/PDB/Native/Hash.cpp b/lib/DebugInfo/PDB/Native/Hash.cpp index 2ad3f55dc5c3..61188ece2dcb 100644 --- a/lib/DebugInfo/PDB/Native/Hash.cpp +++ b/lib/DebugInfo/PDB/Native/Hash.cpp @@ -8,10 +8,10 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/PDB/Native/Hash.h" - #include "llvm/ADT/ArrayRef.h" #include "llvm/Support/Endian.h" #include "llvm/Support/JamCRC.h" +#include <cstdint> using namespace llvm; using namespace llvm::support; diff --git a/lib/DebugInfo/PDB/Native/HashTable.cpp b/lib/DebugInfo/PDB/Native/HashTable.cpp index ebf8c9c04db1..439217f91d04 100644 --- a/lib/DebugInfo/PDB/Native/HashTable.cpp +++ b/lib/DebugInfo/PDB/Native/HashTable.cpp @@ -1,4 +1,4 @@ -//===- HashTable.cpp - PDB Hash Table ---------------------------*- C++ -*-===// +//===- HashTable.cpp - PDB Hash Table -------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -8,12 +8,16 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/PDB/Native/HashTable.h" - #include "llvm/ADT/Optional.h" -#include "llvm/ADT/SparseBitVector.h" #include "llvm/DebugInfo/PDB/Native/RawError.h" - -#include <assert.h> +#include "llvm/Support/BinaryStreamReader.h" +#include "llvm/Support/BinaryStreamWriter.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/MathExtras.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <utility> using namespace llvm; using namespace llvm::pdb; @@ -106,9 +110,11 @@ void HashTable::clear() { } uint32_t HashTable::capacity() const { return Buckets.size(); } + uint32_t HashTable::size() const { return Present.count(); } HashTableIterator HashTable::begin() const { return HashTableIterator(*this); } + HashTableIterator HashTable::end() const { return HashTableIterator(*this, 0, true); } diff --git a/lib/DebugInfo/PDB/Native/ModuleDebugStream.cpp b/lib/DebugInfo/PDB/Native/ModuleDebugStream.cpp index 83c56574a16e..2e1f61c7a25d 100644 --- a/lib/DebugInfo/PDB/Native/ModuleDebugStream.cpp +++ b/lib/DebugInfo/PDB/Native/ModuleDebugStream.cpp @@ -9,11 +9,11 @@ #include "llvm/DebugInfo/PDB/Native/ModuleDebugStream.h" #include "llvm/ADT/iterator_range.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" +#include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h" #include "llvm/DebugInfo/CodeView/SymbolRecord.h" #include "llvm/DebugInfo/PDB/Native/DbiModuleDescriptor.h" -#include "llvm/DebugInfo/PDB/Native/PDBFile.h" #include "llvm/DebugInfo/PDB/Native/RawError.h" -#include "llvm/DebugInfo/PDB/Native/RawTypes.h" #include "llvm/Support/BinaryStreamReader.h" #include "llvm/Support/BinaryStreamRef.h" #include "llvm/Support/Error.h" @@ -97,7 +97,7 @@ ModuleDebugStreamRef::symbols(bool *HadError) const { return make_range(SymbolArray.begin(HadError), SymbolArray.end()); } -llvm::iterator_range<ModuleDebugStreamRef::DebugSubsectionIterator> +iterator_range<ModuleDebugStreamRef::DebugSubsectionIterator> 
ModuleDebugStreamRef::subsections() const { return make_range(Subsections.begin(), Subsections.end()); } diff --git a/lib/DebugInfo/PDB/Native/ModuleDebugStreamBuilder.cpp b/lib/DebugInfo/PDB/Native/ModuleDebugStreamBuilder.cpp deleted file mode 100644 index e69de29bb2d1..000000000000 --- a/lib/DebugInfo/PDB/Native/ModuleDebugStreamBuilder.cpp +++ /dev/null diff --git a/lib/DebugInfo/PDB/Native/NamedStreamMap.cpp b/lib/DebugInfo/PDB/Native/NamedStreamMap.cpp index 4f90cd9cd8ac..354b8c0e07ff 100644 --- a/lib/DebugInfo/PDB/Native/NamedStreamMap.cpp +++ b/lib/DebugInfo/PDB/Native/NamedStreamMap.cpp @@ -1,4 +1,4 @@ -//===- NamedStreamMap.cpp - PDB Named Stream Map ----------------*- C++ -*-===// +//===- NamedStreamMap.cpp - PDB Named Stream Map --------------------------===// // // The LLVM Compiler Infrastructure // @@ -8,17 +8,20 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/PDB/Native/NamedStreamMap.h" - -#include "llvm/ADT/SparseBitVector.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/iterator_range.h" #include "llvm/DebugInfo/PDB/Native/HashTable.h" #include "llvm/DebugInfo/PDB/Native/RawError.h" #include "llvm/Support/BinaryStreamReader.h" +#include "llvm/Support/BinaryStreamRef.h" +#include "llvm/Support/BinaryStreamWriter.h" +#include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" #include <algorithm> +#include <cassert> #include <cstdint> +#include <tuple> using namespace llvm; using namespace llvm::pdb; diff --git a/lib/DebugInfo/PDB/Native/NativeEnumModules.cpp b/lib/DebugInfo/PDB/Native/NativeEnumModules.cpp index c23120041164..a65782e2d4fc 100644 --- a/lib/DebugInfo/PDB/Native/NativeEnumModules.cpp +++ b/lib/DebugInfo/PDB/Native/NativeEnumModules.cpp @@ -32,9 +32,7 @@ std::unique_ptr<PDBSymbol> NativeEnumModules::getChildAtIndex(uint32_t Index) const { if (Index >= Modules.getModuleCount()) return nullptr; - return std::unique_ptr<PDBSymbol>(new PDBSymbolCompiland( - Session, std::unique_ptr<IPDBRawSymbol>(new NativeCompilandSymbol( - Session, 0, Modules.getModuleDescriptor(Index))))); + return Session.createCompilandSymbol(Modules.getModuleDescriptor(Index)); } std::unique_ptr<PDBSymbol> NativeEnumModules::getNext() { diff --git a/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp b/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp index ed6db63edbab..b4f5c96ce66b 100644 --- a/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp +++ b/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp @@ -1,4 +1,4 @@ -//===- NativeRawSymbol.cpp - Native implementation of IPDBRawSymbol -*- C++ -*-===// +//===- NativeRawSymbol.cpp - Native implementation of IPDBRawSymbol -------===// // // The LLVM Compiler Infrastructure // @@ -8,16 +8,7 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/PDB/Native/NativeRawSymbol.h" -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h" -#include "llvm/DebugInfo/PDB/Native/NativeSession.h" -#include "llvm/DebugInfo/PDB/PDBExtras.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h" -#include "llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h" -#include "llvm/DebugInfo/PDB/PDBSymbolTypeVTableShape.h" -#include "llvm/Support/ConvertUTF.h" -#include "llvm/Support/raw_ostream.h" using namespace llvm; using namespace llvm::pdb; @@ -49,7 +40,7 @@ NativeRawSymbol::findInlineFramesByRVA(uint32_t RVA) const { return nullptr; } -void 
NativeRawSymbol::getDataBytes(llvm::SmallVector<uint8_t, 32> &bytes) const { +void NativeRawSymbol::getDataBytes(SmallVector<uint8_t, 32> &bytes) const { bytes.clear(); } @@ -109,7 +100,7 @@ uint32_t NativeRawSymbol::getClassParentId() const { } std::string NativeRawSymbol::getCompilerName() const { - return 0; + return {}; } uint32_t NativeRawSymbol::getCount() const { @@ -136,7 +127,7 @@ uint32_t NativeRawSymbol::getLexicalParentId() const { } std::string NativeRawSymbol::getLibraryName() const { - return ""; + return {}; } uint32_t NativeRawSymbol::getLiveRangeStartAddressOffset() const { @@ -164,7 +155,7 @@ uint32_t NativeRawSymbol::getMemorySpaceKind() const { } std::string NativeRawSymbol::getName() const { - return 0; + return {}; } uint32_t NativeRawSymbol::getNumberOfAcceleratorPointerTags() const { @@ -188,7 +179,7 @@ uint32_t NativeRawSymbol::getNumberOfRows() const { } std::string NativeRawSymbol::getObjectFileName() const { - return ""; + return {}; } uint32_t NativeRawSymbol::getOemId() const { @@ -240,7 +231,7 @@ uint32_t NativeRawSymbol::getSlot() const { } std::string NativeRawSymbol::getSourceFileName() const { - return 0; + return {}; } uint32_t NativeRawSymbol::getStride() const { @@ -251,7 +242,7 @@ uint32_t NativeRawSymbol::getSubTypeId() const { return 0; } -std::string NativeRawSymbol::getSymbolsFileName() const { return ""; } +std::string NativeRawSymbol::getSymbolsFileName() const { return {}; } uint32_t NativeRawSymbol::getSymIndexId() const { return SymbolId; } @@ -292,7 +283,7 @@ uint32_t NativeRawSymbol::getUavSlot() const { } std::string NativeRawSymbol::getUndecoratedName() const { - return 0; + return {}; } uint32_t NativeRawSymbol::getUnmodifiedTypeId() const { @@ -701,5 +692,5 @@ bool NativeRawSymbol::wasInlined() const { } std::string NativeRawSymbol::getUnused() const { - return ""; + return {}; } diff --git a/lib/DebugInfo/PDB/Native/NativeSession.cpp b/lib/DebugInfo/PDB/Native/NativeSession.cpp index 3ab381e76e62..93d43d9ef341 100644 --- a/lib/DebugInfo/PDB/Native/NativeSession.cpp +++ b/lib/DebugInfo/PDB/Native/NativeSession.cpp @@ -13,6 +13,7 @@ #include "llvm/DebugInfo/PDB/GenericError.h" #include "llvm/DebugInfo/PDB/IPDBEnumChildren.h" #include "llvm/DebugInfo/PDB/IPDBSourceFile.h" +#include "llvm/DebugInfo/PDB/Native/NativeCompilandSymbol.h" #include "llvm/DebugInfo/PDB/Native/NativeExeSymbol.h" #include "llvm/DebugInfo/PDB/Native/PDBFile.h" #include "llvm/DebugInfo/PDB/Native/RawError.h" @@ -23,8 +24,10 @@ #include "llvm/Support/Error.h" #include "llvm/Support/ErrorOr.h" #include "llvm/Support/MemoryBuffer.h" + #include <algorithm> #include <memory> +#include <utility> using namespace llvm; using namespace llvm::msf; @@ -66,12 +69,23 @@ Error NativeSession::createFromExe(StringRef Path, return make_error<RawError>(raw_error_code::feature_unsupported); } +std::unique_ptr<PDBSymbolCompiland> +NativeSession::createCompilandSymbol(DbiModuleDescriptor MI) { + const auto Id = static_cast<uint32_t>(SymbolCache.size()); + SymbolCache.push_back( + llvm::make_unique<NativeCompilandSymbol>(*this, Id, MI)); + return llvm::make_unique<PDBSymbolCompiland>( + *this, std::unique_ptr<IPDBRawSymbol>(SymbolCache[Id]->clone())); +} + uint64_t NativeSession::getLoadAddress() const { return 0; } void NativeSession::setLoadAddress(uint64_t Address) {} std::unique_ptr<PDBSymbolExe> NativeSession::getGlobalScope() { - auto RawSymbol = llvm::make_unique<NativeExeSymbol>(*this, 0); + const auto Id = static_cast<uint32_t>(SymbolCache.size()); + 
SymbolCache.push_back(llvm::make_unique<NativeExeSymbol>(*this, Id)); + auto RawSymbol = SymbolCache[Id]->clone(); auto PdbSymbol(PDBSymbol::create(*this, std::move(RawSymbol))); std::unique_ptr<PDBSymbolExe> ExeSymbol( static_cast<PDBSymbolExe *>(PdbSymbol.release())); @@ -80,7 +94,10 @@ std::unique_ptr<PDBSymbolExe> NativeSession::getGlobalScope() { std::unique_ptr<PDBSymbol> NativeSession::getSymbolById(uint32_t SymbolId) const { - return nullptr; + // If the caller has a SymbolId, it'd better be in our SymbolCache. + return SymbolId < SymbolCache.size() + ? PDBSymbol::create(*this, SymbolCache[SymbolId]->clone()) + : nullptr; } std::unique_ptr<PDBSymbol> diff --git a/lib/DebugInfo/PDB/PDB.cpp b/lib/DebugInfo/PDB/PDB.cpp index 7e3acc1165f3..501d4f5985b7 100644 --- a/lib/DebugInfo/PDB/PDB.cpp +++ b/lib/DebugInfo/PDB/PDB.cpp @@ -1,4 +1,4 @@ -//===- PDB.cpp - base header file for creating a PDB reader -----*- C++ -*-===// +//===- PDB.cpp - base header file for creating a PDB reader ---------------===// // // The LLVM Compiler Infrastructure // @@ -8,18 +8,14 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/PDB/PDB.h" - #include "llvm/ADT/StringRef.h" #include "llvm/Config/config.h" #include "llvm/DebugInfo/PDB/GenericError.h" -#include "llvm/DebugInfo/PDB/IPDBSession.h" -#include "llvm/DebugInfo/PDB/PDB.h" #if LLVM_ENABLE_DIA_SDK #include "llvm/DebugInfo/PDB/DIA/DIASession.h" #endif #include "llvm/DebugInfo/PDB/Native/NativeSession.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/ManagedStatic.h" +#include "llvm/Support/Error.h" using namespace llvm; using namespace llvm::pdb; @@ -33,7 +29,7 @@ Error llvm::pdb::loadDataForPDB(PDB_ReaderType Type, StringRef Path, #if LLVM_ENABLE_DIA_SDK return DIASession::createFromPdb(Path, Session); #else - return llvm::make_error<GenericError>("DIA is not installed on the system"); + return make_error<GenericError>("DIA is not installed on the system"); #endif } @@ -46,6 +42,6 @@ Error llvm::pdb::loadDataForEXE(PDB_ReaderType Type, StringRef Path, #if LLVM_ENABLE_DIA_SDK return DIASession::createFromExe(Path, Session); #else - return llvm::make_error<GenericError>("DIA is not installed on the system"); + return make_error<GenericError>("DIA is not installed on the system"); #endif } diff --git a/lib/DebugInfo/PDB/PDBExtras.cpp b/lib/DebugInfo/PDB/PDBExtras.cpp index dc22a30facab..faf1142ddf17 100644 --- a/lib/DebugInfo/PDB/PDBExtras.cpp +++ b/lib/DebugInfo/PDB/PDBExtras.cpp @@ -1,4 +1,4 @@ -//===- PDBExtras.cpp - helper functions and classes for PDBs -----*- C++-*-===// +//===- PDBExtras.cpp - helper functions and classes for PDBs --------------===// // // The LLVM Compiler Infrastructure // @@ -8,9 +8,9 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/PDB/PDBExtras.h" - #include "llvm/ADT/ArrayRef.h" #include "llvm/DebugInfo/CodeView/Formatters.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; using namespace llvm::pdb; diff --git a/lib/DebugInfo/PDB/UDTLayout.cpp b/lib/DebugInfo/PDB/UDTLayout.cpp index da353cb6977c..5f4390bbaf12 100644 --- a/lib/DebugInfo/PDB/UDTLayout.cpp +++ b/lib/DebugInfo/PDB/UDTLayout.cpp @@ -1,4 +1,4 @@ -//===- UDTLayout.cpp --------------------------------------------*- C++ -*-===// +//===- UDTLayout.cpp ------------------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -8,20 +8,25 @@ 
//===----------------------------------------------------------------------===// #include "llvm/DebugInfo/PDB/UDTLayout.h" - +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/BitVector.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h" #include "llvm/DebugInfo/PDB/IPDBSession.h" #include "llvm/DebugInfo/PDB/PDBSymbol.h" #include "llvm/DebugInfo/PDB/PDBSymbolData.h" -#include "llvm/DebugInfo/PDB/PDBSymbolExe.h" #include "llvm/DebugInfo/PDB/PDBSymbolFunc.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeBaseClass.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypePointer.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h" - -#include <utility> +#include "llvm/DebugInfo/PDB/PDBTypes.h" +#include "llvm/Support/Casting.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <memory> using namespace llvm; using namespace llvm::pdb; @@ -176,7 +181,6 @@ void UDTLayoutBase::initializeChildren(const PDBSymbol &Sym) { else Bases.push_back(std::move(Base)); } - else if (auto Data = unique_dyn_cast<PDBSymbolData>(Child)) { if (Data->getDataKind() == PDB_DataKind::Member) Members.push_back(std::move(Data)); @@ -296,4 +300,4 @@ void UDTLayoutBase::addChildToLayout(std::unique_ptr<LayoutItemBase> Child) { } ChildStorage.push_back(std::move(Child)); -}
\ No newline at end of file +} diff --git a/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h b/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h index b20690c7caaf..690276232a6f 100644 --- a/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h +++ b/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h @@ -193,11 +193,11 @@ public: } auto *MPtr = M.release(); ShouldDelete[MPtr] = true; - auto Deleter = - [this](Module *Mod) { - if (ShouldDelete[Mod]) - delete Mod; - }; + auto Deleter = [this](Module *Mod) { + auto I = ShouldDelete.find(Mod); + if (I != ShouldDelete.end() && I->second) + delete Mod; + }; LocalModules.push_back(std::shared_ptr<Module>(MPtr, std::move(Deleter))); LazyEmitLayer.addModule(LocalModules.back(), &MemMgr, &Resolver); } diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp index 3d12eadea4dd..8b6f9bef66df 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp @@ -78,11 +78,11 @@ public: void updateSymbolAddress(const SymbolRef &SymRef, uint64_t Addr); // Methods for type inquiry through isa, cast and dyn_cast - static inline bool classof(const Binary *v) { + static bool classof(const Binary *v) { return (isa<ELFObjectFile<ELFT>>(v) && classof(cast<ELFObjectFile<ELFT>>(v))); } - static inline bool classof(const ELFObjectFile<ELFT> *v) { + static bool classof(const ELFObjectFile<ELFT> *v) { return v->isDyldType(); } }; diff --git a/lib/IR/Constants.cpp b/lib/IR/Constants.cpp index 27150a89d9b2..d387a6f0ecb9 100644 --- a/lib/IR/Constants.cpp +++ b/lib/IR/Constants.cpp @@ -716,7 +716,7 @@ bool ConstantFP::isExactlyValue(const APFloat &V) const { /// Remove the constant from the constant table. void ConstantFP::destroyConstantImpl() { - llvm_unreachable("You can't ConstantInt->destroyConstantImpl()!"); + llvm_unreachable("You can't ConstantFP->destroyConstantImpl()!"); } //===----------------------------------------------------------------------===// diff --git a/lib/IR/Dominators.cpp b/lib/IR/Dominators.cpp index 37e735251fdf..9bd0e297f4ef 100644 --- a/lib/IR/Dominators.cpp +++ b/lib/IR/Dominators.cpp @@ -63,15 +63,22 @@ bool BasicBlockEdge::isSingleEdge() const { template class llvm::DomTreeNodeBase<BasicBlock>; template class llvm::DominatorTreeBase<BasicBlock>; -template void llvm::Calculate<Function, BasicBlock *>( +template void llvm::DomTreeBuilder::Calculate<Function, BasicBlock *>( DominatorTreeBase< typename std::remove_pointer<GraphTraits<BasicBlock *>::NodeRef>::type> &DT, Function &F); -template void llvm::Calculate<Function, Inverse<BasicBlock *>>( +template void llvm::DomTreeBuilder::Calculate<Function, Inverse<BasicBlock *>>( DominatorTreeBase<typename std::remove_pointer< GraphTraits<Inverse<BasicBlock *>>::NodeRef>::type> &DT, Function &F); +template bool llvm::DomTreeBuilder::Verify<BasicBlock *>( + const DominatorTreeBase< + typename std::remove_pointer<GraphTraits<BasicBlock *>::NodeRef>::type> + &DT); +template bool llvm::DomTreeBuilder::Verify<Inverse<BasicBlock *>>( + const DominatorTreeBase<typename std::remove_pointer< + GraphTraits<Inverse<BasicBlock *>>::NodeRef>::type> &DT); bool DominatorTree::invalidate(Function &F, const PreservedAnalyses &PA, FunctionAnalysisManager::Invalidator &) { @@ -285,6 +292,13 @@ bool DominatorTree::isReachableFromEntry(const Use &U) const { } void DominatorTree::verifyDomTree() const { + // Perform the expensive checks only when VerifyDomInfo is set. 
+ if (VerifyDomInfo && !verify()) { + errs() << "\n~~~~~~~~~~~\n\t\tDomTree verification failed!\n~~~~~~~~~~~\n"; + print(errs()); + abort(); + } + Function &F = *getRoot()->getParent(); DominatorTree OtherDT; diff --git a/lib/IR/LLVMContext.cpp b/lib/IR/LLVMContext.cpp index ad0d4470c111..2e13f362344d 100644 --- a/lib/IR/LLVMContext.cpp +++ b/lib/IR/LLVMContext.cpp @@ -125,11 +125,18 @@ void LLVMContext::setDiagnosticHandler(DiagnosticHandlerTy DiagnosticHandler, pImpl->RespectDiagnosticFilters = RespectFilters; } -void LLVMContext::setDiagnosticHotnessRequested(bool Requested) { - pImpl->DiagnosticHotnessRequested = Requested; +void LLVMContext::setDiagnosticsHotnessRequested(bool Requested) { + pImpl->DiagnosticsHotnessRequested = Requested; } -bool LLVMContext::getDiagnosticHotnessRequested() const { - return pImpl->DiagnosticHotnessRequested; +bool LLVMContext::getDiagnosticsHotnessRequested() const { + return pImpl->DiagnosticsHotnessRequested; +} + +void LLVMContext::setDiagnosticsHotnessThreshold(uint64_t Threshold) { + pImpl->DiagnosticsHotnessThreshold = Threshold; +} +uint64_t LLVMContext::getDiagnosticsHotnessThreshold() const { + return pImpl->DiagnosticsHotnessThreshold; } yaml::Output *LLVMContext::getDiagnosticsOutputFile() { diff --git a/lib/IR/LLVMContextImpl.h b/lib/IR/LLVMContextImpl.h index 4147f71ad9d2..395beb57fe37 100644 --- a/lib/IR/LLVMContextImpl.h +++ b/lib/IR/LLVMContextImpl.h @@ -1169,7 +1169,8 @@ public: LLVMContext::DiagnosticHandlerTy DiagnosticHandler = nullptr; void *DiagnosticContext = nullptr; bool RespectDiagnosticFilters = false; - bool DiagnosticHotnessRequested = false; + bool DiagnosticsHotnessRequested = false; + uint64_t DiagnosticsHotnessThreshold = 0; std::unique_ptr<yaml::Output> DiagnosticsOutputFile; LLVMContext::YieldCallbackTy YieldCallback = nullptr; diff --git a/lib/LTO/LTO.cpp b/lib/LTO/LTO.cpp index 35032fdd33e1..68b8c9fcb939 100644 --- a/lib/LTO/LTO.cpp +++ b/lib/LTO/LTO.cpp @@ -472,6 +472,36 @@ Error LTO::addModule(InputFile &Input, unsigned ModI, return Error::success(); } +// Checks whether the given global value is in a non-prevailing comdat +// (comdat containing values the linker indicated were not prevailing, +// which we then dropped to available_externally), and if so, removes +// it from the comdat. This is called for all global values to ensure the +// comdat is empty rather than leaving an incomplete comdat. It is needed for +// regular LTO modules, in case we are in a mixed-LTO mode (both regular +// and thin LTO modules) compilation. Since the regular LTO module will be +// linked first in the final native link, we want to make sure the linker +// doesn't select any of these incomplete comdats that would be left +// in the regular LTO module without this cleanup. +static void +handleNonPrevailingComdat(GlobalValue &GV, + std::set<const Comdat *> &NonPrevailingComdats) { + Comdat *C = GV.getComdat(); + if (!C) + return; + + if (!NonPrevailingComdats.count(C)) + return; + + // Additionally need to drop externally visible global values from the comdat + // to available_externally, so that there aren't multiply defined linker + // errors. + if (!GV.hasLocalLinkage()) + GV.setLinkage(GlobalValue::AvailableExternallyLinkage); + + if (auto GO = dyn_cast<GlobalObject>(&GV)) + GO->setComdat(nullptr); +} + // Add a regular LTO object to the link. // The resulting module needs to be linked into the combined LTO module with // linkRegularLTO. 
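[Editor's note, not part of the patch] The LTO.cpp hunk above introduces handleNonPrevailingComdat(), which strips every member out of a comdat whose prevailing copy lives in another module and downgrades externally visible members to available_externally, so the regular-LTO module cannot end up providing an incomplete comdat at the final native link; the following hunk then invokes it over M.global_values(). A minimal standalone sketch of that pattern is shown below. It is illustrative only and not code from this commit: the main() driver, the demo symbol "foo", and the hand-built NonPrevailing set are assumptions made for the example, while the helper mirrors the logic of the patched function.

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Comdat.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/raw_ostream.h"
#include <set>

using namespace llvm;

// Drop GV out of its comdat if that comdat did not prevail, mirroring the
// handleNonPrevailingComdat() helper added in the LTO.cpp hunk (simplified).
static void dropFromNonPrevailingComdat(GlobalValue &GV,
                                        const std::set<const Comdat *> &NonPrevailing) {
  Comdat *C = GV.getComdat();
  if (!C || !NonPrevailing.count(C))
    return;
  // Externally visible members become available_externally so they cannot
  // produce multiply-defined symbols in the final link.
  if (!GV.hasLocalLinkage())
    GV.setLinkage(GlobalValue::AvailableExternallyLinkage);
  // Remove the member from the comdat so no incomplete comdat remains.
  if (auto *GO = dyn_cast<GlobalObject>(&GV))
    GO->setComdat(nullptr);
}

int main() {
  LLVMContext Ctx;
  Module M("comdat-demo", Ctx);

  // A weak_odr function placed in a comdat, as an inline function would be.
  Function *F = Function::Create(FunctionType::get(Type::getVoidTy(Ctx), false),
                                 GlobalValue::WeakODRLinkage, "foo", &M);
  Comdat *C = M.getOrInsertComdat("foo");
  C->setSelectionKind(Comdat::Any);
  F->setComdat(C);
  IRBuilder<>(BasicBlock::Create(Ctx, "entry", F)).CreateRetVoid();

  // Pretend the linker reported this comdat as non-prevailing.
  std::set<const Comdat *> NonPrevailing = {C};
  for (GlobalValue &GV : M.global_values())
    dropFromNonPrevailingComdat(GV, NonPrevailing);

  // "foo" prints as available_externally with no comdat attached.
  M.print(outs(), nullptr);
  return 0;
}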
@@ -523,6 +553,7 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, }; Skip(); + std::set<const Comdat *> NonPrevailingComdats; for (const InputFile::Symbol &Sym : Syms) { assert(ResI != ResE); SymbolResolution Res = *ResI++; @@ -557,6 +588,8 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, // module (in linkRegularLTO), based on whether it is undefined. Mod.Keep.push_back(GV); GV->setLinkage(GlobalValue::AvailableExternallyLinkage); + if (GV->hasComdat()) + NonPrevailingComdats.insert(GV->getComdat()); cast<GlobalObject>(GV)->setComdat(nullptr); } } @@ -574,6 +607,9 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms, // FIXME: use proposed local attribute for FinalDefinitionInLinkageUnit. } + if (!M.getComdatSymbolTable().empty()) + for (GlobalValue &GV : M.global_values()) + handleNonPrevailingComdat(GV, NonPrevailingComdats); assert(MsymI == MsymE); return std::move(Mod); } @@ -1087,7 +1123,7 @@ lto::setupOptimizationRemarks(LLVMContext &Context, Context.setDiagnosticsOutputFile( llvm::make_unique<yaml::Output>(DiagnosticFile->os())); if (LTOPassRemarksWithHotness) - Context.setDiagnosticHotnessRequested(true); + Context.setDiagnosticsHotnessRequested(true); DiagnosticFile->keep(); return std::move(DiagnosticFile); } diff --git a/lib/MC/MCAssembler.cpp b/lib/MC/MCAssembler.cpp index 92c5da0e9fef..0318d916aa49 100644 --- a/lib/MC/MCAssembler.cpp +++ b/lib/MC/MCAssembler.cpp @@ -261,9 +261,9 @@ bool MCAssembler::evaluateFixup(const MCAsmLayout &Layout, Value -= Offset; } - // Let the backend adjust the fixup value if necessary, including whether - // we need a relocation. - Backend.processFixupValue(*this, Fixup, Target, IsResolved); + // Let the backend force a relocation if needed. 
+ if (IsResolved && Backend.shouldForceRelocation(*this, Fixup, Target)) + IsResolved = false; return IsResolved; } diff --git a/lib/MC/WasmObjectWriter.cpp b/lib/MC/WasmObjectWriter.cpp index 45534ba18212..82352cb50c70 100644 --- a/lib/MC/WasmObjectWriter.cpp +++ b/lib/MC/WasmObjectWriter.cpp @@ -265,7 +265,8 @@ private: uint32_t NumFuncImports); void writeCodeRelocSection(); void writeDataRelocSection(uint64_t DataSectionHeaderSize); - void writeLinkingMetaDataSection(ArrayRef<StringRef> WeakSymbols, + void writeLinkingMetaDataSection(uint32_t DataSize, uint32_t DataAlignment, + ArrayRef<StringRef> WeakSymbols, bool HasStackPointer, uint32_t StackPointerGlobal); @@ -877,11 +878,8 @@ void WasmObjectWriter::writeDataRelocSection(uint64_t DataSectionHeaderSize) { } void WasmObjectWriter::writeLinkingMetaDataSection( - ArrayRef<StringRef> WeakSymbols, bool HasStackPointer, - uint32_t StackPointerGlobal) { - if (!HasStackPointer && WeakSymbols.empty()) - return; - + uint32_t DataSize, uint32_t DataAlignment, ArrayRef<StringRef> WeakSymbols, + bool HasStackPointer, uint32_t StackPointerGlobal) { SectionBookkeeping Section; startSection(Section, wasm::WASM_SEC_CUSTOM, "linking"); SectionBookkeeping SubSection; @@ -902,6 +900,16 @@ void WasmObjectWriter::writeLinkingMetaDataSection( endSection(SubSection); } + if (DataSize > 0) { + startSection(SubSection, wasm::WASM_DATA_SIZE); + encodeULEB128(DataSize, getStream()); + endSection(SubSection); + + startSection(SubSection, wasm::WASM_DATA_ALIGNMENT); + encodeULEB128(DataAlignment, getStream()); + endSection(SubSection); + } + endSection(Section); } @@ -923,6 +931,7 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm, unsigned NumFuncImports = 0; unsigned NumGlobalImports = 0; SmallVector<char, 0> DataBytes; + uint32_t DataAlignment = 1; uint32_t StackPointerGlobal = 0; bool HasStackPointer = false; @@ -1157,6 +1166,7 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm, report_fatal_error("data sections must contain at most one variable"); DataBytes.resize(alignTo(DataBytes.size(), DataSection.getAlignment())); + DataAlignment = std::max(DataAlignment, DataSection.getAlignment()); DataSection.setSectionOffset(DataBytes.size()); @@ -1272,7 +1282,7 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm, writeNameSection(Functions, Imports, NumFuncImports); writeCodeRelocSection(); writeDataRelocSection(DataSectionHeaderSize); - writeLinkingMetaDataSection(WeakSymbols, HasStackPointer, StackPointerGlobal); + writeLinkingMetaDataSection(DataBytes.size(), DataAlignment, WeakSymbols, HasStackPointer, StackPointerGlobal); // TODO: Translate the .comment section to the output. // TODO: Translate debug sections to the output. diff --git a/lib/Object/CMakeLists.txt b/lib/Object/CMakeLists.txt index 1d08a9efd8b3..fd5e7707c541 100644 --- a/lib/Object/CMakeLists.txt +++ b/lib/Object/CMakeLists.txt @@ -27,4 +27,5 @@ add_llvm_library(LLVMObject DEPENDS intrinsics_gen + llvm_vcsrevision_h ) diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp index 9a760d86e7e2..1e9b0c5b0454 100644 --- a/lib/Object/COFFObjectFile.cpp +++ b/lib/Object/COFFObjectFile.cpp @@ -883,7 +883,7 @@ base_reloc_iterator COFFObjectFile::base_reloc_end() const { } uint8_t COFFObjectFile::getBytesInAddress() const { - return getArch() == Triple::x86_64 ? 8 : 4; + return getArch() == Triple::x86_64 || getArch() == Triple::aarch64 ? 
8 : 4; } StringRef COFFObjectFile::getFileFormatName() const { @@ -1216,6 +1216,29 @@ void COFFObjectFile::getRelocationTypeName( Res = "Unknown"; } break; + case COFF::IMAGE_FILE_MACHINE_ARM64: + switch (Reloc->Type) { + LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_ABSOLUTE); + LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_ADDR32); + LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_ADDR32NB); + LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_BRANCH26); + LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_PAGEBASE_REL21); + LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_REL21); + LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_PAGEOFFSET_12A); + LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_PAGEOFFSET_12L); + LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_SECREL); + LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_SECREL_LOW12A); + LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_SECREL_HIGH12A); + LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_SECREL_LOW12L); + LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_TOKEN); + LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_SECTION); + LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_ADDR64); + LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_BRANCH19); + LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_BRANCH14); + default: + Res = "Unknown"; + } + break; case COFF::IMAGE_FILE_MACHINE_I386: switch (Reloc->Type) { LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_I386_ABSOLUTE); diff --git a/lib/Object/IRSymtab.cpp b/lib/Object/IRSymtab.cpp index 7bca032a7be1..7a6424a76a98 100644 --- a/lib/Object/IRSymtab.cpp +++ b/lib/Object/IRSymtab.cpp @@ -32,6 +32,7 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/Error.h" #include "llvm/Support/StringSaver.h" +#include "llvm/Support/VCSRevision.h" #include "llvm/Support/raw_ostream.h" #include <cassert> #include <string> @@ -43,6 +44,21 @@ using namespace irsymtab; namespace { +const char *getExpectedProducerName() { + static char DefaultName[] = LLVM_VERSION_STRING +#ifdef LLVM_REVISION + " " LLVM_REVISION +#endif + ; + // Allows for testing of the irsymtab writer and upgrade mechanism. This + // environment variable should not be set by users. + if (char *OverrideName = getenv("LLVM_OVERRIDE_PRODUCER")) + return OverrideName; + return DefaultName; +} + +const char *kExpectedProducerName = getExpectedProducerName(); + /// Stores the temporary state that is required to build an IR symbol table. struct Builder { SmallVector<char, 0> &Symtab; @@ -231,6 +247,8 @@ Error Builder::build(ArrayRef<Module *> IRMods) { storage::Header Hdr; assert(!IRMods.empty()); + Hdr.Version = storage::Header::kCurrentVersion; + setStr(Hdr.Producer, kExpectedProducerName); setStr(Hdr.TargetTriple, IRMods[0]->getTargetTriple()); setStr(Hdr.SourceFileName, IRMods[0]->getSourceFileName()); TT = Triple(IRMods[0]->getTargetTriple()); @@ -300,7 +318,31 @@ Expected<FileContents> irsymtab::readBitcode(const BitcodeFileContents &BFC) { return make_error<StringError>("Bitcode file does not contain any modules", inconvertibleErrorCode()); - // Right now we have no on-disk representation of symbol tables, so we always - // upgrade. - return upgrade(BFC.Mods); + if (BFC.StrtabForSymtab.empty() || + BFC.Symtab.size() < sizeof(storage::Header)) + return upgrade(BFC.Mods); + + // We cannot use the regular reader to read the version and producer, because + // it will expect the header to be in the current format. 
The only thing we + // can rely on is that the version and producer will be present as the first + // struct elements. + auto *Hdr = reinterpret_cast<const storage::Header *>(BFC.Symtab.data()); + unsigned Version = Hdr->Version; + StringRef Producer = Hdr->Producer.get(BFC.StrtabForSymtab); + if (Version != storage::Header::kCurrentVersion || + Producer != kExpectedProducerName) + return upgrade(BFC.Mods); + + FileContents FC; + FC.TheReader = {{BFC.Symtab.data(), BFC.Symtab.size()}, + {BFC.StrtabForSymtab.data(), BFC.StrtabForSymtab.size()}}; + + // Finally, make sure that the number of modules in the symbol table matches + // the number of modules in the bitcode file. If they differ, it may mean that + // the bitcode file was created by binary concatenation, so we need to create + // a new symbol table from scratch. + if (FC.TheReader.getNumModules() != BFC.Mods.size()) + return upgrade(std::move(BFC.Mods)); + + return std::move(FC); } diff --git a/lib/Object/WasmObjectFile.cpp b/lib/Object/WasmObjectFile.cpp index d15860674aeb..fff497ba5564 100644 --- a/lib/Object/WasmObjectFile.cpp +++ b/lib/Object/WasmObjectFile.cpp @@ -193,6 +193,9 @@ static Error readSection(WasmSection &Section, const uint8_t *&Ptr, WasmObjectFile::WasmObjectFile(MemoryBufferRef Buffer, Error &Err) : ObjectFile(Binary::ID_Wasm, Buffer) { + LinkingData.DataAlignment = 0; + LinkingData.DataSize = 0; + ErrorAsOutParameter ErrAsOutParam(&Err); Header.Magic = getData().substr(0, 4); if (Header.Magic != StringRef("\0asm", 4)) { @@ -291,6 +294,7 @@ Error WasmObjectFile::parseNameSection(const uint8_t *Ptr, const uint8_t *End) { Error WasmObjectFile::parseLinkingSection(const uint8_t *Ptr, const uint8_t *End) { + HasLinkingSection = true; while (Ptr < End) { uint8_t Type = readVarint7(Ptr); uint32_t Size = readVaruint32(Ptr); @@ -305,7 +309,7 @@ Error WasmObjectFile::parseLinkingSection(const uint8_t *Ptr, auto iter = SymbolMap.find(Symbol); if (iter == SymbolMap.end()) { return make_error<GenericBinaryError>( - "Invalid symbol name in linking section", + "Invalid symbol name in linking section: " + Symbol, object_error::parse_failed); } uint32_t SymIndex = iter->second; @@ -318,6 +322,12 @@ Error WasmObjectFile::parseLinkingSection(const uint8_t *Ptr, } break; } + case wasm::WASM_DATA_SIZE: + LinkingData.DataSize = readVaruint32(Ptr); + break; + case wasm::WASM_DATA_ALIGNMENT: + LinkingData.DataAlignment = readVaruint32(Ptr); + break; case wasm::WASM_STACK_POINTER: default: Ptr += Size; @@ -941,7 +951,9 @@ SubtargetFeatures WasmObjectFile::getFeatures() const { return SubtargetFeatures(); } -bool WasmObjectFile::isRelocatableObject() const { return false; } +bool WasmObjectFile::isRelocatableObject() const { + return HasLinkingSection; +} const WasmSection &WasmObjectFile::getWasmSection(DataRefImpl Ref) const { assert(Ref.d.a < Sections.size()); diff --git a/lib/Object/WindowsResource.cpp b/lib/Object/WindowsResource.cpp index ff9b9ca35eb5..1371eacdf8f2 100644 --- a/lib/Object/WindowsResource.cpp +++ b/lib/Object/WindowsResource.cpp @@ -563,7 +563,7 @@ void WindowsResourceCOFFWriter::writeSymbolTable() { Symbol = reinterpret_cast<coff_symbol16 *>(BufferStart + CurrentOffset); strncpy(Symbol->Name.ShortName, RelocationName, (size_t)COFF::NameSize); Symbol->Value = DataOffsets[i]; - Symbol->SectionNumber = 1; + Symbol->SectionNumber = 2; Symbol->Type = COFF::IMAGE_SYM_DTYPE_NULL; Symbol->StorageClass = COFF::IMAGE_SYM_CLASS_STATIC; Symbol->NumberOfAuxSymbols = 0; diff --git a/lib/ObjectYAML/COFFYAML.cpp 
b/lib/ObjectYAML/COFFYAML.cpp index c8cbea1490f6..1103159fc98d 100644 --- a/lib/ObjectYAML/COFFYAML.cpp +++ b/lib/ObjectYAML/COFFYAML.cpp @@ -12,17 +12,25 @@ //===----------------------------------------------------------------------===// #include "llvm/ObjectYAML/COFFYAML.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/YAMLTraits.h" +#include <cstdint> +#include <cstring> #define ECase(X) IO.enumCase(Value, #X, COFF::X); + namespace llvm { namespace COFFYAML { + Section::Section() { memset(&Header, 0, sizeof(COFF::section)); } Symbol::Symbol() { memset(&Header, 0, sizeof(COFF::symbol)); } Object::Object() { memset(&Header, 0, sizeof(COFF::header)); } -} + +} // end namespace COFFYAML namespace yaml { + void ScalarEnumerationTraits<COFFYAML::COMDATType>::enumeration( IO &IO, COFFYAML::COMDATType &Value) { IO.enumCase(Value, "0", 0); @@ -172,20 +180,20 @@ void ScalarEnumerationTraits<COFF::RelocationTypeAMD64>::enumeration( void ScalarEnumerationTraits<COFF::WindowsSubsystem>::enumeration( IO &IO, COFF::WindowsSubsystem &Value) { - ECase(IMAGE_SUBSYSTEM_UNKNOWN); - ECase(IMAGE_SUBSYSTEM_NATIVE); - ECase(IMAGE_SUBSYSTEM_WINDOWS_GUI); - ECase(IMAGE_SUBSYSTEM_WINDOWS_CUI); - ECase(IMAGE_SUBSYSTEM_OS2_CUI); - ECase(IMAGE_SUBSYSTEM_POSIX_CUI); - ECase(IMAGE_SUBSYSTEM_NATIVE_WINDOWS); - ECase(IMAGE_SUBSYSTEM_WINDOWS_CE_GUI); - ECase(IMAGE_SUBSYSTEM_EFI_APPLICATION); - ECase(IMAGE_SUBSYSTEM_EFI_BOOT_SERVICE_DRIVER); - ECase(IMAGE_SUBSYSTEM_EFI_RUNTIME_DRIVER); - ECase(IMAGE_SUBSYSTEM_EFI_ROM); - ECase(IMAGE_SUBSYSTEM_XBOX); - ECase(IMAGE_SUBSYSTEM_WINDOWS_BOOT_APPLICATION); + ECase(IMAGE_SUBSYSTEM_UNKNOWN); + ECase(IMAGE_SUBSYSTEM_NATIVE); + ECase(IMAGE_SUBSYSTEM_WINDOWS_GUI); + ECase(IMAGE_SUBSYSTEM_WINDOWS_CUI); + ECase(IMAGE_SUBSYSTEM_OS2_CUI); + ECase(IMAGE_SUBSYSTEM_POSIX_CUI); + ECase(IMAGE_SUBSYSTEM_NATIVE_WINDOWS); + ECase(IMAGE_SUBSYSTEM_WINDOWS_CE_GUI); + ECase(IMAGE_SUBSYSTEM_EFI_APPLICATION); + ECase(IMAGE_SUBSYSTEM_EFI_BOOT_SERVICE_DRIVER); + ECase(IMAGE_SUBSYSTEM_EFI_RUNTIME_DRIVER); + ECase(IMAGE_SUBSYSTEM_EFI_ROM); + ECase(IMAGE_SUBSYSTEM_XBOX); + ECase(IMAGE_SUBSYSTEM_WINDOWS_BOOT_APPLICATION); } #undef ECase @@ -252,12 +260,15 @@ void ScalarBitSetTraits<COFF::DLLCharacteristics>::bitset( #undef BCase namespace { + struct NSectionSelectionType { NSectionSelectionType(IO &) : SelectionType(COFFYAML::COMDATType(0)) {} NSectionSelectionType(IO &, uint8_t C) : SelectionType(COFFYAML::COMDATType(C)) {} + uint8_t denormalize(IO &) { return SelectionType; } + COFFYAML::COMDATType SelectionType; }; @@ -266,7 +277,9 @@ struct NWeakExternalCharacteristics { : Characteristics(COFFYAML::WeakExternalCharacteristics(0)) {} NWeakExternalCharacteristics(IO &, uint32_t C) : Characteristics(COFFYAML::WeakExternalCharacteristics(C)) {} + uint32_t denormalize(IO &) { return Characteristics; } + COFFYAML::WeakExternalCharacteristics Characteristics; }; @@ -275,7 +288,9 @@ struct NSectionCharacteristics { : Characteristics(COFF::SectionCharacteristics(0)) {} NSectionCharacteristics(IO &, uint32_t C) : Characteristics(COFF::SectionCharacteristics(C)) {} + uint32_t denormalize(IO &) { return Characteristics; } + COFF::SectionCharacteristics Characteristics; }; @@ -284,13 +299,16 @@ struct NAuxTokenType { : AuxType(COFFYAML::AuxSymbolType(0)) {} NAuxTokenType(IO &, uint8_t C) : AuxType(COFFYAML::AuxSymbolType(C)) {} + uint32_t denormalize(IO &) { return AuxType; } + COFFYAML::AuxSymbolType AuxType; }; struct NStorageClass { NStorageClass(IO &) : StorageClass(COFF::SymbolStorageClass(0)) {} 
NStorageClass(IO &, uint8_t S) : StorageClass(COFF::SymbolStorageClass(S)) {} + uint8_t denormalize(IO &) { return StorageClass; } COFF::SymbolStorageClass StorageClass; @@ -299,7 +317,9 @@ struct NStorageClass { struct NMachine { NMachine(IO &) : Machine(COFF::MachineTypes(0)) {} NMachine(IO &, uint16_t M) : Machine(COFF::MachineTypes(M)) {} + uint16_t denormalize(IO &) { return Machine; } + COFF::MachineTypes Machine; }; @@ -307,6 +327,7 @@ struct NHeaderCharacteristics { NHeaderCharacteristics(IO &) : Characteristics(COFF::Characteristics(0)) {} NHeaderCharacteristics(IO &, uint16_t C) : Characteristics(COFF::Characteristics(C)) {} + uint16_t denormalize(IO &) { return Characteristics; } COFF::Characteristics Characteristics; @@ -316,13 +337,16 @@ template <typename RelocType> struct NType { NType(IO &) : Type(RelocType(0)) {} NType(IO &, uint16_t T) : Type(RelocType(T)) {} + uint16_t denormalize(IO &) { return Type; } + RelocType Type; }; struct NWindowsSubsystem { NWindowsSubsystem(IO &) : Subsystem(COFF::WindowsSubsystem(0)) {} NWindowsSubsystem(IO &, uint16_t C) : Subsystem(COFF::WindowsSubsystem(C)) {} + uint16_t denormalize(IO &) { return Subsystem; } COFF::WindowsSubsystem Subsystem; @@ -332,12 +356,13 @@ struct NDLLCharacteristics { NDLLCharacteristics(IO &) : Characteristics(COFF::DLLCharacteristics(0)) {} NDLLCharacteristics(IO &, uint16_t C) : Characteristics(COFF::DLLCharacteristics(C)) {} + uint16_t denormalize(IO &) { return Characteristics; } COFF::DLLCharacteristics Characteristics; }; -} +} // end anonymous namespace void MappingTraits<COFFYAML::Relocation>::mapping(IO &IO, COFFYAML::Relocation &Rel) { @@ -509,5 +534,6 @@ void MappingTraits<COFFYAML::Object>::mapping(IO &IO, COFFYAML::Object &Obj) { IO.mapRequired("symbols", Obj.Symbols); } -} -} +} // end namespace yaml + +} // end namespace llvm diff --git a/lib/ObjectYAML/CodeViewYAMLDebugSections.cpp b/lib/ObjectYAML/CodeViewYAMLDebugSections.cpp index d194420d5ef4..60b0ea28030a 100644 --- a/lib/ObjectYAML/CodeViewYAMLDebugSections.cpp +++ b/lib/ObjectYAML/CodeViewYAMLDebugSections.cpp @@ -13,9 +13,11 @@ //===----------------------------------------------------------------------===// #include "llvm/ObjectYAML/CodeViewYAMLDebugSections.h" - +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" -#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/BinaryFormat/COFF.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/CodeViewError.h" #include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h" #include "llvm/DebugInfo/CodeView/DebugCrossExSubsection.h" @@ -24,15 +26,29 @@ #include "llvm/DebugInfo/CodeView/DebugInlineeLinesSubsection.h" #include "llvm/DebugInfo/CodeView/DebugLinesSubsection.h" #include "llvm/DebugInfo/CodeView/DebugStringTableSubsection.h" +#include "llvm/DebugInfo/CodeView/DebugSubsection.h" #include "llvm/DebugInfo/CodeView/DebugSubsectionVisitor.h" #include "llvm/DebugInfo/CodeView/DebugSymbolRVASubsection.h" #include "llvm/DebugInfo/CodeView/DebugSymbolsSubsection.h" -#include "llvm/DebugInfo/CodeView/EnumTables.h" +#include "llvm/DebugInfo/CodeView/Line.h" #include "llvm/DebugInfo/CodeView/StringsAndChecksums.h" -#include "llvm/DebugInfo/CodeView/SymbolRecord.h" -#include "llvm/DebugInfo/CodeView/SymbolSerializer.h" +#include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/ObjectYAML/CodeViewYAMLSymbols.h" -#include "llvm/Support/BinaryStreamWriter.h" +#include "llvm/Support/Allocator.h" +#include 
"llvm/Support/BinaryStreamReader.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/YAMLTraits.h" +#include "llvm/Support/raw_ostream.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <memory> +#include <string> +#include <tuple> +#include <vector> + using namespace llvm; using namespace llvm::codeview; using namespace llvm::CodeViewYAML; @@ -48,9 +64,7 @@ LLVM_YAML_IS_SEQUENCE_VECTOR(InlineeSite) LLVM_YAML_IS_SEQUENCE_VECTOR(InlineeInfo) LLVM_YAML_IS_SEQUENCE_VECTOR(CrossModuleExport) LLVM_YAML_IS_SEQUENCE_VECTOR(YAMLCrossModuleImport) -LLVM_YAML_IS_SEQUENCE_VECTOR(StringRef) LLVM_YAML_IS_SEQUENCE_VECTOR(YAMLFrameData) -LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(uint32_t) LLVM_YAML_DECLARE_SCALAR_TRAITS(HexFormattedString, false) LLVM_YAML_DECLARE_ENUM_TRAITS(DebugSubsectionKind) @@ -70,21 +84,25 @@ LLVM_YAML_DECLARE_MAPPING_TRAITS(InlineeSite) namespace llvm { namespace CodeViewYAML { namespace detail { + struct YAMLSubsectionBase { explicit YAMLSubsectionBase(DebugSubsectionKind Kind) : Kind(Kind) {} - DebugSubsectionKind Kind; - virtual ~YAMLSubsectionBase() {} + virtual ~YAMLSubsectionBase() = default; virtual void map(IO &IO) = 0; virtual std::shared_ptr<DebugSubsection> toCodeViewSubsection(BumpPtrAllocator &Allocator, const codeview::StringsAndChecksums &SC) const = 0; + + DebugSubsectionKind Kind; }; -} -} -} + +} // end namespace detail +} // end namespace CodeViewYAML +} // end namespace llvm namespace { + struct YAMLChecksumsSubsection : public YAMLSubsectionBase { YAMLChecksumsSubsection() : YAMLSubsectionBase(DebugSubsectionKind::FileChecksums) {} @@ -215,7 +233,8 @@ struct YAMLCoffSymbolRVASubsection : public YAMLSubsectionBase { std::vector<uint32_t> RVAs; }; -} + +} // end anonymous namespace void ScalarBitSetTraits<LineFlags>::bitset(IO &io, LineFlags &Flags) { io.bitSetCase(Flags, "HasColumnInfo", LF_HaveColumns); @@ -743,8 +762,9 @@ llvm::CodeViewYAML::toCodeViewSubsectionList( } namespace { + struct SubsectionConversionVisitor : public DebugSubsectionVisitor { - SubsectionConversionVisitor() {} + SubsectionConversionVisitor() = default; Error visitUnknown(DebugUnknownSubsectionRef &Unknown) override; Error visitLines(DebugLinesSubsectionRef &Lines, @@ -769,6 +789,8 @@ struct SubsectionConversionVisitor : public DebugSubsectionVisitor { YAMLDebugSubsection Subsection; }; +} // end anonymous namespace + Error SubsectionConversionVisitor::visitUnknown( DebugUnknownSubsectionRef &Unknown) { return make_error<CodeViewError>(cv_error_code::operation_unsupported); @@ -865,7 +887,6 @@ Error SubsectionConversionVisitor::visitCOFFSymbolRVAs( Subsection.Subsection = *Result; return Error::success(); } -} Expected<YAMLDebugSubsection> YAMLDebugSubsection::fromCodeViewSubection(const StringsAndChecksumsRef &SC, diff --git a/lib/ObjectYAML/CodeViewYAMLSymbols.cpp b/lib/ObjectYAML/CodeViewYAMLSymbols.cpp index 83f3d55b8e55..dbe4e2a6d6fd 100644 --- a/lib/ObjectYAML/CodeViewYAMLSymbols.cpp +++ b/lib/ObjectYAML/CodeViewYAMLSymbols.cpp @@ -13,13 +13,25 @@ //===----------------------------------------------------------------------===// #include "llvm/ObjectYAML/CodeViewYAMLSymbols.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/ADT/StringSwitch.h" -#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/CodeViewError.h" #include 
"llvm/DebugInfo/CodeView/EnumTables.h" +#include "llvm/DebugInfo/CodeView/RecordSerialization.h" #include "llvm/DebugInfo/CodeView/SymbolDeserializer.h" +#include "llvm/DebugInfo/CodeView/SymbolRecord.h" #include "llvm/DebugInfo/CodeView/SymbolSerializer.h" +#include "llvm/DebugInfo/CodeView/TypeIndex.h" +#include "llvm/ObjectYAML/YAML.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/YAMLTraits.h" +#include <algorithm> +#include <cstdint> +#include <cstring> +#include <string> +#include <vector> using namespace llvm; using namespace llvm::codeview; @@ -27,7 +39,6 @@ using namespace llvm::CodeViewYAML; using namespace llvm::CodeViewYAML::detail; using namespace llvm::yaml; -LLVM_YAML_IS_SEQUENCE_VECTOR(StringRef) LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(TypeIndex) // We only need to declare these, the definitions are in CodeViewYAMLTypes.cpp @@ -49,15 +60,16 @@ LLVM_YAML_DECLARE_ENUM_TRAITS(RegisterId) LLVM_YAML_DECLARE_ENUM_TRAITS(TrampolineType) LLVM_YAML_DECLARE_ENUM_TRAITS(ThunkOrdinal) -LLVM_YAML_STRONG_TYPEDEF(llvm::StringRef, TypeName) +LLVM_YAML_STRONG_TYPEDEF(StringRef, TypeName) LLVM_YAML_DECLARE_SCALAR_TRAITS(TypeName, true) StringRef ScalarTraits<TypeName>::input(StringRef S, void *V, TypeName &T) { return ScalarTraits<StringRef>::input(S, V, T.value); } + void ScalarTraits<TypeName>::output(const TypeName &T, void *V, - llvm::raw_ostream &R) { + raw_ostream &R) { ScalarTraits<StringRef>::output(T.value, V, R); } @@ -174,9 +186,10 @@ namespace detail { struct SymbolRecordBase { codeview::SymbolKind Kind; + explicit SymbolRecordBase(codeview::SymbolKind K) : Kind(K) {} + virtual ~SymbolRecordBase() = default; - virtual ~SymbolRecordBase() {} virtual void map(yaml::IO &io) = 0; virtual codeview::CVSymbol toCodeViewSymbol(BumpPtrAllocator &Allocator, @@ -195,6 +208,7 @@ template <typename T> struct SymbolRecordImpl : public SymbolRecordBase { CodeViewContainer Container) const override { return SymbolSerializer::writeOneSymbol(Symbol, Allocator, Container); } + Error fromCodeViewSymbol(codeview::CVSymbol CVS) override { return SymbolDeserializer::deserializeAs<T>(CVS, Symbol); } @@ -218,6 +232,7 @@ struct UnknownSymbolRecord : public SymbolRecordBase { ::memcpy(Buffer + sizeof(RecordPrefix), Data.data(), Data.size()); return CVSymbol(Kind, ArrayRef<uint8_t>(Buffer, TotalLen)); } + Error fromCodeViewSymbol(CVSymbol CVS) override { this->Kind = CVS.kind(); Data = CVS.RecordData.drop_front(sizeof(RecordPrefix)); @@ -497,9 +512,10 @@ template <> void SymbolRecordImpl<ThreadLocalDataSym>::map(IO &IO) { IO.mapOptional("Segment", Symbol.Segment, uint16_t(0)); IO.mapRequired("DisplayName", Symbol.Name); } -} -} -} + +} // end namespace detail +} // end namespace CodeViewYAML +} // end namespace llvm CVSymbol CodeViewYAML::SymbolRecord::toCodeViewSymbol( BumpPtrAllocator &Allocator, CodeViewContainer Container) const { @@ -508,11 +524,13 @@ CVSymbol CodeViewYAML::SymbolRecord::toCodeViewSymbol( namespace llvm { namespace yaml { + template <> struct MappingTraits<SymbolRecordBase> { static void mapping(IO &io, SymbolRecordBase &Record) { Record.map(io); } }; -} -} + +} // end namespace yaml +} // end namespace llvm template <typename SymbolType> static inline Expected<CodeViewYAML::SymbolRecord> diff --git a/lib/ObjectYAML/CodeViewYAMLTypes.cpp b/lib/ObjectYAML/CodeViewYAMLTypes.cpp index 2d1cb4b1b27b..0b2ea61c5fe0 100644 --- a/lib/ObjectYAML/CodeViewYAMLTypes.cpp +++ b/lib/ObjectYAML/CodeViewYAMLTypes.cpp @@ -13,14 +13,29 @@ 
//===----------------------------------------------------------------------===// #include "llvm/ObjectYAML/CodeViewYAMLTypes.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/APSInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/BinaryFormat/COFF.h" #include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/CodeViewError.h" -#include "llvm/DebugInfo/CodeView/EnumTables.h" #include "llvm/DebugInfo/CodeView/TypeDeserializer.h" +#include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/DebugInfo/CodeView/TypeTableBuilder.h" +#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/BinaryStreamReader.h" #include "llvm/Support/BinaryStreamWriter.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/YAMLTraits.h" +#include "llvm/Support/raw_ostream.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <vector> using namespace llvm; using namespace llvm::codeview; @@ -29,7 +44,6 @@ using namespace llvm::CodeViewYAML::detail; using namespace llvm::yaml; LLVM_YAML_IS_SEQUENCE_VECTOR(OneMethodRecord) -LLVM_YAML_IS_SEQUENCE_VECTOR(StringRef) LLVM_YAML_IS_SEQUENCE_VECTOR(VFTableSlotKind) LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(TypeIndex) @@ -63,9 +77,10 @@ namespace detail { struct LeafRecordBase { TypeLeafKind Kind; + explicit LeafRecordBase(TypeLeafKind K) : Kind(K) {} + virtual ~LeafRecordBase() = default; - virtual ~LeafRecordBase() {} virtual void map(yaml::IO &io) = 0; virtual CVType toCodeViewRecord(TypeTableBuilder &TTB) const = 0; virtual Error fromCodeViewRecord(CVType Type) = 0; @@ -101,9 +116,10 @@ template <> struct LeafRecordImpl<FieldListRecord> : public LeafRecordBase { struct MemberRecordBase { TypeLeafKind Kind; + explicit MemberRecordBase(TypeLeafKind K) : Kind(K) {} + virtual ~MemberRecordBase() = default; - virtual ~MemberRecordBase() {} virtual void map(yaml::IO &io) = 0; virtual void writeTo(FieldListRecordBuilder &FLRB) = 0; }; @@ -111,6 +127,7 @@ struct MemberRecordBase { template <typename T> struct MemberRecordImpl : public MemberRecordBase { explicit MemberRecordImpl(TypeLeafKind K) : MemberRecordBase(K), Record(static_cast<TypeRecordKind>(K)) {} + void map(yaml::IO &io) override; void writeTo(FieldListRecordBuilder &FLRB) override { @@ -119,12 +136,13 @@ template <typename T> struct MemberRecordImpl : public MemberRecordBase { mutable T Record; }; -} -} -} + +} // end namespace detail +} // end namespace CodeViewYAML +} // end namespace llvm void ScalarTraits<TypeIndex>::output(const TypeIndex &S, void *, - llvm::raw_ostream &OS) { + raw_ostream &OS) { OS << S.getIndex(); } @@ -136,8 +154,7 @@ StringRef ScalarTraits<TypeIndex>::input(StringRef Scalar, void *Ctx, return Result; } -void ScalarTraits<APSInt>::output(const APSInt &S, void *, - llvm::raw_ostream &OS) { +void ScalarTraits<APSInt>::output(const APSInt &S, void *, raw_ostream &OS) { S.print(OS, S.isSigned()); } @@ -346,6 +363,7 @@ void MappingTraits<MemberPointerInfo>::mapping(IO &IO, MemberPointerInfo &MPI) { namespace llvm { namespace CodeViewYAML { namespace detail { + template <> void LeafRecordImpl<ModifierRecord>::map(IO &IO) { IO.mapRequired("ModifiedType", Record.ModifiedType); IO.mapRequired("Modifiers", Record.Modifiers); @@ -404,11 +422,13 @@ template <> void 
LeafRecordImpl<ArrayRecord>::map(IO &IO) { void LeafRecordImpl<FieldListRecord>::map(IO &IO) { IO.mapRequired("FieldList", Members); } -} -} -} + +} // end namespace detail +} // end namespace CodeViewYAML +} // end namespace llvm namespace { + class MemberRecordConversionVisitor : public TypeVisitorCallbacks { public: explicit MemberRecordConversionVisitor(std::vector<MemberRecord> &Records) @@ -433,7 +453,8 @@ private: std::vector<MemberRecord> &Records; }; -} + +} // end anonymous namespace Error LeafRecordImpl<FieldListRecord>::fromCodeViewRecord(CVType Type) { MemberRecordConversionVisitor V(Members); @@ -461,13 +482,13 @@ void MappingTraits<OneMethodRecord>::mapping(IO &io, OneMethodRecord &Record) { namespace llvm { namespace CodeViewYAML { namespace detail { + template <> void LeafRecordImpl<ClassRecord>::map(IO &IO) { IO.mapRequired("MemberCount", Record.MemberCount); IO.mapRequired("Options", Record.Options); IO.mapRequired("FieldList", Record.FieldList); IO.mapRequired("Name", Record.Name); IO.mapRequired("UniqueName", Record.UniqueName); - IO.mapRequired("DerivationList", Record.DerivationList); IO.mapRequired("VTableShape", Record.VTableShape); IO.mapRequired("Size", Record.Size); @@ -479,7 +500,6 @@ template <> void LeafRecordImpl<UnionRecord>::map(IO &IO) { IO.mapRequired("FieldList", Record.FieldList); IO.mapRequired("Name", Record.Name); IO.mapRequired("UniqueName", Record.UniqueName); - IO.mapRequired("Size", Record.Size); } @@ -489,7 +509,6 @@ template <> void LeafRecordImpl<EnumRecord>::map(IO &IO) { IO.mapRequired("FieldList", Record.FieldList); IO.mapRequired("Name", Record.Name); IO.mapRequired("UniqueName", Record.UniqueName); - IO.mapRequired("UnderlyingType", Record.UnderlyingType); } @@ -603,9 +622,10 @@ template <> void MemberRecordImpl<VirtualBaseClassRecord>::map(IO &IO) { template <> void MemberRecordImpl<ListContinuationRecord>::map(IO &IO) { IO.mapRequired("ContinuationIndex", Record.ContinuationIndex); } -} -} -} + +} // end namespace detail +} // end namespace CodeViewYAML +} // end namespace llvm template <typename T> static inline Expected<LeafRecord> fromCodeViewRecordImpl(CVType Type) { @@ -628,7 +648,8 @@ Expected<LeafRecord> LeafRecord::fromCodeViewRecord(CVType Type) { #define MEMBER_RECORD_ALIAS(EnumName, EnumVal, AliasName, ClassName) switch (Type.kind()) { #include "llvm/DebugInfo/CodeView/CodeViewTypes.def" - default: { llvm_unreachable("Unknown leaf kind!"); } + default: + llvm_unreachable("Unknown leaf kind!"); } return make_error<CodeViewError>(cv_error_code::corrupt_record); } @@ -644,6 +665,7 @@ CVType LeafRecord::toCodeViewRecord(TypeTableBuilder &TTB) const { namespace llvm { namespace yaml { + template <> struct MappingTraits<LeafRecordBase> { static void mapping(IO &io, LeafRecordBase &Record) { Record.map(io); } }; @@ -651,8 +673,9 @@ template <> struct MappingTraits<LeafRecordBase> { template <> struct MappingTraits<MemberRecordBase> { static void mapping(IO &io, MemberRecordBase &Record) { Record.map(io); } }; -} -} + +} // end namespace yaml +} // end namespace llvm template <typename ConcreteType> static void mapLeafRecordImpl(IO &IO, const char *Class, TypeLeafKind Kind, diff --git a/lib/ObjectYAML/DWARFEmitter.cpp b/lib/ObjectYAML/DWARFEmitter.cpp index 91c928771a65..89fc652035ca 100644 --- a/lib/ObjectYAML/DWARFEmitter.cpp +++ b/lib/ObjectYAML/DWARFEmitter.cpp @@ -13,15 +13,25 @@ //===----------------------------------------------------------------------===// #include "llvm/ObjectYAML/DWARFEmitter.h" +#include "DWARFVisitor.h" 
+#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" #include "llvm/ObjectYAML/DWARFYAML.h" #include "llvm/Support/Error.h" +#include "llvm/Support/Host.h" #include "llvm/Support/LEB128.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/SwapByteOrder.h" +#include "llvm/Support/YAMLTraits.h" #include "llvm/Support/raw_ostream.h" - -#include "DWARFVisitor.h" - #include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <memory> +#include <string> +#include <vector> using namespace llvm; @@ -127,7 +137,7 @@ class DumpVisitor : public DWARFYAML::ConstVisitor { raw_ostream &OS; protected: - virtual void onStartCompileUnit(const DWARFYAML::Unit &CU) { + void onStartCompileUnit(const DWARFYAML::Unit &CU) override { writeInitialLength(CU.Length, OS, DebugInfo.IsLittleEndian); writeInteger((uint16_t)CU.Version, OS, DebugInfo.IsLittleEndian); if(CU.Version >= 5) { @@ -141,41 +151,43 @@ protected: } - virtual void onStartDIE(const DWARFYAML::Unit &CU, - const DWARFYAML::Entry &DIE) { + void onStartDIE(const DWARFYAML::Unit &CU, + const DWARFYAML::Entry &DIE) override { encodeULEB128(DIE.AbbrCode, OS); } - virtual void onValue(const uint8_t U) { + void onValue(const uint8_t U) override { writeInteger(U, OS, DebugInfo.IsLittleEndian); } - virtual void onValue(const uint16_t U) { + void onValue(const uint16_t U) override { writeInteger(U, OS, DebugInfo.IsLittleEndian); } - virtual void onValue(const uint32_t U) { + + void onValue(const uint32_t U) override { writeInteger(U, OS, DebugInfo.IsLittleEndian); } - virtual void onValue(const uint64_t U, const bool LEB = false) { + + void onValue(const uint64_t U, const bool LEB = false) override { if (LEB) encodeULEB128(U, OS); else writeInteger(U, OS, DebugInfo.IsLittleEndian); } - virtual void onValue(const int64_t S, const bool LEB = false) { + void onValue(const int64_t S, const bool LEB = false) override { if (LEB) encodeSLEB128(S, OS); else writeInteger(S, OS, DebugInfo.IsLittleEndian); } - virtual void onValue(const StringRef String) { + void onValue(const StringRef String) override { OS.write(String.data(), String.size()); OS.write('\0'); } - virtual void onValue(const MemoryBufferRef MBR) { + void onValue(const MemoryBufferRef MBR) override { OS.write(MBR.getBufferStart(), MBR.getBufferSize()); } @@ -280,7 +292,7 @@ void DWARFYAML::EmitDebugLine(raw_ostream &OS, const DWARFYAML::Data &DI) { } } -typedef void (*EmitFuncType)(raw_ostream &, const DWARFYAML::Data &); +using EmitFuncType = void (*)(raw_ostream &, const DWARFYAML::Data &); static void EmitDebugSectionImpl(const DWARFYAML::Data &DI, EmitFuncType EmitFunc, diff --git a/lib/ObjectYAML/DWARFYAML.cpp b/lib/ObjectYAML/DWARFYAML.cpp index edb9545f14b1..d6c09e1a35d7 100644 --- a/lib/ObjectYAML/DWARFYAML.cpp +++ b/lib/ObjectYAML/DWARFYAML.cpp @@ -171,6 +171,6 @@ void MappingTraits<DWARFYAML::InitialLength>::mapping( IO.mapRequired("TotalLength64", InitialLength.TotalLength64); } -} // namespace llvm::yaml +} // end namespace yaml -} // namespace llvm +} // end namespace llvm diff --git a/lib/ObjectYAML/ELFYAML.cpp b/lib/ObjectYAML/ELFYAML.cpp index dbd5498e003d..39741dab327a 100644 --- a/lib/ObjectYAML/ELFYAML.cpp +++ b/lib/ObjectYAML/ELFYAML.cpp @@ -12,12 +12,18 @@ //===----------------------------------------------------------------------===// #include "llvm/ObjectYAML/ELFYAML.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/BinaryFormat/ELF.h" #include "llvm/Support/Casting.h" +#include 
"llvm/Support/ErrorHandling.h" #include "llvm/Support/MipsABIFlags.h" +#include "llvm/Support/YAMLTraits.h" +#include <cassert> +#include <cstdint> namespace llvm { -ELFYAML::Section::~Section() {} +ELFYAML::Section::~Section() = default; namespace yaml { @@ -542,6 +548,7 @@ void ScalarEnumerationTraits<ELFYAML::ELF_REL>::enumeration( llvm_unreachable("Unsupported architecture"); } #undef ELF_RELOC + IO.enumFallback<Hex32>(Value); } void ScalarEnumerationTraits<ELFYAML::MIPS_AFL_REG>::enumeration( @@ -643,6 +650,7 @@ void MappingTraits<ELFYAML::FileHeader>::mapping(IO &IO, } namespace { + struct NormalizedOther { NormalizedOther(IO &) : Visibility(ELFYAML::ELF_STV(0)), Other(ELFYAML::ELF_STO(0)) {} @@ -654,7 +662,8 @@ struct NormalizedOther { ELFYAML::ELF_STV Visibility; ELFYAML::ELF_STO Other; }; -} + +} // end anonymous namespace void MappingTraits<ELFYAML::Symbol>::mapping(IO &IO, ELFYAML::Symbol &Symbol) { IO.mapOptional("Name", Symbol.Name, StringRef()); @@ -777,6 +786,7 @@ StringRef MappingTraits<std::unique_ptr<ELFYAML::Section>>::validate( } namespace { + struct NormalizedMips64RelType { NormalizedMips64RelType(IO &) : Type(ELFYAML::ELF_REL(ELF::R_MIPS_NONE)), @@ -797,7 +807,8 @@ struct NormalizedMips64RelType { ELFYAML::ELF_REL Type3; ELFYAML::ELF_RSS SpecSym; }; -} + +} // end anonymous namespace void MappingTraits<ELFYAML::Relocation>::mapping(IO &IO, ELFYAML::Relocation &Rel) { @@ -838,4 +849,5 @@ LLVM_YAML_STRONG_TYPEDEF(uint32_t, MIPS_AFL_ASE) LLVM_YAML_STRONG_TYPEDEF(uint32_t, MIPS_AFL_FLAGS1) } // end namespace yaml + } // end namespace llvm diff --git a/lib/ObjectYAML/MachOYAML.cpp b/lib/ObjectYAML/MachOYAML.cpp index 461684827872..ab452a7bf6ef 100644 --- a/lib/ObjectYAML/MachOYAML.cpp +++ b/lib/ObjectYAML/MachOYAML.cpp @@ -12,16 +12,19 @@ //===----------------------------------------------------------------------===// #include "llvm/ObjectYAML/MachOYAML.h" +#include "llvm/ADT/StringRef.h" #include "llvm/BinaryFormat/MachO.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/Format.h" #include "llvm/Support/Host.h" - -#include <string.h> // For memcpy, memset and strnlen. 
+#include "llvm/Support/YAMLTraits.h" +#include "llvm/Support/raw_ostream.h" +#include <cinttypes> +#include <cstdint> +#include <cstring> namespace llvm { -MachOYAML::LoadCommand::~LoadCommand() {} +MachOYAML::LoadCommand::~LoadCommand() = default; bool MachOYAML::LinkEditData::isEmpty() const { return 0 == @@ -33,7 +36,7 @@ bool MachOYAML::LinkEditData::isEmpty() const { namespace yaml { void ScalarTraits<char_16>::output(const char_16 &Val, void *, - llvm::raw_ostream &Out) { + raw_ostream &Out) { auto Len = strnlen(&Val[0], 16); Out << StringRef(&Val[0], Len); } @@ -51,8 +54,7 @@ StringRef ScalarTraits<char_16>::input(StringRef Scalar, void *, char_16 &Val) { bool ScalarTraits<char_16>::mustQuote(StringRef S) { return needsQuotes(S); } -void ScalarTraits<uuid_t>::output(const uuid_t &Val, void *, - llvm::raw_ostream &Out) { +void ScalarTraits<uuid_t>::output(const uuid_t &Val, void *, raw_ostream &Out) { for (int Idx = 0; Idx < 16; ++Idx) { Out << format("%02" PRIX32, Val[Idx]); if (Idx == 3 || Idx == 5 || Idx == 7 || Idx == 9) @@ -154,7 +156,7 @@ void MappingTraits<MachOYAML::LinkEditData>::mapping( IO.mapOptional("BindOpcodes", LinkEditData.BindOpcodes); IO.mapOptional("WeakBindOpcodes", LinkEditData.WeakBindOpcodes); IO.mapOptional("LazyBindOpcodes", LinkEditData.LazyBindOpcodes); - if(LinkEditData.ExportTrie.Children.size() > 0 || !IO.outputting()) + if (!LinkEditData.ExportTrie.Children.empty() || !IO.outputting()) IO.mapOptional("ExportTrie", LinkEditData.ExportTrie); IO.mapOptional("NameList", LinkEditData.NameList); IO.mapOptional("StringTable", LinkEditData.StringTable); @@ -308,13 +310,11 @@ void MappingTraits<MachO::dylib_command>::mapping( void MappingTraits<MachO::dylinker_command>::mapping( IO &IO, MachO::dylinker_command &LoadCommand) { - IO.mapRequired("name", LoadCommand.name); } void MappingTraits<MachO::dysymtab_command>::mapping( IO &IO, MachO::dysymtab_command &LoadCommand) { - IO.mapRequired("ilocalsym", LoadCommand.ilocalsym); IO.mapRequired("nlocalsym", LoadCommand.nlocalsym); IO.mapRequired("iextdefsym", LoadCommand.iextdefsym); @@ -337,7 +337,6 @@ void MappingTraits<MachO::dysymtab_command>::mapping( void MappingTraits<MachO::encryption_info_command>::mapping( IO &IO, MachO::encryption_info_command &LoadCommand) { - IO.mapRequired("cryptoff", LoadCommand.cryptoff); IO.mapRequired("cryptsize", LoadCommand.cryptsize); IO.mapRequired("cryptid", LoadCommand.cryptid); @@ -345,7 +344,6 @@ void MappingTraits<MachO::encryption_info_command>::mapping( void MappingTraits<MachO::encryption_info_command_64>::mapping( IO &IO, MachO::encryption_info_command_64 &LoadCommand) { - IO.mapRequired("cryptoff", LoadCommand.cryptoff); IO.mapRequired("cryptsize", LoadCommand.cryptsize); IO.mapRequired("cryptid", LoadCommand.cryptid); @@ -354,14 +352,12 @@ void MappingTraits<MachO::encryption_info_command_64>::mapping( void MappingTraits<MachO::entry_point_command>::mapping( IO &IO, MachO::entry_point_command &LoadCommand) { - IO.mapRequired("entryoff", LoadCommand.entryoff); IO.mapRequired("stacksize", LoadCommand.stacksize); } void MappingTraits<MachO::fvmfile_command>::mapping( IO &IO, MachO::fvmfile_command &LoadCommand) { - IO.mapRequired("name", LoadCommand.name); IO.mapRequired("header_addr", LoadCommand.header_addr); } @@ -374,7 +370,6 @@ void MappingTraits<MachO::fvmlib>::mapping(IO &IO, MachO::fvmlib &FVMLib) { void MappingTraits<MachO::fvmlib_command>::mapping( IO &IO, MachO::fvmlib_command &LoadCommand) { - IO.mapRequired("fvmlib", LoadCommand.fvmlib); } @@ -383,20 
+378,17 @@ void MappingTraits<MachO::ident_command>::mapping( void MappingTraits<MachO::linkedit_data_command>::mapping( IO &IO, MachO::linkedit_data_command &LoadCommand) { - IO.mapRequired("dataoff", LoadCommand.dataoff); IO.mapRequired("datasize", LoadCommand.datasize); } void MappingTraits<MachO::linker_option_command>::mapping( IO &IO, MachO::linker_option_command &LoadCommand) { - IO.mapRequired("count", LoadCommand.count); } void MappingTraits<MachO::prebind_cksum_command>::mapping( IO &IO, MachO::prebind_cksum_command &LoadCommand) { - IO.mapRequired("cksum", LoadCommand.cksum); } @@ -405,7 +397,6 @@ void MappingTraits<MachO::load_command>::mapping( void MappingTraits<MachO::prebound_dylib_command>::mapping( IO &IO, MachO::prebound_dylib_command &LoadCommand) { - IO.mapRequired("name", LoadCommand.name); IO.mapRequired("nmodules", LoadCommand.nmodules); IO.mapRequired("linked_modules", LoadCommand.linked_modules); @@ -413,7 +404,6 @@ void MappingTraits<MachO::prebound_dylib_command>::mapping( void MappingTraits<MachO::routines_command>::mapping( IO &IO, MachO::routines_command &LoadCommand) { - IO.mapRequired("init_address", LoadCommand.init_address); IO.mapRequired("init_module", LoadCommand.init_module); IO.mapRequired("reserved1", LoadCommand.reserved1); @@ -426,7 +416,6 @@ void MappingTraits<MachO::routines_command>::mapping( void MappingTraits<MachO::routines_command_64>::mapping( IO &IO, MachO::routines_command_64 &LoadCommand) { - IO.mapRequired("init_address", LoadCommand.init_address); IO.mapRequired("init_module", LoadCommand.init_module); IO.mapRequired("reserved1", LoadCommand.reserved1); @@ -439,7 +428,6 @@ void MappingTraits<MachO::routines_command_64>::mapping( void MappingTraits<MachO::rpath_command>::mapping( IO &IO, MachO::rpath_command &LoadCommand) { - IO.mapRequired("path", LoadCommand.path); } @@ -475,7 +463,6 @@ void MappingTraits<MachO::section_64>::mapping(IO &IO, void MappingTraits<MachO::segment_command>::mapping( IO &IO, MachO::segment_command &LoadCommand) { - IO.mapRequired("segname", LoadCommand.segname); IO.mapRequired("vmaddr", LoadCommand.vmaddr); IO.mapRequired("vmsize", LoadCommand.vmsize); @@ -489,7 +476,6 @@ void MappingTraits<MachO::segment_command>::mapping( void MappingTraits<MachO::segment_command_64>::mapping( IO &IO, MachO::segment_command_64 &LoadCommand) { - IO.mapRequired("segname", LoadCommand.segname); IO.mapRequired("vmaddr", LoadCommand.vmaddr); IO.mapRequired("vmsize", LoadCommand.vmsize); @@ -503,44 +489,37 @@ void MappingTraits<MachO::segment_command_64>::mapping( void MappingTraits<MachO::source_version_command>::mapping( IO &IO, MachO::source_version_command &LoadCommand) { - IO.mapRequired("version", LoadCommand.version); } void MappingTraits<MachO::sub_client_command>::mapping( IO &IO, MachO::sub_client_command &LoadCommand) { - IO.mapRequired("client", LoadCommand.client); } void MappingTraits<MachO::sub_framework_command>::mapping( IO &IO, MachO::sub_framework_command &LoadCommand) { - IO.mapRequired("umbrella", LoadCommand.umbrella); } void MappingTraits<MachO::sub_library_command>::mapping( IO &IO, MachO::sub_library_command &LoadCommand) { - IO.mapRequired("sub_library", LoadCommand.sub_library); } void MappingTraits<MachO::sub_umbrella_command>::mapping( IO &IO, MachO::sub_umbrella_command &LoadCommand) { - IO.mapRequired("sub_umbrella", LoadCommand.sub_umbrella); } void MappingTraits<MachO::symseg_command>::mapping( IO &IO, MachO::symseg_command &LoadCommand) { - IO.mapRequired("offset", LoadCommand.offset); 
IO.mapRequired("size", LoadCommand.size); } void MappingTraits<MachO::symtab_command>::mapping( IO &IO, MachO::symtab_command &LoadCommand) { - IO.mapRequired("symoff", LoadCommand.symoff); IO.mapRequired("nsyms", LoadCommand.nsyms); IO.mapRequired("stroff", LoadCommand.stroff); @@ -552,27 +531,23 @@ void MappingTraits<MachO::thread_command>::mapping( void MappingTraits<MachO::twolevel_hints_command>::mapping( IO &IO, MachO::twolevel_hints_command &LoadCommand) { - IO.mapRequired("offset", LoadCommand.offset); IO.mapRequired("nhints", LoadCommand.nhints); } void MappingTraits<MachO::uuid_command>::mapping( IO &IO, MachO::uuid_command &LoadCommand) { - IO.mapRequired("uuid", LoadCommand.uuid); } void MappingTraits<MachO::version_min_command>::mapping( IO &IO, MachO::version_min_command &LoadCommand) { - IO.mapRequired("version", LoadCommand.version); IO.mapRequired("sdk", LoadCommand.sdk); } void MappingTraits<MachO::note_command>::mapping( IO &IO, MachO::note_command &LoadCommand) { - IO.mapRequired("data_owner", LoadCommand.data_owner); IO.mapRequired("offset", LoadCommand.offset); IO.mapRequired("size", LoadCommand.size); @@ -580,13 +555,12 @@ void MappingTraits<MachO::note_command>::mapping( void MappingTraits<MachO::build_version_command>::mapping( IO &IO, MachO::build_version_command &LoadCommand) { - IO.mapRequired("platform", LoadCommand.platform); IO.mapRequired("minos", LoadCommand.minos); IO.mapRequired("sdk", LoadCommand.sdk); IO.mapRequired("ntools", LoadCommand.ntools); } -} // namespace llvm::yaml +} // end namespace yaml -} // namespace llvm +} // end namespace llvm diff --git a/lib/ObjectYAML/ObjectYAML.cpp b/lib/ObjectYAML/ObjectYAML.cpp index 4b7154ebb7c1..850c1a5a06c0 100644 --- a/lib/ObjectYAML/ObjectYAML.cpp +++ b/lib/ObjectYAML/ObjectYAML.cpp @@ -12,7 +12,10 @@ //===----------------------------------------------------------------------===// #include "llvm/ObjectYAML/ObjectYAML.h" -#include "llvm/ObjectYAML/YAML.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Support/YAMLParser.h" +#include "llvm/Support/YAMLTraits.h" +#include <string> using namespace llvm; using namespace yaml; @@ -53,8 +56,8 @@ void MappingTraits<YamlObjectFile>::mapping(IO &IO, IO.setError("YAML Object File missing document type tag!"); else IO.setError( - llvm::Twine("YAML Object File unsupported document type tag '") + - llvm::Twine(Tag) + llvm::Twine("'!")); + Twine("YAML Object File unsupported document type tag '") + + Twine(Tag) + Twine("'!")); } } } diff --git a/lib/ObjectYAML/WasmYAML.cpp b/lib/ObjectYAML/WasmYAML.cpp index 65703c6cf683..2040efdc9d11 100644 --- a/lib/ObjectYAML/WasmYAML.cpp +++ b/lib/ObjectYAML/WasmYAML.cpp @@ -12,9 +12,10 @@ //===----------------------------------------------------------------------===// #include "llvm/ObjectYAML/WasmYAML.h" -#include "llvm/Object/Wasm.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Support/Casting.h" -#include "llvm/Support/MipsABIFlags.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/YAMLTraits.h" namespace llvm { @@ -22,7 +23,7 @@ namespace WasmYAML { // Declared here rather than in the header to comply with: // http://llvm.org/docs/CodingStandards.html#provide-a-virtual-method-anchor-for-classes-in-headers -Section::~Section() {} +Section::~Section() = default; } // end namespace WasmYAML @@ -56,6 +57,8 @@ static void sectionMapping(IO &IO, WasmYAML::NameSection &Section) { static void sectionMapping(IO &IO, WasmYAML::LinkingSection &Section) { commonSectionMapping(IO, Section); IO.mapRequired("Name", 
Section.Name); + IO.mapRequired("DataSize", Section.DataSize); + IO.mapRequired("DataAlignment", Section.DataAlignment); IO.mapRequired("SymbolInfo", Section.SymbolInfos); } @@ -403,4 +406,5 @@ void ScalarEnumerationTraits<WasmYAML::RelocType>::enumeration( } } // end namespace yaml + } // end namespace llvm diff --git a/lib/ObjectYAML/YAML.cpp b/lib/ObjectYAML/YAML.cpp index 75cf1fbccc80..67b5764eadaa 100644 --- a/lib/ObjectYAML/YAML.cpp +++ b/lib/ObjectYAML/YAML.cpp @@ -16,11 +16,12 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/Support/raw_ostream.h" #include <cctype> +#include <cstdint> using namespace llvm; void yaml::ScalarTraits<yaml::BinaryRef>::output( - const yaml::BinaryRef &Val, void *, llvm::raw_ostream &Out) { + const yaml::BinaryRef &Val, void *, raw_ostream &Out) { Val.writeAsHex(Out); } @@ -34,7 +35,7 @@ StringRef yaml::ScalarTraits<yaml::BinaryRef>::input(StringRef Scalar, void *, if (!isxdigit(Scalar[I])) return "BinaryRef hex string must contain only hex digits."; Val = yaml::BinaryRef(Scalar); - return StringRef(); + return {}; } void yaml::BinaryRef::writeAsBinary(raw_ostream &OS) const { diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp index 78d5ea955e64..0380bd991d71 100644 --- a/lib/Passes/PassBuilder.cpp +++ b/lib/Passes/PassBuilder.cpp @@ -161,8 +161,8 @@ static cl::opt<bool> cl::desc("Run NewGVN instead of GVN")); static cl::opt<bool> EnableEarlyCSEMemSSA( - "enable-npm-earlycse-memssa", cl::init(false), cl::Hidden, - cl::desc("Enable the EarlyCSE w/ MemorySSA pass for the new PM (default = off)")); + "enable-npm-earlycse-memssa", cl::init(true), cl::Hidden, + cl::desc("Enable the EarlyCSE w/ MemorySSA pass for the new PM (default = on)")); static cl::opt<bool> EnableGVNHoist( "enable-npm-gvn-hoist", cl::init(false), cl::Hidden, @@ -480,6 +480,14 @@ static void addPGOInstrPasses(ModulePassManager &MPM, bool DebugLogging, MPM.addPass(PGOInstrumentationUse(ProfileUseFile)); } +static InlineParams +getInlineParamsFromOptLevel(PassBuilder::OptimizationLevel Level) { + auto O3 = PassBuilder::O3; + unsigned OptLevel = Level > O3 ? 2 : Level; + unsigned SizeLevel = Level > O3 ? Level - O3 : 0; + return getInlineParams(OptLevel, SizeLevel); +} + ModulePassManager PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, bool DebugLogging) { @@ -527,13 +535,17 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, // Add all the requested passes for PGO, if requested. if (PGOOpt) { - assert(PGOOpt->RunProfileGen || PGOOpt->SamplePGO || + assert(PGOOpt->RunProfileGen || !PGOOpt->SampleProfileFile.empty() || !PGOOpt->ProfileUseFile.empty()); - addPGOInstrPasses(MPM, DebugLogging, Level, PGOOpt->RunProfileGen, - PGOOpt->ProfileGenFile, PGOOpt->ProfileUseFile); + if (PGOOpt->SampleProfileFile.empty()) + addPGOInstrPasses(MPM, DebugLogging, Level, PGOOpt->RunProfileGen, + PGOOpt->ProfileGenFile, PGOOpt->ProfileUseFile); + else + MPM.addPass(SampleProfileLoaderPass(PGOOpt->SampleProfileFile)); // Indirect call promotion that promotes intra-module targes only. - MPM.addPass(PGOIndirectCallPromotion(false, PGOOpt && PGOOpt->SamplePGO)); + MPM.addPass(PGOIndirectCallPromotion( + false, PGOOpt && !PGOOpt->SampleProfileFile.empty())); } // Require the GlobalsAA analysis for the module so we can query it within @@ -558,8 +570,7 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, // Run the inliner first. 
The theory is that we are walking bottom-up and so // the callees have already been fully optimized, and we want to inline them // into the callers so that our optimizations can reflect that. - // FIXME; Customize the threshold based on optimization level. - MainCGPipeline.addPass(InlinerPass()); + MainCGPipeline.addPass(InlinerPass(getInlineParamsFromOptLevel(Level))); // Now deduce any function attributes based in the current code. MainCGPipeline.addPass(PostOrderFunctionAttrsPass()); @@ -751,9 +762,6 @@ PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level, // Reduce the size of the IR as much as possible. MPM.addPass(GlobalOptPass()); - // Rename anon globals to be able to export them in the summary. - MPM.addPass(NameAnonGlobalPass()); - return MPM; } @@ -772,9 +780,9 @@ PassBuilder::buildThinLTODefaultPipeline(OptimizationLevel Level, // During the ThinLTO backend phase we perform early indirect call promotion // here, before globalopt. Otherwise imported available_externally functions // look unreferenced and are removed. - MPM.addPass(PGOIndirectCallPromotion(true /* InLTO */, - PGOOpt && PGOOpt->SamplePGO && - !PGOOpt->ProfileUseFile.empty())); + MPM.addPass(PGOIndirectCallPromotion( + true /* InLTO */, PGOOpt && !PGOOpt->SampleProfileFile.empty() && + !PGOOpt->ProfileUseFile.empty())); // Add the core simplification pipeline. MPM.addPass(buildModuleSimplificationPipeline(Level, DebugLogging)); @@ -814,8 +822,8 @@ ModulePassManager PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, // left by the earlier promotion pass that promotes intra-module targets. // This two-step promotion is to save the compile time. For LTO, it should // produce the same result as if we only do promotion here. - MPM.addPass(PGOIndirectCallPromotion(true /* InLTO */, - PGOOpt && PGOOpt->SamplePGO)); + MPM.addPass(PGOIndirectCallPromotion( + true /* InLTO */, PGOOpt && !PGOOpt->SampleProfileFile.empty())); // Propagate constants at call sites into the functions they call. This // opens opportunities for globalopt (and inlining) by substituting function @@ -868,7 +876,8 @@ ModulePassManager PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, // valuable as the inliner doesn't currently care whether it is inlining an // invoke or a call. // Run the inliner now. - MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(InlinerPass())); + MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor( + InlinerPass(getInlineParamsFromOptLevel(Level)))); // Optimize globals again after we ran the inliner. MPM.addPass(GlobalOptPass()); diff --git a/lib/ProfileData/Coverage/CoverageMapping.cpp b/lib/ProfileData/Coverage/CoverageMapping.cpp index 4534e086b39e..8c5f136ea270 100644 --- a/lib/ProfileData/Coverage/CoverageMapping.cpp +++ b/lib/ProfileData/Coverage/CoverageMapping.cpp @@ -54,26 +54,26 @@ Counter CounterExpressionBuilder::get(const CounterExpression &E) { return Counter::getExpression(I); } -void CounterExpressionBuilder::extractTerms( - Counter C, int Sign, SmallVectorImpl<std::pair<unsigned, int>> &Terms) { +void CounterExpressionBuilder::extractTerms(Counter C, int Factor, + SmallVectorImpl<Term> &Terms) { switch (C.getKind()) { case Counter::Zero: break; case Counter::CounterValueReference: - Terms.push_back(std::make_pair(C.getCounterID(), Sign)); + Terms.emplace_back(C.getCounterID(), Factor); break; case Counter::Expression: const auto &E = Expressions[C.getExpressionID()]; - extractTerms(E.LHS, Sign, Terms); - extractTerms(E.RHS, E.Kind == CounterExpression::Subtract ? 
-Sign : Sign, - Terms); + extractTerms(E.LHS, Factor, Terms); + extractTerms( + E.RHS, E.Kind == CounterExpression::Subtract ? -Factor : Factor, Terms); break; } } Counter CounterExpressionBuilder::simplify(Counter ExpressionTree) { // Gather constant terms. - SmallVector<std::pair<unsigned, int>, 32> Terms; + SmallVector<Term, 32> Terms; extractTerms(ExpressionTree, +1, Terms); // If there are no terms, this is just a zero. The algorithm below assumes at @@ -82,17 +82,15 @@ Counter CounterExpressionBuilder::simplify(Counter ExpressionTree) { return Counter::getZero(); // Group the terms by counter ID. - std::sort(Terms.begin(), Terms.end(), - [](const std::pair<unsigned, int> &LHS, - const std::pair<unsigned, int> &RHS) { - return LHS.first < RHS.first; + std::sort(Terms.begin(), Terms.end(), [](const Term &LHS, const Term &RHS) { + return LHS.CounterID < RHS.CounterID; }); // Combine terms by counter ID to eliminate counters that sum to zero. auto Prev = Terms.begin(); for (auto I = Prev + 1, E = Terms.end(); I != E; ++I) { - if (I->first == Prev->first) { - Prev->second += I->second; + if (I->CounterID == Prev->CounterID) { + Prev->Factor += I->Factor; continue; } ++Prev; @@ -103,24 +101,24 @@ Counter CounterExpressionBuilder::simplify(Counter ExpressionTree) { Counter C; // Create additions. We do this before subtractions to avoid constructs like // ((0 - X) + Y), as opposed to (Y - X). - for (auto Term : Terms) { - if (Term.second <= 0) + for (auto T : Terms) { + if (T.Factor <= 0) continue; - for (int I = 0; I < Term.second; ++I) + for (int I = 0; I < T.Factor; ++I) if (C.isZero()) - C = Counter::getCounter(Term.first); + C = Counter::getCounter(T.CounterID); else C = get(CounterExpression(CounterExpression::Add, C, - Counter::getCounter(Term.first))); + Counter::getCounter(T.CounterID))); } // Create subtractions. 
- for (auto Term : Terms) { - if (Term.second >= 0) + for (auto T : Terms) { + if (T.Factor >= 0) continue; - for (int I = 0; I < -Term.second; ++I) + for (int I = 0; I < -T.Factor; ++I) C = get(CounterExpression(CounterExpression::Subtract, C, - Counter::getCounter(Term.first))); + Counter::getCounter(T.CounterID))); } return C; } @@ -247,18 +245,6 @@ Error CoverageMapping::loadFunctionRecord( return Error::success(); } -Expected<std::unique_ptr<CoverageMapping>> -CoverageMapping::load(CoverageMappingReader &CoverageReader, - IndexedInstrProfReader &ProfileReader) { - auto Coverage = std::unique_ptr<CoverageMapping>(new CoverageMapping()); - - for (const auto &Record : CoverageReader) - if (Error E = Coverage->loadFunctionRecord(Record, ProfileReader)) - return std::move(E); - - return std::move(Coverage); -} - Expected<std::unique_ptr<CoverageMapping>> CoverageMapping::load( ArrayRef<std::unique_ptr<CoverageMappingReader>> CoverageReaders, IndexedInstrProfReader &ProfileReader) { diff --git a/lib/ProfileData/InstrProf.cpp b/lib/ProfileData/InstrProf.cpp index 005061c4f068..a1d18724fcd5 100644 --- a/lib/ProfileData/InstrProf.cpp +++ b/lib/ProfileData/InstrProf.cpp @@ -504,9 +504,11 @@ void InstrProfRecord::mergeValueProfData(uint32_t ValueKind, SIPE.addError(instrprof_error::value_site_count_mismatch); return; } + if (!ThisNumValueSites) + return; std::vector<InstrProfValueSiteRecord> &ThisSiteRecords = - getValueSitesForKind(ValueKind); - std::vector<InstrProfValueSiteRecord> &OtherSiteRecords = + getOrCreateValueSitesForKind(ValueKind); + MutableArrayRef<InstrProfValueSiteRecord> OtherSiteRecords = Src.getValueSitesForKind(ValueKind); for (uint32_t I = 0; I < ThisNumValueSites; I++) ThisSiteRecords[I].merge(SIPE, OtherSiteRecords[I], Weight); @@ -533,11 +535,8 @@ void InstrProfRecord::merge(InstrProfRecord &Other, uint64_t Weight) { } void InstrProfRecord::scaleValueProfData(uint32_t ValueKind, uint64_t Weight) { - uint32_t ThisNumValueSites = getNumValueSites(ValueKind); - std::vector<InstrProfValueSiteRecord> &ThisSiteRecords = - getValueSitesForKind(ValueKind); - for (uint32_t I = 0; I < ThisNumValueSites; I++) - ThisSiteRecords[I].scale(SIPE, Weight); + for (auto &R : getValueSitesForKind(ValueKind)) + R.scale(SIPE, Weight); } void InstrProfRecord::scale(uint64_t Weight) { @@ -583,7 +582,7 @@ void InstrProfRecord::addValueData(uint32_t ValueKind, uint32_t Site, VData[I].Value = remapValue(VData[I].Value, ValueKind, ValueMap); } std::vector<InstrProfValueSiteRecord> &ValueSites = - getValueSitesForKind(ValueKind); + getOrCreateValueSitesForKind(ValueKind); if (N == 0) ValueSites.emplace_back(); else @@ -642,8 +641,9 @@ static ValueProfRecordClosure InstrProfRecordClosure = { // Wrapper implementation using the closure mechanism. uint32_t ValueProfData::getSize(const InstrProfRecord &Record) { - InstrProfRecordClosure.Record = &Record; - return getValueProfDataSize(&InstrProfRecordClosure); + auto Closure = InstrProfRecordClosure; + Closure.Record = &Record; + return getValueProfDataSize(&Closure); } // Wrapper implementation using the closure mechanism. 
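The CoverageMapping hunk above replaces the anonymous (counter, sign) pairs in CounterExpressionBuilder::simplify with named Term entries carrying a CounterID and a signed Factor. The underlying idea -- flatten the expression tree into signed terms, merge terms that refer to the same counter, then materialize additions before subtractions -- can be shown with a small standalone sketch. The Term struct and combineTerms helper below are illustrative names only, not LLVM's API, and the sketch deliberately skips the Counter/CounterExpression machinery.

// Standalone sketch of the term-combining step: merge (CounterID, Factor)
// terms so counters that appear with opposite signs cancel to a zero factor.
#include <algorithm>
#include <cstdio>
#include <vector>

struct Term {
  unsigned CounterID;
  int Factor; // signed multiplicity: +1 per addition, -1 per subtraction
};

// Sort by counter ID, then fold adjacent terms for the same counter together.
static std::vector<Term> combineTerms(std::vector<Term> Terms) {
  std::sort(Terms.begin(), Terms.end(), [](const Term &L, const Term &R) {
    return L.CounterID < R.CounterID;
  });
  std::vector<Term> Out;
  for (const Term &T : Terms) {
    if (!Out.empty() && Out.back().CounterID == T.CounterID)
      Out.back().Factor += T.Factor;
    else
      Out.push_back(T);
  }
  return Out;
}

int main() {
  // (c0 + c1) - c1 + c0  ->  2*c0; c1 cancels entirely.
  std::vector<Term> Combined =
      combineTerms({{0, +1}, {1, +1}, {1, -1}, {0, +1}});
  for (const Term &T : Combined)
    if (T.Factor != 0)
      std::printf("counter %u x %d\n", T.CounterID, T.Factor);
  return 0;
}

After this merge, positive factors become chained additions and negative factors become trailing subtractions, which is why the real code builds all Add expressions before any Subtract ones.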
diff --git a/lib/Support/AMDGPUCodeObjectMetadata.cpp b/lib/Support/AMDGPUCodeObjectMetadata.cpp index a00e371415a3..863093ab7def 100644 --- a/lib/Support/AMDGPUCodeObjectMetadata.cpp +++ b/lib/Support/AMDGPUCodeObjectMetadata.cpp @@ -20,8 +20,6 @@ using namespace llvm::AMDGPU; using namespace llvm::AMDGPU::CodeObject; -LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(uint32_t) -LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(std::string) LLVM_YAML_IS_SEQUENCE_VECTOR(Kernel::Arg::Metadata) LLVM_YAML_IS_SEQUENCE_VECTOR(Kernel::Metadata) diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp index 234f7439a546..232efe648b03 100644 --- a/lib/Support/Host.cpp +++ b/lib/Support/Host.cpp @@ -327,6 +327,7 @@ enum ProcessorSubtypes { INTEL_COREI7_SKYLAKE_AVX512, INTEL_ATOM_BONNELL, INTEL_ATOM_SILVERMONT, + INTEL_ATOM_GOLDMONT, INTEL_KNIGHTS_LANDING, AMDPENTIUM_K6, AMDPENTIUM_K62, @@ -707,7 +708,12 @@ getIntelProcessorTypeAndSubtype(unsigned int Family, unsigned int Model, *Type = INTEL_ATOM; *Subtype = INTEL_ATOM_SILVERMONT; break; // "silvermont" - + // Goldmont: + case 0x5c: + case 0x5f: + *Type = INTEL_ATOM; + *Subtype = INTEL_ATOM_GOLDMONT; + break; // "goldmont" case 0x57: *Type = INTEL_XEONPHI; // knl *Subtype = INTEL_KNIGHTS_LANDING; @@ -1070,6 +1076,8 @@ StringRef sys::getHostCPUName() { switch (Subtype) { case INTEL_ATOM_BONNELL: return "bonnell"; + case INTEL_ATOM_GOLDMONT: + return "goldmont"; case INTEL_ATOM_SILVERMONT: return "silvermont"; default: diff --git a/lib/Support/MemoryBuffer.cpp b/lib/Support/MemoryBuffer.cpp index 227e792d83dc..85e782b2c048 100644 --- a/lib/Support/MemoryBuffer.cpp +++ b/lib/Support/MemoryBuffer.cpp @@ -240,11 +240,9 @@ getMemoryBufferForStream(int FD, const Twine &BufferName) { // Read into Buffer until we hit EOF. do { Buffer.reserve(Buffer.size() + ChunkSize); - ReadBytes = read(FD, Buffer.end(), ChunkSize); - if (ReadBytes == -1) { - if (errno == EINTR) continue; + ReadBytes = sys::RetryAfterSignal(-1, read, FD, Buffer.end(), ChunkSize); + if (ReadBytes == -1) return std::error_code(errno, std::generic_category()); - } Buffer.set_size(Buffer.size() + ReadBytes); } while (ReadBytes != 0); @@ -391,13 +389,12 @@ getOpenFileImpl(int FD, const Twine &Filename, uint64_t FileSize, while (BytesLeft) { #ifdef HAVE_PREAD - ssize_t NumRead = ::pread(FD, BufPtr, BytesLeft, MapSize-BytesLeft+Offset); + ssize_t NumRead = sys::RetryAfterSignal(-1, ::pread, FD, BufPtr, BytesLeft, + MapSize - BytesLeft + Offset); #else - ssize_t NumRead = ::read(FD, BufPtr, BytesLeft); + ssize_t NumRead = sys::RetryAfterSignal(-1, ::read, FD, BufPtr, BytesLeft); #endif if (NumRead == -1) { - if (errno == EINTR) - continue; // Error while reading. return std::error_code(errno, std::generic_category()); } diff --git a/lib/Support/TargetParser.cpp b/lib/Support/TargetParser.cpp index b16351906a4c..13bb6f23bc83 100644 --- a/lib/Support/TargetParser.cpp +++ b/lib/Support/TargetParser.cpp @@ -784,6 +784,42 @@ unsigned llvm::ARM::parseArchVersion(StringRef Arch) { return 0; } +StringRef llvm::ARM::computeDefaultTargetABI(const Triple &TT, StringRef CPU) { + StringRef ArchName = + CPU.empty() ? TT.getArchName() : ARM::getArchName(ARM::parseCPUArch(CPU)); + + if (TT.isOSBinFormatMachO()) { + if (TT.getEnvironment() == Triple::EABI || + TT.getOS() == Triple::UnknownOS || + llvm::ARM::parseArchProfile(ArchName) == ARM::PK_M) + return "aapcs"; + if (TT.isWatchABI()) + return "aapcs16"; + return "apcs-gnu"; + } else if (TT.isOSWindows()) + // FIXME: this is invalid for WindowsCE. 
+ return "aapcs"; + + // Select the default based on the platform. + switch (TT.getEnvironment()) { + case Triple::Android: + case Triple::GNUEABI: + case Triple::GNUEABIHF: + case Triple::MuslEABI: + case Triple::MuslEABIHF: + return "aapcs-linux"; + case Triple::EABIHF: + case Triple::EABI: + return "aapcs"; + default: + if (TT.isOSNetBSD()) + return "apcs-gnu"; + if (TT.isOSOpenBSD()) + return "aapcs-linux"; + return "aapcs"; + } +} + StringRef llvm::AArch64::getCanonicalArchName(StringRef Arch) { return ARM::getCanonicalArchName(Arch); } diff --git a/lib/Support/Unix/Path.inc b/lib/Support/Unix/Path.inc index b6774692595b..45097eb918b7 100644 --- a/lib/Support/Unix/Path.inc +++ b/lib/Support/Unix/Path.inc @@ -737,10 +737,8 @@ std::error_code openFileForRead(const Twine &Name, int &ResultFD, #ifdef O_CLOEXEC OpenFlags |= O_CLOEXEC; #endif - while ((ResultFD = open(P.begin(), OpenFlags)) < 0) { - if (errno != EINTR) - return std::error_code(errno, std::generic_category()); - } + if ((ResultFD = sys::RetryAfterSignal(-1, open, P.begin(), OpenFlags)) < 0) + return std::error_code(errno, std::generic_category()); #ifndef O_CLOEXEC int r = fcntl(ResultFD, F_SETFD, FD_CLOEXEC); (void)r; @@ -800,10 +798,8 @@ std::error_code openFileForWrite(const Twine &Name, int &ResultFD, SmallString<128> Storage; StringRef P = Name.toNullTerminatedStringRef(Storage); - while ((ResultFD = open(P.begin(), OpenFlags, Mode)) < 0) { - if (errno != EINTR) - return std::error_code(errno, std::generic_category()); - } + if ((ResultFD = sys::RetryAfterSignal(-1, open, P.begin(), OpenFlags, Mode)) < 0) + return std::error_code(errno, std::generic_category()); #ifndef O_CLOEXEC int r = fcntl(ResultFD, F_SETFD, FD_CLOEXEC); (void)r; diff --git a/lib/Support/Unix/Process.inc b/lib/Support/Unix/Process.inc index 1d0143c6716e..2d4662094682 100644 --- a/lib/Support/Unix/Process.inc +++ b/lib/Support/Unix/Process.inc @@ -207,13 +207,10 @@ std::error_code Process::FixupStandardFileDescriptors() { for (int StandardFD : StandardFDs) { struct stat st; errno = 0; - while (fstat(StandardFD, &st) < 0) { + if (RetryAfterSignal(-1, fstat, StandardFD, &st) < 0) { assert(errno && "expected errno to be set if fstat failed!"); // fstat should return EBADF if the file descriptor is closed. - if (errno == EBADF) - break; - // retry fstat if we got EINTR, otherwise bubble up the failure. - if (errno != EINTR) + if (errno != EBADF) return std::error_code(errno, std::generic_category()); } // if fstat succeeds, move on to the next FD. @@ -222,11 +219,8 @@ std::error_code Process::FixupStandardFileDescriptors() { assert(errno == EBADF && "expected errno to have EBADF at this point!"); if (NullFD < 0) { - while ((NullFD = open("/dev/null", O_RDWR)) < 0) { - if (errno == EINTR) - continue; + if ((NullFD = RetryAfterSignal(-1, open, "/dev/null", O_RDWR)) < 0) return std::error_code(errno, std::generic_category()); - } } if (NullFD == StandardFD) diff --git a/lib/Target/AArch64/AArch64CondBrTuning.cpp b/lib/Target/AArch64/AArch64CondBrTuning.cpp index f27bc97ec3f3..0a948812ff33 100644 --- a/lib/Target/AArch64/AArch64CondBrTuning.cpp +++ b/lib/Target/AArch64/AArch64CondBrTuning.cpp @@ -22,7 +22,7 @@ /// cbz w8, .LBB1_2 -> b.eq .LBB1_2 /// /// 3) sub w8, w0, w1 -> subs w8, w0, w1 ; w8 has multiple uses. 
-/// tbz w8, #31, .LBB6_2 -> b.ge .LBB6_2 +/// tbz w8, #31, .LBB6_2 -> b.pl .LBB6_2 /// //===----------------------------------------------------------------------===// @@ -129,11 +129,11 @@ MachineInstr *AArch64CondBrTuning::convertToCondBr(MachineInstr &MI) { break; case AArch64::TBZW: case AArch64::TBZX: - CC = AArch64CC::GE; + CC = AArch64CC::PL; break; case AArch64::TBNZW: case AArch64::TBNZX: - CC = AArch64CC::LT; + CC = AArch64CC::MI; break; } return BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AArch64::Bcc)) @@ -271,6 +271,7 @@ bool AArch64CondBrTuning::tryToTuneBranch(MachineInstr &MI, } break; } + (void)NewCmp; (void)NewBr; assert(NewCmp && NewBr && "Expected new instructions."); DEBUG(dbgs() << " with instruction:\n "); diff --git a/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/lib/Target/AArch64/AArch64ConditionalCompares.cpp index 00a0111f2bd2..9eda56c825a9 100644 --- a/lib/Target/AArch64/AArch64ConditionalCompares.cpp +++ b/lib/Target/AArch64/AArch64ConditionalCompares.cpp @@ -22,6 +22,7 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -139,6 +140,7 @@ class SSACCmpConv { const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; MachineRegisterInfo *MRI; + const MachineBranchProbabilityInfo *MBPI; public: /// The first block containing a conditional branch, dominating everything @@ -186,8 +188,10 @@ private: public: /// runOnMachineFunction - Initialize per-function data structures. - void runOnMachineFunction(MachineFunction &MF) { + void runOnMachineFunction(MachineFunction &MF, + const MachineBranchProbabilityInfo *MBPI) { this->MF = &MF; + this->MBPI = MBPI; TII = MF.getSubtarget().getInstrInfo(); TRI = MF.getSubtarget().getRegisterInfo(); MRI = &MF.getRegInfo(); @@ -564,8 +568,40 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) { // All CmpBB instructions are moved into Head, and CmpBB is deleted. // Update the CFG first. updateTailPHIs(); - Head->removeSuccessor(CmpBB, true); - CmpBB->removeSuccessor(Tail, true); + + // Save successor probabilties before removing CmpBB and Tail from their + // parents. + BranchProbability Head2CmpBB = MBPI->getEdgeProbability(Head, CmpBB); + BranchProbability CmpBB2Tail = MBPI->getEdgeProbability(CmpBB, Tail); + + Head->removeSuccessor(CmpBB); + CmpBB->removeSuccessor(Tail); + + // If Head and CmpBB had successor probabilties, udpate the probabilities to + // reflect the ccmp-conversion. + if (Head->hasSuccessorProbabilities() && CmpBB->hasSuccessorProbabilities()) { + + // Head is allowed two successors. We've removed CmpBB, so the remaining + // successor is Tail. We need to increase the successor probability for + // Tail to account for the CmpBB path we removed. + // + // Pr(Tail|Head) += Pr(CmpBB|Head) * Pr(Tail|CmpBB). + assert(*Head->succ_begin() == Tail && "Head successor is not Tail"); + BranchProbability Head2Tail = MBPI->getEdgeProbability(Head, Tail); + Head->setSuccProbability(Head->succ_begin(), + Head2Tail + Head2CmpBB * CmpBB2Tail); + + // We will transfer successors of CmpBB to Head in a moment without + // normalizing the successor probabilities. Set the successor probabilites + // before doing so. + // + // Pr(I|Head) = Pr(CmpBB|Head) * Pr(I|CmpBB). 
+ for (auto I = CmpBB->succ_begin(), E = CmpBB->succ_end(); I != E; ++I) { + BranchProbability CmpBB2I = MBPI->getEdgeProbability(CmpBB, *I); + CmpBB->setSuccProbability(I, Head2CmpBB * CmpBB2I); + } + } + Head->transferSuccessorsAndUpdatePHIs(CmpBB); DebugLoc TermDL = Head->getFirstTerminator()->getDebugLoc(); TII->removeBranch(*Head); @@ -717,6 +753,7 @@ int SSACCmpConv::expectedCodeSizeDelta() const { namespace { class AArch64ConditionalCompares : public MachineFunctionPass { + const MachineBranchProbabilityInfo *MBPI; const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; MCSchedModel SchedModel; @@ -753,6 +790,7 @@ char AArch64ConditionalCompares::ID = 0; INITIALIZE_PASS_BEGIN(AArch64ConditionalCompares, "aarch64-ccmp", "AArch64 CCMP Pass", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics) INITIALIZE_PASS_END(AArch64ConditionalCompares, "aarch64-ccmp", @@ -763,6 +801,7 @@ FunctionPass *llvm::createAArch64ConditionalCompares() { } void AArch64ConditionalCompares::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<MachineBranchProbabilityInfo>(); AU.addRequired<MachineDominatorTree>(); AU.addPreserved<MachineDominatorTree>(); AU.addRequired<MachineLoopInfo>(); @@ -892,12 +931,13 @@ bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) { MRI = &MF.getRegInfo(); DomTree = &getAnalysis<MachineDominatorTree>(); Loops = getAnalysisIfAvailable<MachineLoopInfo>(); + MBPI = &getAnalysis<MachineBranchProbabilityInfo>(); Traces = &getAnalysis<MachineTraceMetrics>(); MinInstr = nullptr; MinSize = MF.getFunction()->optForMinSize(); bool Changed = false; - CmpConv.runOnMachineFunction(MF); + CmpConv.runOnMachineFunction(MF, MBPI); // Visit blocks in dominator tree pre-order. The pre-order enables multiple // cmp-conversions from the same head block. diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 2965106fd270..aaf32a499bc3 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -7561,8 +7561,9 @@ bool AArch64TargetLowering::lowerInterleavedLoad( // Convert the integer vector to pointer vector if the element is pointer. 
if (EltTy->isPointerTy()) - SubVec = Builder.CreateIntToPtr(SubVec, SVI->getType()); - + SubVec = Builder.CreateIntToPtr( + SubVec, VectorType::get(SVI->getType()->getVectorElementType(), + VecTy->getVectorNumElements())); SubVecs[SVI].push_back(SubVec); } } diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index ad24612239fa..6cb723d187af 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -735,7 +735,7 @@ def : ShiftAlias<"rorv", RORVWr, GPR32>; def : ShiftAlias<"rorv", RORVXr, GPR64>; // Multiply-add -let AddedComplexity = 7 in { +let AddedComplexity = 5 in { defm MADD : MulAccum<0, "madd", add>; defm MSUB : MulAccum<1, "msub", sub>; @@ -752,7 +752,7 @@ def : Pat<(i32 (mul (ineg GPR32:$Rn), GPR32:$Rm)), (MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>; def : Pat<(i64 (mul (ineg GPR64:$Rn), GPR64:$Rm)), (MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>; -} // AddedComplexity = 7 +} // AddedComplexity = 5 let AddedComplexity = 5 in { def SMADDLrrr : WideMulAccum<0, 0b001, "smaddl", add, sext>; diff --git a/lib/Target/AArch64/AArch64InstructionSelector.cpp b/lib/Target/AArch64/AArch64InstructionSelector.cpp index 9bfd570e9a82..07ce0e863c5e 100644 --- a/lib/Target/AArch64/AArch64InstructionSelector.cpp +++ b/lib/Target/AArch64/AArch64InstructionSelector.cpp @@ -947,7 +947,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const { const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI); if (DstRB.getID() != SrcRB.getID()) { - DEBUG(dbgs() << "G_TRUNC input/output on different banks\n"); + DEBUG(dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n"); return false; } @@ -964,16 +964,21 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const { if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) || !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) { - DEBUG(dbgs() << "Failed to constrain G_TRUNC\n"); + DEBUG(dbgs() << "Failed to constrain G_TRUNC/G_PTRTOINT\n"); return false; } if (DstRC == SrcRC) { // Nothing to be done + } else if (Opcode == TargetOpcode::G_TRUNC && DstTy == LLT::scalar(32) && + SrcTy == LLT::scalar(64)) { + llvm_unreachable("TableGen can import this case"); + return false; } else if (DstRC == &AArch64::GPR32RegClass && SrcRC == &AArch64::GPR64RegClass) { I.getOperand(1).setSubReg(AArch64::sub_32); } else { + DEBUG(dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n"); return false; } diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/lib/Target/AArch64/AArch64LegalizerInfo.cpp index 01196817f311..4b568f3fba2b 100644 --- a/lib/Target/AArch64/AArch64LegalizerInfo.cpp +++ b/lib/Target/AArch64/AArch64LegalizerInfo.cpp @@ -39,6 +39,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() { const LLT v4s32 = LLT::vector(4, 32); const LLT v2s64 = LLT::vector(2, 64); + for (auto Ty : {p0, s1, s8, s16, s32, s64}) + setAction({G_IMPLICIT_DEF, Ty}, Legal); + for (unsigned BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR, G_SHL}) { // These operations naturally get the right answer when used on // GPR32, even if the actual type is narrower. @@ -99,6 +102,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() { // G_INSERT (It seems entirely reasonable that inputs shouldn't overlap). 
} + for (auto Ty : {s1, s8, s16, s32, s64, p0}) + setAction({G_EXTRACT, Ty}, Legal); + + for (auto Ty : {s32, s64}) + setAction({G_EXTRACT, 1, Ty}, Legal); + for (unsigned MemOp : {G_LOAD, G_STORE}) { for (auto Ty : {s8, s16, s32, s64, p0, v2s32}) setAction({MemOp, Ty}, Legal); diff --git a/lib/Target/AArch64/AArch64MCInstLower.cpp b/lib/Target/AArch64/AArch64MCInstLower.cpp index 45083df7ab45..f82b9dbc2c9f 100644 --- a/lib/Target/AArch64/AArch64MCInstLower.cpp +++ b/lib/Target/AArch64/AArch64MCInstLower.cpp @@ -151,13 +151,24 @@ MCOperand AArch64MCInstLower::lowerSymbolOperandELF(const MachineOperand &MO, return MCOperand::createExpr(Expr); } +MCOperand AArch64MCInstLower::lowerSymbolOperandCOFF(const MachineOperand &MO, + MCSymbol *Sym) const { + MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None; + const MCExpr *Expr = MCSymbolRefExpr::create(Sym, RefKind, Ctx); + if (!MO.isJTI() && MO.getOffset()) + Expr = MCBinaryExpr::createAdd( + Expr, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx); + return MCOperand::createExpr(Expr); +} + MCOperand AArch64MCInstLower::LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const { if (Printer.TM.getTargetTriple().isOSDarwin()) return lowerSymbolOperandDarwin(MO, Sym); + if (Printer.TM.getTargetTriple().isOSBinFormatCOFF()) + return lowerSymbolOperandCOFF(MO, Sym); - assert(Printer.TM.getTargetTriple().isOSBinFormatELF() && - "Expect Darwin or ELF target"); + assert(Printer.TM.getTargetTriple().isOSBinFormatELF() && "Invalid target"); return lowerSymbolOperandELF(MO, Sym); } diff --git a/lib/Target/AArch64/AArch64MCInstLower.h b/lib/Target/AArch64/AArch64MCInstLower.h index 1e29b80c2d62..aa30fe1fa707 100644 --- a/lib/Target/AArch64/AArch64MCInstLower.h +++ b/lib/Target/AArch64/AArch64MCInstLower.h @@ -42,6 +42,8 @@ public: MCSymbol *Sym) const; MCOperand lowerSymbolOperandELF(const MachineOperand &MO, MCSymbol *Sym) const; + MCOperand lowerSymbolOperandCOFF(const MachineOperand &MO, + MCSymbol *Sym) const; MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const; MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const; diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp index baf15ac540cf..fab92e139dd0 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -94,7 +94,7 @@ const uint32_t *AArch64RegisterInfo::getTLSCallPreservedMask() const { if (TT.isOSDarwin()) return CSR_AArch64_TLS_Darwin_RegMask; - assert(TT.isOSBinFormatELF() && "only expect Darwin or ELF TLS"); + assert(TT.isOSBinFormatELF() && "Invalid target"); return CSR_AArch64_TLS_ELF_RegMask; } diff --git a/lib/Target/AArch64/AArch64SchedThunderX2T99.td b/lib/Target/AArch64/AArch64SchedThunderX2T99.td index 3654eeca530a..10df50bcf156 100644 --- a/lib/Target/AArch64/AArch64SchedThunderX2T99.td +++ b/lib/Target/AArch64/AArch64SchedThunderX2T99.td @@ -1,4 +1,4 @@ -//=- AArch64SchedThunderX2T99.td - Cavium ThunderX T99 Scheduling ---*- tablegen -*-=// +//=- AArch64SchedThunderX2T99.td - Cavium ThunderX T99 ---*- tablegen -*-=// // // The LLVM Compiler Infrastructure // @@ -79,75 +79,207 @@ def THX2T99LS01 : ProcResGroup<[THX2T99P4, THX2T99P5]>; // 60 entry unified scheduler. def THX2T99Any : ProcResGroup<[THX2T99P0, THX2T99P1, THX2T99P2, - THX2T99P3, THX2T99P4, THX2T99P5]> { - let BufferSize=60; + THX2T99P3, THX2T99P4, THX2T99P5]> { + let BufferSize = 60; } // Define commonly used write types for InstRW specializations. 
// All definitions follow the format: THX2T99Write_<NumCycles>Cyc_<Resources>. // 3 cycles on I1. -def THX2T99Write_3Cyc_I1 : SchedWriteRes<[THX2T99I1]> { let Latency = 3; } +def THX2T99Write_3Cyc_I1 : SchedWriteRes<[THX2T99I1]> { + let Latency = 3; + let NumMicroOps = 2; +} + +// 1 cycles on I2. +def THX2T99Write_1Cyc_I2 : SchedWriteRes<[THX2T99I2]> { + let Latency = 1; + let NumMicroOps = 2; +} // 4 cycles on I1. -def THX2T99Write_4Cyc_I1 : SchedWriteRes<[THX2T99I1]> { let Latency = 4; } +def THX2T99Write_4Cyc_I1 : SchedWriteRes<[THX2T99I1]> { + let Latency = 4; + let NumMicroOps = 2; +} + +// 23 cycles on I1. +def THX2T99Write_23Cyc_I1 : SchedWriteRes<[THX2T99I1]> { + let Latency = 23; + let ResourceCycles = [13, 23]; + let NumMicroOps = 4; +} + +// 39 cycles on I1. +def THX2T99Write_39Cyc_I1 : SchedWriteRes<[THX2T99I1]> { + let Latency = 39; + let ResourceCycles = [13, 39]; + let NumMicroOps = 4; +} // 1 cycle on I0, I1, or I2. -def THX2T99Write_1Cyc_I012 : SchedWriteRes<[THX2T99I012]> { let Latency = 1; } +def THX2T99Write_1Cyc_I012 : SchedWriteRes<[THX2T99I012]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// 2 cycles on I0, I1, or I2. +def THX2T99Write_2Cyc_I012 : SchedWriteRes<[THX2T99I012]> { + let Latency = 2; + let NumMicroOps = 2; +} + +// 4 cycles on I0, I1, or I2. +def THX2T99Write_4Cyc_I012 : SchedWriteRes<[THX2T99I012]> { + let Latency = 2; + let NumMicroOps = 3; +} + +// 5 cycles on I0, I1, or I2. +def THX2T99Write_5Cyc_I012 : SchedWriteRes<[THX2T99I012]> { + let Latency = 2; + let NumMicroOps = 3; +} // 5 cycles on F1. -def THX2T99Write_5Cyc_F1 : SchedWriteRes<[THX2T99F1]> { let Latency = 5; } +def THX2T99Write_5Cyc_F1 : SchedWriteRes<[THX2T99F1]> { + let Latency = 5; + let NumMicroOps = 2; +} // 7 cycles on F1. -def THX2T99Write_7Cyc_F1 : SchedWriteRes<[THX2T99F1]> { let Latency = 7; } +def THX2T99Write_7Cyc_F1 : SchedWriteRes<[THX2T99F1]> { + let Latency = 7; + let NumMicroOps = 2; +} // 4 cycles on F0 or F1. -def THX2T99Write_4Cyc_F01 : SchedWriteRes<[THX2T99F01]> { let Latency = 4; } +def THX2T99Write_4Cyc_F01 : SchedWriteRes<[THX2T99F01]> { + let Latency = 4; + let NumMicroOps = 2; +} // 5 cycles on F0 or F1. -def THX2T99Write_5Cyc_F01 : SchedWriteRes<[THX2T99F01]> { let Latency = 5; } +def THX2T99Write_5Cyc_F01 : SchedWriteRes<[THX2T99F01]> { + let Latency = 5; + let NumMicroOps = 2; +} // 6 cycles on F0 or F1. -def THX2T99Write_6Cyc_F01 : SchedWriteRes<[THX2T99F01]> { let Latency = 6; } +def THX2T99Write_6Cyc_F01 : SchedWriteRes<[THX2T99F01]> { + let Latency = 6; + let NumMicroOps = 3; +} // 7 cycles on F0 or F1. -def THX2T99Write_7Cyc_F01 : SchedWriteRes<[THX2T99F01]> { let Latency = 7; } +def THX2T99Write_7Cyc_F01 : SchedWriteRes<[THX2T99F01]> { + let Latency = 7; + let NumMicroOps = 3; +} // 8 cycles on F0 or F1. -def THX2T99Write_8Cyc_F01 : SchedWriteRes<[THX2T99F01]> { let Latency = 8; } +def THX2T99Write_8Cyc_F01 : SchedWriteRes<[THX2T99F01]> { + let Latency = 8; + let NumMicroOps = 3; +} + +// 10 cycles on F0 or F1. +def THX2T99Write_10Cyc_F01 : SchedWriteRes<[THX2T99F01]> { + let Latency = 10; + let NumMicroOps = 3; +} // 16 cycles on F0 or F1. def THX2T99Write_16Cyc_F01 : SchedWriteRes<[THX2T99F01]> { let Latency = 16; + let NumMicroOps = 3; let ResourceCycles = [8]; } // 23 cycles on F0 or F1. def THX2T99Write_23Cyc_F01 : SchedWriteRes<[THX2T99F01]> { let Latency = 23; + let NumMicroOps = 3; let ResourceCycles = [11]; } // 1 cycles on LS0 or LS1. 
-def THX2T99Write_1Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> { let Latency = 1; } +def THX2T99Write_1Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> { + let Latency = 0; +} + +// 1 cycles on LS0 or LS1 and I0, I1, or I2. +def THX2T99Write_1Cyc_LS01_I012 : SchedWriteRes<[THX2T99LS01, THX2T99I012]> { + let Latency = 0; + let NumMicroOps = 2; +} + +// 1 cycles on LS0 or LS1 and 2 of I0, I1, or I2. +def THX2T99Write_1Cyc_LS01_I012_I012 : + SchedWriteRes<[THX2T99LS01, THX2T99I012, THX2T99I012]> { + let Latency = 0; + let NumMicroOps = 3; +} + +// 2 cycles on LS0 or LS1. +def THX2T99Write_2Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> { + let Latency = 1; + let NumMicroOps = 2; +} // 4 cycles on LS0 or LS1. -def THX2T99Write_4Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> { let Latency = 4; } +def THX2T99Write_4Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> { + let Latency = 4; + let NumMicroOps = 4; +} // 5 cycles on LS0 or LS1. -def THX2T99Write_5Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> { let Latency = 5; } +def THX2T99Write_5Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> { + let Latency = 5; + let NumMicroOps = 3; +} // 6 cycles on LS0 or LS1. -def THX2T99Write_6Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> { let Latency = 6; } +def THX2T99Write_6Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> { + let Latency = 6; + let NumMicroOps = 3; +} + +// 4 cycles on LS0 or LS1 and I0, I1, or I2. +def THX2T99Write_4Cyc_LS01_I012 : SchedWriteRes<[THX2T99LS01, THX2T99I012]> { + let Latency = 4; + let NumMicroOps = 3; +} + +// 4 cycles on LS0 or LS1 and 2 of I0, I1, or I2. +def THX2T99Write_4Cyc_LS01_I012_I012 : + SchedWriteRes<[THX2T99LS01, THX2T99I012, THX2T99I012]> { + let Latency = 4; + let NumMicroOps = 3; +} // 5 cycles on LS0 or LS1 and I0, I1, or I2. def THX2T99Write_5Cyc_LS01_I012 : SchedWriteRes<[THX2T99LS01, THX2T99I012]> { let Latency = 5; - let NumMicroOps = 2; + let NumMicroOps = 3; } // 5 cycles on LS0 or LS1 and 2 of I0, I1, or I2. -def THX2T99Write_6Cyc_LS01_I012_I012 : +def THX2T99Write_5Cyc_LS01_I012_I012 : + SchedWriteRes<[THX2T99LS01, THX2T99I012, THX2T99I012]> { + let Latency = 5; + let NumMicroOps = 3; +} + +// 6 cycles on LS0 or LS1 and I0, I1, or I2. +def THX2T99Write_6Cyc_LS01_I012 : SchedWriteRes<[THX2T99LS01, THX2T99I012]> { + let Latency = 6; + let NumMicroOps = 4; +} + +// 6 cycles on LS0 or LS1 and 2 of I0, I1, or I2. +def THX2T99Write_6Cyc_LS01_I012_I012 : SchedWriteRes<[THX2T99LS01, THX2T99I012, THX2T99I012]> { let Latency = 6; let NumMicroOps = 3; @@ -162,25 +294,25 @@ def THX2T99Write_1Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> { // 5 cycles on LS0 or LS1 and F0 or F1. def THX2T99Write_5Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> { let Latency = 5; - let NumMicroOps = 2; + let NumMicroOps = 3; } // 6 cycles on LS0 or LS1 and F0 or F1. def THX2T99Write_6Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> { let Latency = 6; - let NumMicroOps = 2; + let NumMicroOps = 3; } // 7 cycles on LS0 or LS1 and F0 or F1. def THX2T99Write_7Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> { let Latency = 7; - let NumMicroOps = 2; + let NumMicroOps = 3; } // 8 cycles on LS0 or LS1 and F0 or F1. def THX2T99Write_8Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> { let Latency = 8; - let NumMicroOps = 2; + let NumMicroOps = 3; } // Define commonly used read types. 
@@ -195,10 +327,8 @@ def : ReadAdvance<ReadID, 0>; def : ReadAdvance<ReadExtrHi, 0>; def : ReadAdvance<ReadAdrBase, 0>; def : ReadAdvance<ReadVLD, 0>; - } - //===----------------------------------------------------------------------===// // 3. Instruction Tables. @@ -211,88 +341,217 @@ let SchedModel = ThunderX2T99Model in { // Branch, immed // Branch and link, immed // Compare and branch -def : WriteRes<WriteBr, [THX2T99I2]> { let Latency = 1; } +def : WriteRes<WriteBr, [THX2T99I2]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// Branch, register +// Branch and link, register != LR +// Branch and link, register = LR +def : WriteRes<WriteBrReg, [THX2T99I2]> { + let Latency = 1; + let NumMicroOps = 2; +} def : WriteRes<WriteSys, []> { let Latency = 1; } def : WriteRes<WriteBarrier, []> { let Latency = 1; } def : WriteRes<WriteHint, []> { let Latency = 1; } -def : WriteRes<WriteAtomic, []> { let Unsupported = 1; } +def : WriteRes<WriteAtomic, []> { + let Unsupported = 1; + let NumMicroOps = 2; +} -// Branch, register -// Branch and link, register != LR -// Branch and link, register = LR -def : WriteRes<WriteBrReg, [THX2T99I2]> { let Latency = 1; } +//--- +// Branch +//--- +def : InstRW<[THX2T99Write_1Cyc_I2], (instrs B, BL, BR, BLR)>; +def : InstRW<[THX2T99Write_1Cyc_I2], (instrs RET)>; +def : InstRW<[THX2T99Write_1Cyc_I2], (instregex "^B.*")>; +def : InstRW<[THX2T99Write_1Cyc_I2], + (instregex "^CBZ", "^CBNZ", "^TBZ", "^TBNZ")>; //--- // 3.2 Arithmetic and Logical Instructions // 3.3 Move and Shift Instructions //--- + // ALU, basic // Conditional compare // Conditional select // Address generation -def : WriteRes<WriteI, [THX2T99I012]> { let Latency = 1; } +def : WriteRes<WriteI, [THX2T99I012]> { + let Latency = 1; + let ResourceCycles = [1, 3]; + let NumMicroOps = 2; +} + +def : InstRW<[WriteI], + (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?", + "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)", + "ADC?(W|X)r(i|r|s|x)", "ADCS?(W|X)r(i|r|s|x)", + "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)", + "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)", + "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)", + "SUBS?(W|X)r(i|r|s|x)", "SBC?(W|X)r(i|r|s|x)", + "SBCS?(W|X)r(i|r|s|x)", "CCMN?(W|X)r(i|r|s|x)", + "CCMP?(W|X)r(i|r|s|x)", "CSEL?(W|X)r(i|r|s|x)", + "CSINC?(W|X)r(i|r|s|x)", "CSINV?(W|X)r(i|r|s|x)", + "CSNEG?(W|X)r(i|r|s|x)")>; + def : InstRW<[WriteI], (instrs COPY)>; // ALU, extend and/or shift def : WriteRes<WriteISReg, [THX2T99I012]> { let Latency = 2; - let ResourceCycles = [2]; + let ResourceCycles = [2, 3]; + let NumMicroOps = 2; } +def : InstRW<[WriteISReg], + (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?", + "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)", + "ADC?(W|X)r(i|r|s|x)", "ADCS?(W|X)r(i|r|s|x)", + "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)", + "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)", + "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)", + "SUBS?(W|X)r(i|r|s|x)", "SBC?(W|X)r(i|r|s|x)", + "SBCS?(W|X)r(i|r|s|x)", "CCMN?(W|X)r(i|r|s|x)", + "CCMP?(W|X)r(i|r|s|x)", "CSEL?(W|X)r(i|r|s|x)", + "CSINC?(W|X)r(i|r|s|x)", "CSINV?(W|X)r(i|r|s|x)", + "CSNEG?(W|X)r(i|r|s|x)")>; + def : WriteRes<WriteIEReg, [THX2T99I012]> { - let Latency = 2; - let ResourceCycles = [2]; + let Latency = 1; + let ResourceCycles = [1, 3]; + let NumMicroOps = 2; } +def : InstRW<[WriteIEReg], + (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?", + "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)", + "ADC?(W|X)r(i|r|s|x)", "ADCS?(W|X)r(i|r|s|x)", + "BIC?(W|X)r(i|r|s|x)", 
"BICS?(W|X)r(i|r|s|x)", + "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)", + "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)", + "SUBS?(W|X)r(i|r|s|x)", "SBC?(W|X)r(i|r|s|x)", + "SBCS?(W|X)r(i|r|s|x)", "CCMN?(W|X)r(i|r|s|x)", + "CCMP?(W|X)r(i|r|s|x)", "CSEL?(W|X)r(i|r|s|x)", + "CSINC?(W|X)r(i|r|s|x)", "CSINV?(W|X)r(i|r|s|x)", + "CSNEG?(W|X)r(i|r|s|x)")>; + // Move immed -def : WriteRes<WriteImm, [THX2T99I012]> { let Latency = 1; } +def : WriteRes<WriteImm, [THX2T99I012]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def : InstRW<[THX2T99Write_1Cyc_I012], + (instrs MOVKWi, MOVKXi, MOVNWi, MOVNXi, MOVZWi, MOVZXi)>; + +def : InstRW<[THX2T99Write_1Cyc_I012], + (instrs ASRVWr, ASRVXr, LSLVWr, LSLVXr, RORVWr, RORVXr)>; // Variable shift -def : WriteRes<WriteIS, [THX2T99I012]> { let Latency = 1; } +def : WriteRes<WriteIS, [THX2T99I012]> { + let Latency = 1; + let NumMicroOps = 2; +} //--- // 3.4 Divide and Multiply Instructions //--- // Divide, W-form -// Latency range of 13-23. Take the average. +// Latency range of 13-23/13-39. def : WriteRes<WriteID32, [THX2T99I1]> { - let Latency = 18; - let ResourceCycles = [18]; + let Latency = 39; + let ResourceCycles = [13, 39]; + let NumMicroOps = 4; } // Divide, X-form -// Latency range of 13-39. Take the average. def : WriteRes<WriteID64, [THX2T99I1]> { - let Latency = 26; - let ResourceCycles = [26]; + let Latency = 23; + let ResourceCycles = [13, 23]; + let NumMicroOps = 4; } // Multiply accumulate, W-form -def : WriteRes<WriteIM32, [THX2T99I012]> { let Latency = 5; } +def : WriteRes<WriteIM32, [THX2T99I012]> { + let Latency = 5; + let NumMicroOps = 3; +} // Multiply accumulate, X-form -def : WriteRes<WriteIM64, [THX2T99I012]> { let Latency = 5; } +def : WriteRes<WriteIM64, [THX2T99I012]> { + let Latency = 5; + let NumMicroOps = 3; +} + +//def : InstRW<[WriteIM32, ReadIM, ReadIM, ReadIMA, THX2T99Write_5Cyc_I012], +// (instrs MADDWrrr, MSUBWrrr)>; +def : InstRW<[WriteIM32], (instrs MADDWrrr, MSUBWrrr)>; +def : InstRW<[WriteIM32], (instrs MADDXrrr, MSUBXrrr)>; +def : InstRW<[THX2T99Write_5Cyc_I012], + (instregex "(S|U)(MADDL|MSUBL)rrr")>; + +def : InstRW<[WriteID32], (instrs SDIVWr, UDIVWr)>; +def : InstRW<[WriteID64], (instrs SDIVXr, UDIVXr)>; // Bitfield extract, two reg -def : WriteRes<WriteExtr, [THX2T99I012]> { let Latency = 1; } +def : WriteRes<WriteExtr, [THX2T99I012]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// Multiply high +def : InstRW<[THX2T99Write_4Cyc_I1], (instrs SMULHrr, UMULHrr)>; + +// Miscellaneous Data-Processing Instructions +// Bitfield extract +def : InstRW<[THX2T99Write_1Cyc_I012], (instrs EXTRWrri, EXTRXrri)>; + +// Bitifield move - basic +def : InstRW<[THX2T99Write_1Cyc_I012], + (instrs SBFMWri, SBFMXri, UBFMWri, UBFMXri)>; -// Bitfield move, basic // Bitfield move, insert -// NOTE: Handled by WriteIS. 
+def : InstRW<[THX2T99Write_1Cyc_I012], (instregex "^BFM")>; +def : InstRW<[THX2T99Write_1Cyc_I012], (instregex "(S|U)?BFM.*")>; // Count leading def : InstRW<[THX2T99Write_3Cyc_I1], (instregex "^CLS(W|X)r$", - "^CLZ(W|X)r$")>; + "^CLZ(W|X)r$")>; + +// Reverse bits +def : InstRW<[THX2T99Write_1Cyc_I012], (instrs RBITWr, RBITXr)>; + +// Cryptography Extensions +def : InstRW<[THX2T99Write_5Cyc_F1], (instregex "^AES[DE]")>; +def : InstRW<[THX2T99Write_5Cyc_F1], (instregex "^AESI?MC")>; +def : InstRW<[THX2T99Write_5Cyc_F1], (instregex "^PMULL")>; +def : InstRW<[THX2T99Write_7Cyc_F1], (instregex "^SHA1SU0")>; +def : InstRW<[THX2T99Write_7Cyc_F1], (instregex "^SHA1(H|SU1)")>; +def : InstRW<[THX2T99Write_7Cyc_F1], (instregex "^SHA1[CMP]")>; +def : InstRW<[THX2T99Write_7Cyc_F1], (instregex "^SHA256SU0")>; +def : InstRW<[THX2T99Write_7Cyc_F1], (instregex "^SHA256(H|H2|SU1)")>; + +// CRC Instructions +// def : InstRW<[THX2T99Write_4Cyc_I1], (instregex "^CRC32", "^CRC32C")>; +def : InstRW<[THX2T99Write_4Cyc_I1], + (instrs CRC32Brr, CRC32Hrr, CRC32Wrr, CRC32Xrr)>; + +def : InstRW<[THX2T99Write_4Cyc_I1], + (instrs CRC32CBrr, CRC32CHrr, CRC32CWrr, CRC32CXrr)>; // Reverse bits/bytes // NOTE: Handled by WriteI. //--- -// 3.6 Load Instructions +// 3.6 Load Instructions // 3.10 FP Load Instructions //--- @@ -300,13 +559,29 @@ def : InstRW<[THX2T99Write_3Cyc_I1], (instregex "^CLS(W|X)r$", // Load register, unscaled immed // Load register, immed unprivileged // Load register, unsigned immed -def : WriteRes<WriteLD, [THX2T99LS01]> { let Latency = 4; } +def : WriteRes<WriteLD, [THX2T99LS01]> { + let Latency = 4; + let NumMicroOps = 4; +} // Load register, immed post-index // NOTE: Handled by WriteLD, WriteI. // Load register, immed pre-index // NOTE: Handled by WriteLD, WriteAdr. -def : WriteRes<WriteAdr, [THX2T99I012]> { let Latency = 1; } +def : WriteRes<WriteAdr, [THX2T99I012]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// Load pair, immed offset, normal +// Load pair, immed offset, signed words, base != SP +// Load pair, immed offset signed words, base = SP +// LDP only breaks into *one* LS micro-op. Thus +// the resources are handled by WriteLD. +def : WriteRes<WriteLDHi, []> { + let Latency = 5; + let NumMicroOps = 5; +} // Load register offset, basic // Load register, register offset, scale by 4/8 @@ -324,23 +599,229 @@ def THX2T99ReadAdrBase : SchedReadVariant<[ SchedVar<NoSchedPred, [ReadDefault]>]>; def : SchedAlias<ReadAdrBase, THX2T99ReadAdrBase>; -// Load pair, immed offset, normal -// Load pair, immed offset, signed words, base != SP -// Load pair, immed offset signed words, base = SP -// LDP only breaks into *one* LS micro-op. Thus -// the resources are handling by WriteLD. -def : WriteRes<WriteLDHi, []> { - let Latency = 5; -} - // Load pair, immed pre-index, normal // Load pair, immed pre-index, signed words // Load pair, immed post-index, normal // Load pair, immed post-index, signed words // NOTE: Handled by WriteLD, WriteLDHi, WriteAdr. 
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDNPDi)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDNPQi)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDNPSi)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDNPWi)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDNPXi)>; + +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDPDi)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDPQi)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDPSi)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDPSWi)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDPWi)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDPXi)>; + +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDRBui)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDRDui)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDRHui)>; +def : InstRW<[THX2T99Write_5Cyc_LS01], (instrs LDRQui)>; +def : InstRW<[THX2T99Write_5Cyc_LS01], (instrs LDRSui)>; + +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDRDl)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDRQl)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDRWl)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDRXl)>; + +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRBi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRHi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRWi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRXi)>; + +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRSBWi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRSBXi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRSHWi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRSHXi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRSWi)>; + +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr], + (instrs LDPDpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr], + (instrs LDPQpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr], + (instrs LDPSpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr], + (instrs LDPWpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr], + (instrs LDPWpre)>; + +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRBpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRDpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRHpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRQpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRSpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRWpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRXpre)>; + +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSBWpre)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSBXpre)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSBWpost)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSBXpost)>; + +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSHWpre)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSHXpre)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSHWpost)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSHXpost)>; + +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], 
(instrs LDRBBpre)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRBBpost)>; + +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRHHpre)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRHHpost)>; + +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr], + (instrs LDPDpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr], + (instrs LDPQpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr], + (instrs LDPSpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr], + (instrs LDPWpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr], + (instrs LDPXpost)>; + +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteI], (instrs LDRBpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteI], (instrs LDRDpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteI], (instrs LDRHpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteI], (instrs LDRQpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteI], (instrs LDRSpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteI], (instrs LDRWpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteI], (instrs LDRXpost)>; + +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr], + (instrs LDPDpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr], + (instrs LDPQpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr], + (instrs LDPSpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr], + (instrs LDPWpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr], + (instrs LDPXpre)>; + +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteAdr], (instrs LDRBpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteAdr], (instrs LDRDpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteAdr], (instrs LDRHpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteAdr], (instrs LDRQpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteAdr], (instrs LDRSpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteAdr], (instrs LDRWpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteAdr], (instrs LDRXpre)>; + +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr], + (instrs LDPDpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr], + (instrs LDPQpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr], + (instrs LDPSpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr], + (instrs LDPWpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr], + (instrs LDPXpost)>; + +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteI], (instrs LDRBpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteI], (instrs LDRDpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteI], (instrs LDRHpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteI], (instrs LDRQpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteI], (instrs LDRSpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteI], (instrs LDRWpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteI], (instrs LDRXpost)>; + +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRBroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRDroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRHroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRHHroW)>; +def : 
InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRQroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRSroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRSHWroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRSHXroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRWroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRXroW)>; + +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRBroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRDroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRHHroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRHroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRQroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRSroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRSHWroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRSHXroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRWroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRXroX)>; + +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRBroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRBroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRDroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRHroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRHHroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRQroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRSroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRSHWroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRSHXroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRWroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRXroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRBroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRDroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRHroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRHHroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRQroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRSroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRSHWroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRSHXroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRWroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRXroX)>; + +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURBi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURBBi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURDi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURHi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURHHi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURQi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURSi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURXi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURSBWi)>; +def : 
InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURSBXi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURSHWi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURSHXi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURSWi)>; + +//--- +// Prefetch +//--- +def : InstRW<[THX2T99Write_6Cyc_LS01_I012], (instrs PRFMl)>; +def : InstRW<[THX2T99Write_6Cyc_LS01_I012], (instrs PRFUMi)>; +def : InstRW<[THX2T99Write_6Cyc_LS01_I012], (instrs PRFMui)>; +def : InstRW<[THX2T99Write_6Cyc_LS01_I012], (instrs PRFMroW)>; +def : InstRW<[THX2T99Write_6Cyc_LS01_I012], (instrs PRFMroX)>; + //-- -// 3.7 Store Instructions +// 3.7 Store Instructions // 3.11 FP Store Instructions //-- @@ -382,6 +863,195 @@ def : WriteRes<WriteSTP, [THX2T99LS01, THX2T99SD]> { // Store pair, immed pre-index, X-form // NOTE: Handled by WriteAdr, WriteSTP. +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURBi)>; +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURBBi)>; +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURDi)>; +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURHi)>; +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURHHi)>; +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURQi)>; +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURSi)>; +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURWi)>; +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURXi)>; + +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01], (instrs STTRBi)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01], (instrs STTRHi)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01], (instrs STTRWi)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01], (instrs STTRXi)>; + +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STNPDi)>; +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STNPQi)>; +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STNPXi)>; +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STNPWi)>; + +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STPDi)>; +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STPQi)>; +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STPXi)>; +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STPWi)>; + +def : InstRW<[THX2T99Write_1Cyc_LS01_I012_I012], (instrs STRBui)>; +def : InstRW<[THX2T99Write_1Cyc_LS01_I012], (instrs STRBui)>; +def : InstRW<[THX2T99Write_1Cyc_LS01_I012_I012], (instrs STRDui)>; +def : InstRW<[THX2T99Write_1Cyc_LS01_I012], (instrs STRDui)>; +def : InstRW<[THX2T99Write_1Cyc_LS01_I012_I012], (instrs STRHui)>; +def : InstRW<[THX2T99Write_1Cyc_LS01_I012], (instrs STRHui)>; +def : InstRW<[THX2T99Write_1Cyc_LS01_I012_I012], (instrs STRQui)>; +def : InstRW<[THX2T99Write_1Cyc_LS01_I012], (instrs STRQui)>; +def : InstRW<[THX2T99Write_1Cyc_LS01_I012_I012], (instrs STRXui)>; +def : InstRW<[THX2T99Write_1Cyc_LS01_I012], (instrs STRXui)>; +def : InstRW<[THX2T99Write_1Cyc_LS01_I012_I012], (instrs STRWui)>; +def : InstRW<[THX2T99Write_1Cyc_LS01_I012], (instrs STRWui)>; + +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012], + (instrs STPDpre, STPDpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STPDpre, STPDpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012], + (instrs STPDpre, STPDpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STPDpre, STPDpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012], + (instrs STPQpre, STPQpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STPQpre, STPQpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012], + (instrs STPQpre, STPQpost)>; +def : 
InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STPQpre, STPQpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012], + (instrs STPSpre, STPSpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STPSpre, STPSpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012], + (instrs STPSpre, STPSpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STPSpre, STPSpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012], + (instrs STPWpre, STPWpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STPWpre, STPWpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012], + (instrs STPWpre, STPWpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STPWpre, STPWpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012], + (instrs STPXpre, STPXpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STPXpre, STPXpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012], + (instrs STPXpre, STPXpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STPXpre, STPXpost)>; + +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012], + (instrs STRBpre, STRBpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRBpre, STRBpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012], + (instrs STRBpre, STRBpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRBpre, STRBpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012], + (instrs STRBBpre, STRBBpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRBBpre, STRBBpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012], + (instrs STRBBpre, STRBBpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRBBpre, STRBBpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012], + (instrs STRDpre, STRDpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRDpre, STRDpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012], + (instrs STRDpre, STRDpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRDpre, STRDpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012], + (instrs STRHpre, STRHpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRHpre, STRHpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012], + (instrs STRHpre, STRHpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRHpre, STRHpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012], + (instrs STRHHpre, STRHHpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRHHpre, STRHHpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012], + (instrs STRHHpre, STRHHpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRHHpre, STRHHpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012], + (instrs STRQpre, STRQpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRQpre, STRQpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012], + (instrs STRQpre, STRQpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRQpre, STRQpost)>; +def : 
InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012], + (instrs STRSpre, STRSpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRSpre, STRSpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012], + (instrs STRSpre, STRSpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRSpre, STRSpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012], + (instrs STRWpre, STRWpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRWpre, STRWpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012], + (instrs STRWpre, STRWpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRWpre, STRWpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012], + (instrs STRXpre, STRXpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRXpre, STRXpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012], + (instrs STRXpre, STRXpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRXpre, STRXpost)>; + +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRBroW, STRBroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRBroW, STRBroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRBBroW, STRBBroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRBBroW, STRBBroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRDroW, STRDroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRDroW, STRDroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRHroW, STRHroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRHroW, STRHroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRHHroW, STRHHroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRHHroW, STRHHroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRQroW, STRQroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRQroW, STRQroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRSroW, STRSroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRSroW, STRSroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRWroW, STRWroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRWroW, STRWroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRXroW, STRXroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRXroW, STRXroX)>; + //--- // 3.8 FP Data Processing Instructions //--- @@ -389,28 +1059,95 @@ def : WriteRes<WriteSTP, [THX2T99LS01, THX2T99SD]> { // FP absolute value // FP min/max // FP negate -def : WriteRes<WriteF, [THX2T99F01]> { let Latency = 5; } +def : WriteRes<WriteF, [THX2T99F01]> { + let Latency = 5; + let NumMicroOps = 2; +} // FP arithmetic def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FADD", "^FSUB")>; // FP compare -def : WriteRes<WriteFCmp, [THX2T99F01]> { let Latency = 5; } +def : WriteRes<WriteFCmp, [THX2T99F01]> { + let Latency = 5; + let NumMicroOps = 2; +} -// FP divide, S-form -// FP 
square root, S-form -def : WriteRes<WriteFDiv, [THX2T99F01]> { +// FP Mul, Div, Sqrt +def : WriteRes<WriteFDiv, [THX2T99F01]> { + let Latency = 22; + let ResourceCycles = [19]; +} + +def THX2T99XWriteFDiv : SchedWriteRes<[THX2T99F01]> { + let Latency = 16; + let ResourceCycles = [8]; + let NumMicroOps = 4; +} + +def THX2T99XWriteFDivSP : SchedWriteRes<[THX2T99F01]> { let Latency = 16; let ResourceCycles = [8]; + let NumMicroOps = 4; } +def THX2T99XWriteFDivDP : SchedWriteRes<[THX2T99F01]> { + let Latency = 23; + let ResourceCycles = [12]; + let NumMicroOps = 4; +} + +def THX2T99XWriteFSqrtSP : SchedWriteRes<[THX2T99F01]> { + let Latency = 16; + let ResourceCycles = [8]; + let NumMicroOps = 4; +} + +def THX2T99XWriteFSqrtDP : SchedWriteRes<[THX2T99F01]> { + let Latency = 23; + let ResourceCycles = [12]; + let NumMicroOps = 4; +} + +// FP divide, S-form +// FP square root, S-form +def : InstRW<[THX2T99XWriteFDivSP], (instrs FDIVSrr)>; +def : InstRW<[THX2T99XWriteFSqrtSP], (instrs FSQRTSr)>; +def : InstRW<[THX2T99XWriteFDivSP], (instregex "^FDIVv.*32$")>; +def : InstRW<[THX2T99XWriteFSqrtSP], (instregex "^.*SQRT.*32$")>; +def : InstRW<[THX2T99Write_16Cyc_F01], (instregex "^FDIVSrr", "^FSQRTSrr")>; + // FP divide, D-form // FP square root, D-form -def : InstRW<[THX2T99Write_23Cyc_F01], (instrs FDIVDrr, FSQRTDr)>; +def : InstRW<[THX2T99XWriteFDivDP], (instrs FDIVDrr)>; +def : InstRW<[THX2T99XWriteFSqrtDP], (instrs FSQRTDr)>; +def : InstRW<[THX2T99XWriteFDivDP], (instregex "^FDIVv.*64$")>; +def : InstRW<[THX2T99XWriteFSqrtDP], (instregex "^.*SQRT.*64$")>; +def : InstRW<[THX2T99Write_23Cyc_F01], (instregex "^FDIVDrr", "^FSQRTDrr")>; // FP multiply // FP multiply accumulate -def : WriteRes<WriteFMul, [THX2T99F01]> { let Latency = 6; } +def : WriteRes<WriteFMul, [THX2T99F01]> { + let Latency = 6; + let ResourceCycles = [2]; + let NumMicroOps = 3; +} + +def THX2T99XWriteFMul : SchedWriteRes<[THX2T99F01]> { + let Latency = 6; + let ResourceCycles = [2]; + let NumMicroOps = 3; +} + +def THX2T99XWriteFMulAcc : SchedWriteRes<[THX2T99F01]> { + let Latency = 6; + let ResourceCycles = [2]; + let NumMicroOps = 3; +} + +def : InstRW<[THX2T99XWriteFMul], (instregex "^FMUL", "^FNMUL")>; +def : InstRW<[THX2T99XWriteFMulAcc], + (instregex "^FMADD", "^FMSUB", "^FNMADD", "^FNMSUB")>; // FP round to integral def : InstRW<[THX2T99Write_7Cyc_F01], @@ -426,15 +1163,25 @@ def : InstRW<[THX2T99Write_4Cyc_F01], (instregex "^FCSEL")>; // FP convert, from vec to vec reg // FP convert, from gen to vec reg // FP convert, from vec to gen reg -def : WriteRes<WriteFCvt, [THX2T99F01]> { let Latency = 7; } +def : WriteRes<WriteFCvt, [THX2T99F01]> { + let Latency = 7; + let NumMicroOps = 3; +} // FP move, immed // FP move, register -def : WriteRes<WriteFImm, [THX2T99F01]> { let Latency = 4; } +def : WriteRes<WriteFImm, [THX2T99F01]> { + let Latency = 4; + let NumMicroOps = 2; +} // FP transfer, from gen to vec reg // FP transfer, from vec to gen reg -def : WriteRes<WriteFCopy, [THX2T99F01]> { let Latency = 4; } +def : WriteRes<WriteFCopy, [THX2T99F01]> { + let Latency = 4; + let NumMicroOps = 2; +} + def : InstRW<[THX2T99Write_5Cyc_F01], (instrs FMOVXDHighr, FMOVDXHighr)>; //--- @@ -470,19 +1217,135 @@ def : InstRW<[THX2T99Write_5Cyc_F01], (instrs FMOVXDHighr, FMOVDXHighr)>; // ASIMD shift by register, basic, Q-form // ASIMD shift by register, complex, D-form // ASIMD shift by register, complex, Q-form -def : WriteRes<WriteV, [THX2T99F01]> { let Latency = 7; } +def : WriteRes<WriteV, [THX2T99F01]> { + let Latency = 7; + let 
NumMicroOps = 4; + let ResourceCycles = [4, 23]; +} // ASIMD arith, reduce, 4H/4S // ASIMD arith, reduce, 8B/8H // ASIMD arith, reduce, 16B -def : InstRW<[THX2T99Write_5Cyc_F01], - (instregex "^ADDVv", "^SADDLVv", "^UADDLVv")>; // ASIMD logical (MOV, MVN, ORN, ORR) -def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^ORRv", "^ORNv", "^NOTv")>; +def : InstRW<[THX2T99Write_5Cyc_F01], + (instregex "^ANDv", "^BICv", "^EORv", "^MOVv", "^MVNv", + "^ORRv", "^ORNv", "^NOTv")>; +// ASIMD arith, reduce +def : InstRW<[THX2T99Write_10Cyc_F01], + (instregex "^ADDVv", "^SADDLVv", "^UADDLVv")>; // ASIMD polynomial (8x8) multiply long -def : InstRW<[THX2T99Write_5Cyc_F01], (instrs PMULLv8i8, PMULLv16i8)>; +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^(S|U|SQD)MULL")>; +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "(S|U|SQD)(MLAL|MLSL|MULL)v.*")>; +def : InstRW<[THX2T99Write_5Cyc_F1], (instregex "^PMULL(v8i8|v16i8)")>; +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^PMULL(v1i64|v2i64)")>; + +// ASIMD absolute diff accum, D-form +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^[SU]ABA(v8i8|v4i16|v2i32)$")>; +// ASIMD absolute diff accum, Q-form +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^[SU]ABA(v16i8|v8i16|v4i32)$")>; +// ASIMD absolute diff accum long +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^[SU]ABAL")>; +// ASIMD arith, reduce, 4H/4S +def : InstRW<[THX2T99Write_5Cyc_F01], + (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v$")>; +// ASIMD arith, reduce, 8B +def : InstRW<[THX2T99Write_5Cyc_F01], + (instregex "^[SU]?ADDL?V(v8i16|v4i32)v$")>; +// ASIMD arith, reduce, 16B/16H +def : InstRW<[THX2T99Write_10Cyc_F01], + (instregex "^[SU]?ADDL?Vv16i8v$")>; +// ASIMD max/min, reduce, 4H/4S +def : InstRW<[THX2T99Write_10Cyc_F01], + (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v$")>; +// ASIMD max/min, reduce, 8B/8H +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v$")>; +// ASIMD max/min, reduce, 16B/16H +def : InstRW<[THX2T99Write_10Cyc_F01], + (instregex "^[SU](MIN|MAX)Vv16i8v$")>; +// ASIMD multiply, D-form +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^(P?MUL|SQR?DMULH)" # + "(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)" # + "(_indexed)?$")>; +// ASIMD multiply, Q-form +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^(P?MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>; +// ASIMD multiply accumulate, D-form +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^ML[AS](v8i8|v4i16|v2i32)(_indexed)?$")>; +// ASIMD multiply accumulate, Q-form +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^ML[AS](v16i8|v8i16|v4i32)(_indexed)?$")>; +// ASIMD shift accumulate +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "SRSRAv","SSRAv","URSRAv","USRAv")>; + +// ASIMD shift by immed, basic +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "RSHRNv","SHRNv", "SQRSHRNv","SQRSHRUNv", + "SQSHRNv","SQSHRUNv", "UQRSHRNv", + "UQSHRNv","SQXTNv","SQXTUNv","UQXTNv")>; +// ASIMD shift by immed, complex +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^[SU]?(Q|R){1,2}SHR")>; +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^SQSHLU")>; +// ASIMD shift by register, basic, Q-form +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>; +// ASIMD shift by register, complex, D-form +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^[SU][QR]{1,2}SHL" # + "(v1i8|v1i16|v1i32|v1i64|v8i8|v4i16|v2i32|b|d|h|s)")>; +// ASIMD shift by register, complex, Q-form +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex 
"^[SU][QR]{1,2}SHL(v16i8|v8i16|v4i32|v2i64)")>; + +// ASIMD Arithmetic +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "(ADD|SUB)(v8i8|v4i16|v2i32|v1i64)")>; +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "(ADD|SUB)(v16i8|v8i16|v4i32|v2i64)")>; +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "(ADD|SUB)HNv.*")>; +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "(RADD|RSUB)HNv.*")>; +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^SQADD", "^SQNEG", "^SQSUB", "^SRHADD", + "^SUQADD", "^UQADD", "^UQSUB", "^URHADD", "^USQADD")>; +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "ADDP(v16i8|v8i16|v4i32|v2i64)")>; +def : InstRW<[THX2T99Write_5Cyc_F01], + (instregex "((AND|ORN|EOR|EON)S?(Xr[rsi]|v16i8|v8i16|v4i32)|" # + "(ORR|BIC)S?(Xr[rs]|v16i8|v8i16|v4i32))")>; +def : InstRW<[THX2T99Write_5Cyc_F01], + (instregex "(CLS|CLZ|CNT)(v4i32|v8i16|v16i8)")>; +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^SADALP","^UADALP")>; +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^SADDLPv","^UADDLPv")>; +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^SADDLV","^UADDLV")>; +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^ADDVv","^SMAXVv","^UMAXVv","^SMINVv","^UMINVv")>; +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^SABAv","^UABAv","^SABALv","^UABALv")>; +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^SQADDv","^SQSUBv","^UQADDv","^UQSUBv")>; +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^SUQADDv","^USQADDv")>; +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^ADDHNv","^RADDHNv", "^RSUBHNv", + "^SQABS", "^SQADD", "^SQNEG", "^SQSUB", + "^SRHADD", "^SUBHNv", "^SUQADD", + "^UQADD", "^UQSUB", "^URHADD", "^USQADD")>; +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^CMEQv","^CMGEv","^CMGTv", + "^CMLEv","^CMLTv", "^CMHIv","^CMHSv")>; +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^SMAXv","^SMINv","^UMAXv","^UMINv", + "^SMAXPv","^SMINPv","^UMAXPv","^UMINPv")>; +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^SABDv","^UABDv", "^SABDLv","^UABDLv")>; //--- // 3.13 ASIMD Floating-point Instructions @@ -493,7 +1356,8 @@ def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FABSv")>; // ASIMD FP arith, normal, D-form // ASIMD FP arith, normal, Q-form -def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FABDv", "^FADDv", "^FSUBv")>; +def : InstRW<[THX2T99Write_6Cyc_F01], + (instregex "^FABDv", "^FADDv", "^FSUBv")>; // ASIMD FP arith,pairwise, D-form // ASIMD FP arith, pairwise, Q-form @@ -503,8 +1367,15 @@ def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FADDPv")>; // ASIMD FP compare, Q-form def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FACGEv", "^FACGTv")>; def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FCMEQv", "^FCMGEv", - "^FCMGTv", "^FCMLEv", - "^FCMLTv")>; + "^FCMGTv", "^FCMLEv", + "^FCMLTv")>; + +// ASIMD FP round, D-form +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^FRINT[AIMNPXZ](v2f32)")>; +// ASIMD FP round, Q-form +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^FRINT[AIMNPXZ](v4f32|v2f64)")>; // ASIMD FP convert, long // ASIMD FP convert, narrow @@ -512,14 +1383,26 @@ def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FCMEQv", "^FCMGEv", // ASIMD FP convert, other, Q-form // NOTE: Handled by WriteV. 
+// ASIMD FP convert, long and narrow +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^FCVT(L|N|XN)v")>; +// ASIMD FP convert, other, D-form +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v2f32|v1i32|v2i32|v1i64)")>; +// ASIMD FP convert, other, Q-form +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v4f32|v2f64|v4i32|v2i64)")>; + // ASIMD FP divide, D-form, F32 def : InstRW<[THX2T99Write_16Cyc_F01], (instrs FDIVv2f32)>; +def : InstRW<[THX2T99Write_16Cyc_F01], (instregex "FDIVv2f32")>; // ASIMD FP divide, Q-form, F32 def : InstRW<[THX2T99Write_16Cyc_F01], (instrs FDIVv4f32)>; +def : InstRW<[THX2T99Write_16Cyc_F01], (instregex "FDIVv4f32")>; // ASIMD FP divide, Q-form, F64 def : InstRW<[THX2T99Write_23Cyc_F01], (instrs FDIVv2f64)>; +def : InstRW<[THX2T99Write_23Cyc_F01], (instregex "FDIVv2f64")>; // ASIMD FP max/min, normal, D-form // ASIMD FP max/min, normal, Q-form @@ -540,20 +1423,24 @@ def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FMAXVv", "^FMAXNMVv", // ASIMD FP multiply, Q-form, FZ // ASIMD FP multiply, Q-form, no FZ def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FMULv", "^FMULXv")>; +def : InstRW<[THX2T99Write_6Cyc_F01], + (instregex "^FMULX?(v2f32|v1i32|v2i32|v1i64|32|64)")>; +def : InstRW<[THX2T99Write_6Cyc_F01], + (instregex "^FMULX?(v4f32|v2f64|v4i32|v2i64)")>; // ASIMD FP multiply accumulate, Dform, FZ // ASIMD FP multiply accumulate, Dform, no FZ // ASIMD FP multiply accumulate, Qform, FZ // ASIMD FP multiply accumulate, Qform, no FZ def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FMLAv", "^FMLSv")>; +def : InstRW<[THX2T99Write_6Cyc_F01], + (instregex "^FML[AS](v2f32|v1i32|v2i32|v1i64)")>; +def : InstRW<[THX2T99Write_6Cyc_F01], + (instregex "^FML[AS](v4f32|v2f64|v4i32|v2i64)")>; // ASIMD FP negate def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FNEGv")>; -// ASIMD FP round, D-form -// ASIMD FP round, Q-form -// NOTE: Handled by WriteV. - //-- // 3.14 ASIMD Miscellaneous Instructions //-- @@ -563,37 +1450,66 @@ def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^RBITv")>; // ASIMD bitwise insert, D-form // ASIMD bitwise insert, Q-form -def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^BIFv", "^BITv", "^BSLv")>; +def : InstRW<[THX2T99Write_5Cyc_F01], + (instregex "^BIFv", "^BITv", "^BSLv")>; // ASIMD count, D-form // ASIMD count, Q-form -def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^CLSv", "^CLZv", "^CNTv")>; +def : InstRW<[THX2T99Write_5Cyc_F01], + (instregex "^CLSv", "^CLZv", "^CNTv")>; // ASIMD duplicate, gen reg // ASIMD duplicate, element def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^DUPv")>; +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^CPY")>; +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^DUPv.+gpr")>; // ASIMD extract def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^EXTv")>; // ASIMD extract narrow +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^XTNv")>; + // ASIMD extract narrow, saturating -// NOTE: Handled by WriteV. 
+def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^SQXTNv", "^SQXTUNv", "^UQXTNv")>; // ASIMD insert, element to element def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^INSv")>; +// ASIMD transfer, element to gen reg +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^[SU]MOVv")>; + // ASIMD move, integer immed def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^MOVIv", "^MOVIDv")>; // ASIMD move, FP immed def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FMOVv")>; +// ASIMD table lookup, D-form +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v8i8One")>; +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v8i8Two")>; +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v8i8Three")>; +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v8i8Four")>; + +// ASIMD table lookup, Q-form +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v16i8One")>; +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v16i8Two")>; +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v16i8Three")>; +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v16i8Four")>; + +// ASIMD transpose +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^TRN1", "^TRN2")>; + +// ASIMD unzip/zip +def : InstRW<[THX2T99Write_5Cyc_F01], + (instregex "^UZP1", "^UZP2", "^ZIP1", "^ZIP2")>; + // ASIMD reciprocal estimate, D-form // ASIMD reciprocal estimate, Q-form -def : InstRW<[THX2T99Write_5Cyc_F01], +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FRECPEv", "^FRECPXv", "^URECPEv", - "^FRSQRTEv", "^URSQRTEv")>; + "^FRSQRTEv", "^URSQRTEv")>; // ASIMD reciprocal step, D-form, FZ // ASIMD reciprocal step, D-form, no FZ @@ -602,7 +1518,7 @@ def : InstRW<[THX2T99Write_5Cyc_F01], def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FRECPSv", "^FRSQRTSv")>; // ASIMD reverse -def : InstRW<[THX2T99Write_5Cyc_F01], +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^REV16v", "^REV32v", "^REV64v")>; // ASIMD table lookup, D-form @@ -610,135 +1526,135 @@ def : InstRW<[THX2T99Write_5Cyc_F01], def : InstRW<[THX2T99Write_8Cyc_F01], (instregex "^TBLv", "^TBXv")>; // ASIMD transfer, element to word or word -def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^UMOVv")>; +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^[SU]MOVv")>; // ASIMD transfer, element to gen reg -def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^SMOVv", "^UMOVv")>; +def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "(S|U)MOVv.*")>; // ASIMD transfer gen reg to element def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^INSv")>; // ASIMD transpose def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^TRN1v", "^TRN2v", - "^UZP1v", "^UZP2v")>; + "^UZP1v", "^UZP2v")>; // ASIMD unzip/zip def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^ZIP1v", "^ZIP2v")>; //-- -// 3.15 ASIMD Load Instructions +// 3.15 ASIMD Load Instructions //-- // ASIMD load, 1 element, multiple, 1 reg, D-form // ASIMD load, 1 element, multiple, 1 reg, Q-form -def : InstRW<[THX2T99Write_4Cyc_LS01], +def : InstRW<[THX2T99Write_4Cyc_LS01], (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[THX2T99Write_4Cyc_LS01, WriteAdr], +def : InstRW<[THX2T99Write_4Cyc_LS01, WriteAdr], (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; // ASIMD load, 1 element, multiple, 2 reg, D-form // ASIMD load, 1 element, multiple, 2 reg, Q-form -def : InstRW<[THX2T99Write_4Cyc_LS01], +def : InstRW<[THX2T99Write_4Cyc_LS01], (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[THX2T99Write_4Cyc_LS01, WriteAdr], +def : InstRW<[THX2T99Write_4Cyc_LS01, 
WriteAdr], (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; // ASIMD load, 1 element, multiple, 3 reg, D-form // ASIMD load, 1 element, multiple, 3 reg, Q-form -def : InstRW<[THX2T99Write_5Cyc_LS01], +def : InstRW<[THX2T99Write_5Cyc_LS01], (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[THX2T99Write_5Cyc_LS01, WriteAdr], +def : InstRW<[THX2T99Write_5Cyc_LS01, WriteAdr], (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; // ASIMD load, 1 element, multiple, 4 reg, D-form // ASIMD load, 1 element, multiple, 4 reg, Q-form -def : InstRW<[THX2T99Write_6Cyc_LS01], +def : InstRW<[THX2T99Write_6Cyc_LS01], (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[THX2T99Write_6Cyc_LS01, WriteAdr], +def : InstRW<[THX2T99Write_6Cyc_LS01, WriteAdr], (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; // ASIMD load, 1 element, one lane, B/H/S // ASIMD load, 1 element, one lane, D def : InstRW<[THX2T99Write_5Cyc_LS01_F01], (instregex "^LD1i(8|16|32|64)$")>; -def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr], (instregex "^LD1i(8|16|32|64)_POST$")>; // ASIMD load, 1 element, all lanes, D-form, B/H/S // ASIMD load, 1 element, all lanes, D-form, D // ASIMD load, 1 element, all lanes, Q-form -def : InstRW<[THX2T99Write_5Cyc_LS01_F01], +def : InstRW<[THX2T99Write_5Cyc_LS01_F01], (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr], (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; // ASIMD load, 2 element, multiple, D-form, B/H/S // ASIMD load, 2 element, multiple, Q-form, D -def : InstRW<[THX2T99Write_5Cyc_LS01_F01], +def : InstRW<[THX2T99Write_5Cyc_LS01_F01], (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)$")>; -def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr], (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>; // ASIMD load, 2 element, one lane, B/H // ASIMD load, 2 element, one lane, S // ASIMD load, 2 element, one lane, D def : InstRW<[THX2T99Write_5Cyc_LS01_F01], (instregex "^LD2i(8|16|32|64)$")>; -def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr], (instregex "^LD2i(8|16|32|64)_POST$")>; // ASIMD load, 2 element, all lanes, D-form, B/H/S // ASIMD load, 2 element, all lanes, D-form, D // ASIMD load, 2 element, all lanes, Q-form -def : InstRW<[THX2T99Write_5Cyc_LS01_F01], +def : InstRW<[THX2T99Write_5Cyc_LS01_F01], (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr], (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; // ASIMD load, 3 element, multiple, D-form, B/H/S // ASIMD load, 3 element, multiple, Q-form, B/H/S // ASIMD load, 3 element, multiple, Q-form, D -def : InstRW<[THX2T99Write_8Cyc_LS01_F01], +def : InstRW<[THX2T99Write_8Cyc_LS01_F01], (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)$")>; -def : InstRW<[THX2T99Write_8Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_8Cyc_LS01_F01, WriteAdr], (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>; // ASIMD load, 3 element, one lone, B/H // ASIMD load, 3 element, one lane, S // ASIMD load, 3 element, one lane, D def : InstRW<[THX2T99Write_7Cyc_LS01_F01], (instregex "^LD3i(8|16|32|64)$")>; -def : InstRW<[THX2T99Write_7Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_7Cyc_LS01_F01, WriteAdr], (instregex 
"^LD3i(8|16|32|64)_POST$")>; // ASIMD load, 3 element, all lanes, D-form, B/H/S // ASIMD load, 3 element, all lanes, D-form, D // ASIMD load, 3 element, all lanes, Q-form, B/H/S // ASIMD load, 3 element, all lanes, Q-form, D -def : InstRW<[THX2T99Write_7Cyc_LS01_F01], +def : InstRW<[THX2T99Write_7Cyc_LS01_F01], (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[THX2T99Write_7Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_7Cyc_LS01_F01, WriteAdr], (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; // ASIMD load, 4 element, multiple, D-form, B/H/S // ASIMD load, 4 element, multiple, Q-form, B/H/S // ASIMD load, 4 element, multiple, Q-form, D -def : InstRW<[THX2T99Write_8Cyc_LS01_F01], +def : InstRW<[THX2T99Write_8Cyc_LS01_F01], (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>; -def : InstRW<[THX2T99Write_8Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_8Cyc_LS01_F01, WriteAdr], (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>; // ASIMD load, 4 element, one lane, B/H // ASIMD load, 4 element, one lane, S // ASIMD load, 4 element, one lane, D def : InstRW<[THX2T99Write_6Cyc_LS01_F01], (instregex "^LD4i(8|16|32|64)$")>; -def : InstRW<[THX2T99Write_6Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_6Cyc_LS01_F01, WriteAdr], (instregex "^LD4i(8|16|32|64)_POST$")>; // ASIMD load, 4 element, all lanes, D-form, B/H/S // ASIMD load, 4 element, all lanes, D-form, D // ASIMD load, 4 element, all lanes, Q-form, B/H/S // ASIMD load, 4 element, all lanes, Q-form, D -def : InstRW<[THX2T99Write_6Cyc_LS01_F01], +def : InstRW<[THX2T99Write_6Cyc_LS01_F01], (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[THX2T99Write_6Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_6Cyc_LS01_F01, WriteAdr], (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; //-- @@ -747,106 +1663,83 @@ def : InstRW<[THX2T99Write_6Cyc_LS01_F01, WriteAdr], // ASIMD store, 1 element, multiple, 1 reg, D-form // ASIMD store, 1 element, multiple, 1 reg, Q-form -def : InstRW<[THX2T99Write_1Cyc_LS01], +def : InstRW<[THX2T99Write_1Cyc_LS01], (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr], +def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr], (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; // ASIMD store, 1 element, multiple, 2 reg, D-form // ASIMD store, 1 element, multiple, 2 reg, Q-form -def : InstRW<[THX2T99Write_1Cyc_LS01], +def : InstRW<[THX2T99Write_1Cyc_LS01], (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr], +def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr], (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; // ASIMD store, 1 element, multiple, 3 reg, D-form // ASIMD store, 1 element, multiple, 3 reg, Q-form -def : InstRW<[THX2T99Write_1Cyc_LS01], +def : InstRW<[THX2T99Write_1Cyc_LS01], (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr], +def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr], (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; // ASIMD store, 1 element, multiple, 4 reg, D-form // ASIMD store, 1 element, multiple, 4 reg, Q-form -def : InstRW<[THX2T99Write_1Cyc_LS01], +def : InstRW<[THX2T99Write_1Cyc_LS01], (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr], +def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr], (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; // ASIMD store, 1 element, one lane, B/H/S // ASIMD store, 1 element, one lane, D -def : 
InstRW<[THX2T99Write_1Cyc_LS01_F01], +def : InstRW<[THX2T99Write_1Cyc_LS01_F01], (instregex "^ST1i(8|16|32|64)$")>; -def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], (instregex "^ST1i(8|16|32|64)_POST$")>; // ASIMD store, 2 element, multiple, D-form, B/H/S // ASIMD store, 2 element, multiple, Q-form, B/H/S // ASIMD store, 2 element, multiple, Q-form, D -def : InstRW<[THX2T99Write_1Cyc_LS01_F01], +def : InstRW<[THX2T99Write_1Cyc_LS01_F01], (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)$")>; -def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>; // ASIMD store, 2 element, one lane, B/H/S // ASIMD store, 2 element, one lane, D -def : InstRW<[THX2T99Write_1Cyc_LS01_F01], +def : InstRW<[THX2T99Write_1Cyc_LS01_F01], (instregex "^ST2i(8|16|32|64)$")>; -def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], (instregex "^ST2i(8|16|32|64)_POST$")>; // ASIMD store, 3 element, multiple, D-form, B/H/S // ASIMD store, 3 element, multiple, Q-form, B/H/S // ASIMD store, 3 element, multiple, Q-form, D -def : InstRW<[THX2T99Write_1Cyc_LS01_F01], +def : InstRW<[THX2T99Write_1Cyc_LS01_F01], (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)$")>; -def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>; // ASIMD store, 3 element, one lane, B/H // ASIMD store, 3 element, one lane, S // ASIMD store, 3 element, one lane, D def : InstRW<[THX2T99Write_1Cyc_LS01_F01], (instregex "^ST3i(8|16|32|64)$")>; -def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], (instregex "^ST3i(8|16|32|64)_POST$")>; // ASIMD store, 4 element, multiple, D-form, B/H/S // ASIMD store, 4 element, multiple, Q-form, B/H/S // ASIMD store, 4 element, multiple, Q-form, D -def : InstRW<[THX2T99Write_1Cyc_LS01_F01], +def : InstRW<[THX2T99Write_1Cyc_LS01_F01], (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>; -def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>; // ASIMD store, 4 element, one lane, B/H // ASIMD store, 4 element, one lane, S // ASIMD store, 4 element, one lane, D def : InstRW<[THX2T99Write_1Cyc_LS01_F01], (instregex "^ST4i(8|16|32|64)$")>; -def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], (instregex "^ST4i(8|16|32|64)_POST$")>; -//-- -// 3.17 Cryptography Extensions -//-- - -// Crypto AES ops -def : InstRW<[THX2T99Write_5Cyc_F1], (instregex "^AES")>; - -// Crypto polynomial (64x64) multiply long -def : InstRW<[THX2T99Write_5Cyc_F1], (instrs PMULLv1i64, PMULLv2i64)>; - -// Crypto SHA1 xor ops -// Crypto SHA1 schedule acceleration ops -// Crypto SHA256 schedule acceleration op (1 u-op) -// Crypto SHA256 schedule acceleration op (2 u-ops) -// Crypto SHA256 hash acceleration ops -def : InstRW<[THX2T99Write_7Cyc_F1], (instregex "^SHA")>; - -//-- -// 3.18 CRC -//-- - -// CRC checksum ops -def : InstRW<[THX2T99Write_4Cyc_I1], (instregex "^CRC32")>; - } // SchedModel = ThunderX2T99Model + diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp index 6660f0babb8a..1252f9403812 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.cpp +++ 
b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -167,6 +167,8 @@ extern "C" void LLVMInitializeAArch64Target() { static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { if (TT.isOSBinFormatMachO()) return llvm::make_unique<AArch64_MachoTargetObjectFile>(); + if (TT.isOSBinFormatCOFF()) + return llvm::make_unique<AArch64_COFFTargetObjectFile>(); return llvm::make_unique<AArch64_ELFTargetObjectFile>(); } @@ -179,6 +181,8 @@ static std::string computeDataLayout(const Triple &TT, return "e-m:e-p:32:32-i8:8-i16:16-i64:64-S128"; if (TT.isOSBinFormatMachO()) return "e-m:o-i64:64-i128:128-n32:64-S128"; + if (TT.isOSBinFormatCOFF()) + return "e-m:w-i64:64-i128:128-n32:64-S128"; if (LittleEndian) return "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"; return "E-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"; diff --git a/lib/Target/AArch64/AArch64TargetMachine.h b/lib/Target/AArch64/AArch64TargetMachine.h index 2c75a3258c1c..fefa7e26b79f 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.h +++ b/lib/Target/AArch64/AArch64TargetMachine.h @@ -36,6 +36,7 @@ public: ~AArch64TargetMachine() override; const AArch64Subtarget *getSubtargetImpl(const Function &F) const override; + const AArch64Subtarget *getSubtargetImpl() const = delete; // Pass Pipeline Configuration TargetPassConfig *createPassConfig(PassManagerBase &PM) override; diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.h b/lib/Target/AArch64/AArch64TargetObjectFile.h index 47e3bce43f6e..9077eb7902fd 100644 --- a/lib/Target/AArch64/AArch64TargetObjectFile.h +++ b/lib/Target/AArch64/AArch64TargetObjectFile.h @@ -45,6 +45,9 @@ public: const TargetMachine &TM) const override; }; +/// This implementation is used for AArch64 COFF targets. +class AArch64_COFFTargetObjectFile : public TargetLoweringObjectFileCOFF {}; + } // end namespace llvm #endif diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index a4328682b93c..a76f080530bb 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -20,6 +20,23 @@ using namespace llvm; #define DEBUG_TYPE "aarch64tti" +static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", + cl::init(true), cl::Hidden); + +bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, + const Function *Callee) const { + const TargetMachine &TM = getTLI()->getTargetMachine(); + + const FeatureBitset &CallerBits = + TM.getSubtargetImpl(*Caller)->getFeatureBits(); + const FeatureBitset &CalleeBits = + TM.getSubtargetImpl(*Callee)->getFeatureBits(); + + // Inline a callee if its target-features are a subset of the callers + // target-features. + return (CallerBits & CalleeBits) == CalleeBits; +} + /// \brief Calculate the cost of materializing a 64-bit value. This helper /// method might only calculate a fraction of a larger immediate. Therefore it /// is valid to return a cost of ZERO. @@ -631,10 +648,62 @@ unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) { return ST->getMaxInterleaveFactor(); } -void AArch64TTIImpl::getUnrollingPreferences(Loop *L, +// For Falkor, we want to avoid having too many strided loads in a loop since +// that can exhaust the HW prefetcher resources. We adjust the unroller +// MaxCount preference below to attempt to ensure unrolling doesn't create too +// many strided loads. 
+static void +getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, + TargetTransformInfo::UnrollingPreferences &UP) { + enum { MaxStridedLoads = 7 }; + auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) { + int StridedLoads = 0; + // FIXME? We could make this more precise by looking at the CFG and + // e.g. not counting loads in each side of an if-then-else diamond. + for (const auto BB : L->blocks()) { + for (auto &I : *BB) { + LoadInst *LMemI = dyn_cast<LoadInst>(&I); + if (!LMemI) + continue; + + Value *PtrValue = LMemI->getPointerOperand(); + if (L->isLoopInvariant(PtrValue)) + continue; + + const SCEV *LSCEV = SE.getSCEV(PtrValue); + const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV); + if (!LSCEVAddRec || !LSCEVAddRec->isAffine()) + continue; + + // FIXME? We could take pairing of unrolled load copies into account + // by looking at the AddRec, but we would probably have to limit this + // to loops with no stores or other memory optimization barriers. + ++StridedLoads; + // We've seen enough strided loads that seeing more won't make a + // difference. + if (StridedLoads > MaxStridedLoads / 2) + return StridedLoads; + } + } + return StridedLoads; + }; + + int StridedLoads = countStridedLoads(L, SE); + DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads + << " strided loads\n"); + // Pick the largest power of 2 unroll count that won't result in too many + // strided loads. + if (StridedLoads) { + UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads); + DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to " << UP.MaxCount + << '\n'); + } +} + +void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP) { // Enable partial unrolling and runtime unrolling. - BaseT::getUnrollingPreferences(L, UP); + BaseT::getUnrollingPreferences(L, SE, UP); // For inner loop, it is more likely to be a hot one, and the runtime check // can be promoted out from LICM pass, so the overhead is less, let's try @@ -644,6 +713,10 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L, // Disable partial & runtime unrolling on -Os. 
UP.PartialOptSizeThreshold = 0; + + if (ST->getProcFamily() == AArch64Subtarget::Falkor && + EnableFalkorHWPFUnrollFix) + getFalkorUnrollingPreferences(L, SE, UP); } Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h index 290a1ca1f24b..31c037354925 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -51,6 +51,9 @@ public: : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} + bool areInlineCompatible(const Function *Caller, + const Function *Callee) const; + /// \name Scalar TTI Implementations /// @{ @@ -119,7 +122,8 @@ public: int getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys); - void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); + void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, + TTI::UnrollingPreferences &UP); Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType); diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index 3d075018904c..475f91016840 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -541,14 +541,13 @@ public: return createAArch64ELFObjectWriter(OS, OSABI, IsLittleEndian, IsILP32); } - void processFixupValue(const MCAssembler &Asm, const MCFixup &Fixup, - const MCValue &Target, bool &IsResolved) override; + bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target) override; }; -void ELFAArch64AsmBackend::processFixupValue(const MCAssembler &Asm, - const MCFixup &Fixup, - const MCValue &Target, - bool &IsResolved) { +bool ELFAArch64AsmBackend::shouldForceRelocation(const MCAssembler &Asm, + const MCFixup &Fixup, + const MCValue &Target) { // The ADRP instruction adds some multiple of 0x1000 to the current PC & // ~0xfff. This means that the required offset to reach a symbol can vary by // up to one step depending on where the ADRP is in memory. For example: @@ -562,11 +561,24 @@ void ELFAArch64AsmBackend::processFixupValue(const MCAssembler &Asm, // section isn't 0x1000-aligned, we therefore need to delegate this decision // to the linker -- a relocation! 
if ((uint32_t)Fixup.getKind() == AArch64::fixup_aarch64_pcrel_adrp_imm21) - IsResolved = false; + return true; + return false; } } +namespace { +class COFFAArch64AsmBackend : public AArch64AsmBackend { +public: + COFFAArch64AsmBackend(const Target &T, const Triple &TheTriple) + : AArch64AsmBackend(T, /*IsLittleEndian*/true) {} + + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + return createAArch64WinCOFFObjectWriter(OS); + } +}; +} + MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T, const MCRegisterInfo &MRI, const Triple &TheTriple, @@ -575,7 +587,11 @@ MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T, if (TheTriple.isOSBinFormatMachO()) return new DarwinAArch64AsmBackend(T, MRI); - assert(TheTriple.isOSBinFormatELF() && "Expect either MachO or ELF target"); + if (TheTriple.isOSBinFormatCOFF()) + return new COFFAArch64AsmBackend(T, TheTriple); + + assert(TheTriple.isOSBinFormatELF() && "Invalid target"); + uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS()); bool IsILP32 = Options.getABIName() == "ilp32"; return new ELFAArch64AsmBackend(T, OSABI, /*IsLittleEndian=*/true, IsILP32); diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp index f7dda92fb551..89c3e5b4c76e 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -49,10 +49,11 @@ AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI, /*HasRelocationAddend*/ true), IsILP32(IsILP32) {} -#define R_CLS(rtype) \ - IsILP32 ? ELF::R_AARCH64_P32_##rtype : ELF::R_AARCH64_##rtype -#define BAD_ILP32_MOV(lp64rtype) "ILP32 absolute MOV relocation not "\ - "supported (LP64 eqv: " #lp64rtype ")" +#define R_CLS(rtype) \ + IsILP32 ? 
ELF::R_AARCH64_P32_##rtype : ELF::R_AARCH64_##rtype +#define BAD_ILP32_MOV(lp64rtype) \ + "ILP32 absolute MOV relocation not " \ + "supported (LP64 eqv: " #lp64rtype ")" // assumes IsILP32 is true static bool isNonILP32reloc(const MCFixup &Fixup, @@ -60,44 +61,45 @@ static bool isNonILP32reloc(const MCFixup &Fixup, MCContext &Ctx) { if ((unsigned)Fixup.getKind() != AArch64::fixup_aarch64_movw) return false; - switch(RefKind) { - case AArch64MCExpr::VK_ABS_G3: - Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G3)); - return true; - case AArch64MCExpr::VK_ABS_G2: - Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G2)); - return true; - case AArch64MCExpr::VK_ABS_G2_S: - Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_SABS_G2)); - return true; - case AArch64MCExpr::VK_ABS_G2_NC: - Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G2_NC)); - return true; - case AArch64MCExpr::VK_ABS_G1_S: - Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_SABS_G1)); - return true; - case AArch64MCExpr::VK_ABS_G1_NC: - Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G1_NC)); - return true; - case AArch64MCExpr::VK_DTPREL_G2: - Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLD_MOVW_DTPREL_G2)); - return true; - case AArch64MCExpr::VK_DTPREL_G1_NC: - Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLD_MOVW_DTPREL_G1_NC)); - return true; - case AArch64MCExpr::VK_TPREL_G2: - Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLE_MOVW_TPREL_G2)); - return true; - case AArch64MCExpr::VK_TPREL_G1_NC: - Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLE_MOVW_TPREL_G1_NC)); - return true; - case AArch64MCExpr::VK_GOTTPREL_G1: - Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSIE_MOVW_GOTTPREL_G1)); - return true; - case AArch64MCExpr::VK_GOTTPREL_G0_NC: - Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSIE_MOVW_GOTTPREL_G0_NC)); - return true; - default: return false; + switch (RefKind) { + case AArch64MCExpr::VK_ABS_G3: + Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G3)); + return true; + case AArch64MCExpr::VK_ABS_G2: + Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G2)); + return true; + case AArch64MCExpr::VK_ABS_G2_S: + Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_SABS_G2)); + return true; + case AArch64MCExpr::VK_ABS_G2_NC: + Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G2_NC)); + return true; + case AArch64MCExpr::VK_ABS_G1_S: + Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_SABS_G1)); + return true; + case AArch64MCExpr::VK_ABS_G1_NC: + Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G1_NC)); + return true; + case AArch64MCExpr::VK_DTPREL_G2: + Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLD_MOVW_DTPREL_G2)); + return true; + case AArch64MCExpr::VK_DTPREL_G1_NC: + Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLD_MOVW_DTPREL_G1_NC)); + return true; + case AArch64MCExpr::VK_TPREL_G2: + Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLE_MOVW_TPREL_G2)); + return true; + case AArch64MCExpr::VK_TPREL_G1_NC: + Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLE_MOVW_TPREL_G1_NC)); + return true; + case AArch64MCExpr::VK_GOTTPREL_G1: + Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSIE_MOVW_GOTTPREL_G1)); + return true; + case AArch64MCExpr::VK_GOTTPREL_G0_NC: + Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSIE_MOVW_GOTTPREL_G0_NC)); + return true; + default: + return false; } return false; } @@ -130,7 +132,8 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, return R_CLS(PREL32); case FK_Data_8: if (IsILP32) { - 
Ctx.reportError(Fixup.getLoc(), "ILP32 8 byte PC relative data " + Ctx.reportError(Fixup.getLoc(), + "ILP32 8 byte PC relative data " "relocation not supported (LP64 eqv: PREL64)"); return ELF::R_AARCH64_NONE; } else @@ -178,7 +181,7 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, } } else { if (IsILP32 && isNonILP32reloc(Fixup, RefKind, Ctx)) - return ELF::R_AARCH64_NONE; + return ELF::R_AARCH64_NONE; switch ((unsigned)Fixup.getKind()) { case FK_Data_1: Ctx.reportError(Fixup.getLoc(), "1-byte data relocations not supported"); @@ -189,8 +192,9 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, return R_CLS(ABS32); case FK_Data_8: if (IsILP32) { - Ctx.reportError(Fixup.getLoc(), "ILP32 8 byte absolute data " - "relocation not supported (LP64 eqv: ABS64)"); + Ctx.reportError(Fixup.getLoc(), + "ILP32 8 byte absolute data " + "relocation not supported (LP64 eqv: ABS64)"); return ELF::R_AARCH64_NONE; } else return ELF::R_AARCH64_ABS64; @@ -262,7 +266,7 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, } else { Ctx.reportError(Fixup.getLoc(), "LP64 4 byte unchecked GOT load/store relocation " - "not supported (ILP32 eqv: LD32_GOT_LO12_NC"); + "not supported (ILP32 eqv: LD32_GOT_LO12_NC"); return ELF::R_AARCH64_NONE; } } @@ -270,12 +274,12 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, if (IsILP32) { Ctx.reportError(Fixup.getLoc(), "ILP32 4 byte checked GOT load/store relocation " - "not supported (unchecked eqv: LD32_GOT_LO12_NC)"); + "not supported (unchecked eqv: LD32_GOT_LO12_NC)"); } else { Ctx.reportError(Fixup.getLoc(), "LP64 4 byte checked GOT load/store relocation " - "not supported (unchecked/ILP32 eqv: " - "LD32_GOT_LO12_NC)"); + "not supported (unchecked/ILP32 eqv: " + "LD32_GOT_LO12_NC)"); } return ELF::R_AARCH64_NONE; } @@ -283,7 +287,8 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, if (IsILP32) { return ELF::R_AARCH64_P32_TLSIE_LD32_GOTTPREL_LO12_NC; } else { - Ctx.reportError(Fixup.getLoc(), "LP64 32-bit load/store " + Ctx.reportError(Fixup.getLoc(), + "LP64 32-bit load/store " "relocation not supported (ILP32 eqv: " "TLSIE_LD32_GOTTPREL_LO12_NC)"); return ELF::R_AARCH64_NONE; @@ -295,14 +300,14 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, } else { Ctx.reportError(Fixup.getLoc(), "LP64 4 byte TLSDESC load/store relocation " - "not supported (ILP32 eqv: TLSDESC_LD64_LO12)"); + "not supported (ILP32 eqv: TLSDESC_LD64_LO12)"); return ELF::R_AARCH64_NONE; } } Ctx.reportError(Fixup.getLoc(), "invalid fixup for 32-bit load/store instruction " - "fixup_aarch64_ldst_imm12_scale4"); + "fixup_aarch64_ldst_imm12_scale4"); return ELF::R_AARCH64_NONE; case AArch64::fixup_aarch64_ldst_imm12_scale8: if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) @@ -312,8 +317,8 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, return ELF::R_AARCH64_LD64_GOT_LO12_NC; } else { Ctx.reportError(Fixup.getLoc(), "ILP32 64-bit load/store " - "relocation not supported (LP64 eqv: " - "LD64_GOT_LO12_NC)"); + "relocation not supported (LP64 eqv: " + "LD64_GOT_LO12_NC)"); return ELF::R_AARCH64_NONE; } } @@ -330,8 +335,8 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, return ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC; } else { Ctx.reportError(Fixup.getLoc(), "ILP32 64-bit load/store " - "relocation not supported (LP64 eqv: " - "TLSIE_LD64_GOTTPREL_LO12_NC)"); + "relocation not supported (LP64 eqv: " + "TLSIE_LD64_GOTTPREL_LO12_NC)"); return ELF::R_AARCH64_NONE; } } @@ -340,8 +345,8 @@ unsigned 
AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, return ELF::R_AARCH64_TLSDESC_LD64_LO12; } else { Ctx.reportError(Fixup.getLoc(), "ILP32 64-bit load/store " - "relocation not supported (LP64 eqv: " - "TLSDESC_LD64_LO12)"); + "relocation not supported (LP64 eqv: " + "TLSDESC_LD64_LO12)"); return ELF::R_AARCH64_NONE; } } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index 031aa8b81e35..a0de3c39562b 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -14,6 +14,7 @@ //===----------------------------------------------------------------------===// #include "AArch64TargetStreamer.h" +#include "AArch64WinCOFFStreamer.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" @@ -30,6 +31,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbolELF.h" +#include "llvm/MC/MCWinCOFFStreamer.h" #include "llvm/Support/Casting.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/raw_ostream.h" @@ -210,6 +212,8 @@ createAArch64ObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) { const Triple &TT = STI.getTargetTriple(); if (TT.isOSBinFormatELF()) return new AArch64TargetELFStreamer(S); + if (TT.isOSBinFormatCOFF()) + return new AArch64TargetWinCOFFStreamer(S); return nullptr; } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h b/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h index 0f5b765c7697..4293dcba955e 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h @@ -16,53 +16,47 @@ namespace llvm { namespace AArch64 { enum Fixups { - // fixup_aarch64_pcrel_adr_imm21 - A 21-bit pc-relative immediate inserted into - // an ADR instruction. + // A 21-bit pc-relative immediate inserted into an ADR instruction. fixup_aarch64_pcrel_adr_imm21 = FirstTargetFixupKind, - // fixup_aarch64_pcrel_adrp_imm21 - A 21-bit pc-relative immediate inserted into - // an ADRP instruction. + // A 21-bit pc-relative immediate inserted into an ADRP instruction. fixup_aarch64_pcrel_adrp_imm21, - // fixup_aarch64_imm12 - 12-bit fixup for add/sub instructions. - // No alignment adjustment. All value bits are encoded. + // 12-bit fixup for add/sub instructions. No alignment adjustment. All value + // bits are encoded. fixup_aarch64_add_imm12, - // fixup_aarch64_ldst_imm12_* - unsigned 12-bit fixups for load and - // store instructions. + // unsigned 12-bit fixups for load and store instructions. fixup_aarch64_ldst_imm12_scale1, fixup_aarch64_ldst_imm12_scale2, fixup_aarch64_ldst_imm12_scale4, fixup_aarch64_ldst_imm12_scale8, fixup_aarch64_ldst_imm12_scale16, - // fixup_aarch64_ldr_pcrel_imm19 - The high 19 bits of a 21-bit pc-relative - // immediate. Same encoding as fixup_aarch64_pcrel_adrhi, except this is used by - // pc-relative loads and generates relocations directly when necessary. + // The high 19 bits of a 21-bit pc-relative immediate. Same encoding as + // fixup_aarch64_pcrel_adrhi, except this is used by pc-relative loads and + // generates relocations directly when necessary. fixup_aarch64_ldr_pcrel_imm19, // FIXME: comment fixup_aarch64_movw, - // fixup_aarch64_pcrel_imm14 - The high 14 bits of a 21-bit pc-relative - // immediate. + // The high 14 bits of a 21-bit pc-relative immediate. 
fixup_aarch64_pcrel_branch14, - // fixup_aarch64_pcrel_branch19 - The high 19 bits of a 21-bit pc-relative - // immediate. Same encoding as fixup_aarch64_pcrel_adrhi, except this is use by - // b.cc and generates relocations directly when necessary. + // The high 19 bits of a 21-bit pc-relative immediate. Same encoding as + // fixup_aarch64_pcrel_adrhi, except this is use by b.cc and generates + // relocations directly when necessary. fixup_aarch64_pcrel_branch19, - // fixup_aarch64_pcrel_branch26 - The high 26 bits of a 28-bit pc-relative - // immediate. + // The high 26 bits of a 28-bit pc-relative immediate. fixup_aarch64_pcrel_branch26, - // fixup_aarch64_pcrel_call26 - The high 26 bits of a 28-bit pc-relative - // immediate. Distinguished from branch26 only on ELF. + // The high 26 bits of a 28-bit pc-relative immediate. Distinguished from + // branch26 only on ELF. fixup_aarch64_pcrel_call26, - // fixup_aarch64_tlsdesc_call - zero-space placeholder for the ELF - // R_AARCH64_TLSDESC_CALL relocation. + // zero-space placeholder for the ELF R_AARCH64_TLSDESC_CALL relocation. fixup_aarch64_tlsdesc_call, // Marker diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp index 1b28df963b40..fc808ee0cdd6 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp @@ -100,3 +100,7 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) { HasIdentDirective = true; } + +AArch64MCAsmInfoCOFF::AArch64MCAsmInfoCOFF() { + CommentString = ";"; +} diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h index 253cd30f26ee..2d7107a37244 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h @@ -14,6 +14,7 @@ #ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCASMINFO_H #define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCASMINFO_H +#include "llvm/MC/MCAsmInfoCOFF.h" #include "llvm/MC/MCAsmInfoDarwin.h" #include "llvm/MC/MCAsmInfoELF.h" @@ -33,6 +34,10 @@ struct AArch64MCAsmInfoELF : public MCAsmInfoELF { explicit AArch64MCAsmInfoELF(const Triple &T); }; +struct AArch64MCAsmInfoCOFF : public MCAsmInfoCOFF { + explicit AArch64MCAsmInfoCOFF(); +}; + } // namespace llvm #endif diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp index f710065d9bc7..a2555496cdb9 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp @@ -14,6 +14,7 @@ #include "AArch64MCTargetDesc.h" #include "AArch64ELFStreamer.h" #include "AArch64MCAsmInfo.h" +#include "AArch64WinCOFFStreamer.h" #include "InstPrinter/AArch64InstPrinter.h" #include "llvm/MC/MCInstrAnalysis.h" #include "llvm/MC/MCInstrInfo.h" @@ -59,8 +60,10 @@ static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI, MCAsmInfo *MAI; if (TheTriple.isOSBinFormatMachO()) MAI = new AArch64MCAsmInfoDarwin(); + else if (TheTriple.isOSBinFormatCOFF()) + MAI = new AArch64MCAsmInfoCOFF(); else { - assert(TheTriple.isOSBinFormatELF() && "Only expect Darwin or ELF"); + assert(TheTriple.isOSBinFormatELF() && "Invalid target"); MAI = new AArch64MCAsmInfoELF(TheTriple); } @@ -74,8 +77,8 @@ static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI, static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM, CodeModel::Model &CM) { - 
assert((TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()) && - "Only expect Darwin and ELF targets"); + assert((TT.isOSBinFormatELF() || TT.isOSBinFormatMachO() || + TT.isOSBinFormatCOFF()) && "Invalid target"); if (CM == CodeModel::Default) CM = CodeModel::Small; @@ -122,6 +125,14 @@ static MCStreamer *createMachOStreamer(MCContext &Ctx, MCAsmBackend &TAB, /*LabelSections*/ true); } +static MCStreamer *createWinCOFFStreamer(MCContext &Ctx, MCAsmBackend &TAB, + raw_pwrite_stream &OS, + MCCodeEmitter *Emitter, bool RelaxAll, + bool IncrementalLinkerCompatible) { + return createAArch64WinCOFFStreamer(Ctx, TAB, OS, Emitter, RelaxAll, + IncrementalLinkerCompatible); +} + static MCInstrAnalysis *createAArch64InstrAnalysis(const MCInstrInfo *Info) { return new MCInstrAnalysis(Info); } @@ -154,6 +165,7 @@ extern "C" void LLVMInitializeAArch64TargetMC() { // Register the obj streamers. TargetRegistry::RegisterELFStreamer(*T, createELFStreamer); TargetRegistry::RegisterMachOStreamer(*T, createMachOStreamer); + TargetRegistry::RegisterCOFFStreamer(*T, createWinCOFFStreamer); // Register the obj target streamer. TargetRegistry::RegisterObjectTargetStreamer( diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h index 615d7dab2c51..1404926b8124 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h @@ -60,6 +60,8 @@ MCObjectWriter *createAArch64MachObjectWriter(raw_pwrite_stream &OS, uint32_t CPUType, uint32_t CPUSubtype); +MCObjectWriter *createAArch64WinCOFFObjectWriter(raw_pwrite_stream &OS); + MCTargetStreamer *createAArch64AsmTargetStreamer(MCStreamer &S, formatted_raw_ostream &OS, MCInstPrinter *InstPrint, diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp new file mode 100644 index 000000000000..7862a03e771c --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp @@ -0,0 +1,65 @@ +//= AArch64WinCOFFObjectWriter.cpp - AArch64 Windows COFF Object Writer C++ =// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===---------------------------------------------------------------------===// + +#include "MCTargetDesc/AArch64FixupKinds.h" +#include "llvm/ADT/Twine.h" +#include "llvm/BinaryFormat/COFF.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCFixup.h" +#include "llvm/MC/MCFixupKindInfo.h" +#include "llvm/MC/MCValue.h" +#include "llvm/MC/MCWinCOFFObjectWriter.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include <cassert> + +using namespace llvm; + +namespace { + +class AArch64WinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter { +public: + AArch64WinCOFFObjectWriter() + : MCWinCOFFObjectTargetWriter(COFF::IMAGE_FILE_MACHINE_ARM64) { + } + + ~AArch64WinCOFFObjectWriter() override = default; + + unsigned getRelocType(MCContext &Ctx, const MCValue &Target, + const MCFixup &Fixup, bool IsCrossSection, + const MCAsmBackend &MAB) const override; + + bool recordRelocation(const MCFixup &) const override; +}; + +} // end anonymous namespace + +unsigned +AArch64WinCOFFObjectWriter::getRelocType(MCContext &Ctx, + const MCValue &Target, + const MCFixup &Fixup, + bool IsCrossSection, + const MCAsmBackend &MAB) const { + const MCFixupKindInfo &Info = MAB.getFixupKindInfo(Fixup.getKind()); + report_fatal_error(Twine("unsupported relocation type: ") + Info.Name); +} + +bool AArch64WinCOFFObjectWriter::recordRelocation(const MCFixup &Fixup) const { + return true; +} + +namespace llvm { + +MCObjectWriter *createAArch64WinCOFFObjectWriter(raw_pwrite_stream &OS) { + MCWinCOFFObjectTargetWriter *MOTW = new AArch64WinCOFFObjectWriter(); + return createWinCOFFObjectWriter(MOTW, OS); +} + +} // end namespace llvm diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp new file mode 100644 index 000000000000..6c8da27e398f --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp @@ -0,0 +1,37 @@ +//===-- AArch64WinCOFFStreamer.cpp - ARM Target WinCOFF Streamer ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "AArch64WinCOFFStreamer.h" + +using namespace llvm; + +namespace { + +class AArch64WinCOFFStreamer : public MCWinCOFFStreamer { +public: + friend class AArch64TargetWinCOFFStreamer; + + AArch64WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, MCCodeEmitter &CE, + raw_pwrite_stream &OS) + : MCWinCOFFStreamer(C, AB, CE, OS) {} +}; +} // end anonymous namespace + +namespace llvm { +MCWinCOFFStreamer +*createAArch64WinCOFFStreamer(MCContext &Context, MCAsmBackend &MAB, + raw_pwrite_stream &OS, + MCCodeEmitter *Emitter, bool RelaxAll, + bool IncrementalLinkerCompatible) { + auto *S = new AArch64WinCOFFStreamer(Context, MAB, *Emitter, OS); + S->getAssembler().setIncrementalLinkerCompatible(IncrementalLinkerCompatible); + return S; +} + +} // end llvm namespace diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h new file mode 100644 index 000000000000..1b4fcd6804e2 --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h @@ -0,0 +1,43 @@ +//===-- AArch64WinCOFFStreamer.h - WinCOFF Streamer for AArch64 -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements WinCOFF streamer information for the AArch64 backend. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64WINCOFFSTREAMER_H +#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64WINCOFFSTREAMER_H + +#include "AArch64TargetStreamer.h" +#include "llvm/MC/MCWinCOFFStreamer.h" + +namespace { +class AArch64WinCOFFStreamer; + +class AArch64TargetWinCOFFStreamer : public llvm::AArch64TargetStreamer { +private: + AArch64WinCOFFStreamer &getStreamer(); + +public: + AArch64TargetWinCOFFStreamer(llvm::MCStreamer &S) + : AArch64TargetStreamer(S) {} +}; + +} // end anonymous namespace + +namespace llvm { + +MCWinCOFFStreamer +*createAArch64WinCOFFStreamer(MCContext &Context, MCAsmBackend &TAB, + raw_pwrite_stream &OS, + MCCodeEmitter *Emitter, bool RelaxAll, + bool IncrementalLinkerCompatible); +} // end llvm namespace + +#endif diff --git a/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt b/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt index 6d8be5e63fbb..56eeba8a1d4b 100644 --- a/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt +++ b/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt @@ -8,6 +8,8 @@ add_llvm_library(LLVMAArch64Desc AArch64MCTargetDesc.cpp AArch64MachObjectWriter.cpp AArch64TargetStreamer.cpp + AArch64WinCOFFObjectWriter.cpp + AArch64WinCOFFStreamer.cpp ) add_dependencies(LLVMAArch64Desc AArch64CommonTableGen) diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h index 55d18c3f3646..5a799b2d88d0 100644 --- a/lib/Target/AMDGPU/AMDGPU.h +++ b/lib/Target/AMDGPU/AMDGPU.h @@ -36,7 +36,6 @@ FunctionPass *createR600ControlFlowFinalizer(); FunctionPass *createAMDGPUCFGStructurizerPass(); // SI Passes -FunctionPass *createSITypeRewriter(); FunctionPass *createSIAnnotateControlFlowPass(); FunctionPass *createSIFoldOperandsPass(); FunctionPass *createSIPeepholeSDWAPass(); diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index 7494e5decd6f..f1d899c4d003 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ 
b/lib/Target/AMDGPU/AMDGPU.td @@ -262,8 +262,8 @@ def FeatureSDWAMac : SubtargetFeature<"sdwa-mav", "Support v_mac_f32/f16 with SDWA (Sub-DWORD Addressing) extension" >; -def FeatureSDWAClampVOPC : SubtargetFeature<"sdwa-clamp-vopc", - "HasSDWAClampVOPC", +def FeatureSDWAOutModsVOPC : SubtargetFeature<"sdwa-out-mods-vopc", + "HasSDWAOutModsVOPC", "true", "Support clamp for VOPC with SDWA (Sub-DWORD Addressing) extension" >; @@ -452,7 +452,7 @@ def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS", FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel, FeatureScalarStores, FeatureInv2PiInlineImm, - FeatureSDWA, FeatureSDWAClampVOPC, FeatureSDWAMac, FeatureDPP + FeatureSDWA, FeatureSDWAOutModsVOPC, FeatureSDWAMac, FeatureDPP ] >; diff --git a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp index 2071b6f157cd..9a391d06c9ea 100644 --- a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp +++ b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp @@ -1776,7 +1776,7 @@ static void removeExternalCFGEdges(MachineBasicBlock *StartMBB, E = EndMBB->succ_end(); PI != E; ++PI) { // Either we have a back-edge to the entry block, or a back-edge to the - // succesor of the entry block since the block may be split. + // successor of the entry block since the block may be split. if ((*PI) != StartMBB && !((*PI) == StartMBBSucc && StartMBB != EndMBB && SuccSize == 1)) { Succs.insert( @@ -1831,7 +1831,7 @@ MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfBlock( IfBB->addSuccessor(CodeBBStart); DEBUG(dbgs() << "Created If block: " << IfBB->getNumber() << "\n"); - // Ensure that the MergeBB is a succesor of the CodeEndBB. + // Ensure that the MergeBB is a successor of the CodeEndBB. 
if (!CodeBBEnd->isSuccessor(MergeBB)) CodeBBEnd->addSuccessor(MergeBB); diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index ab5abf2039a5..be47b900c6f0 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -128,7 +128,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, HasSDWAScalar(false), HasSDWASdst(false), HasSDWAMac(false), - HasSDWAClampVOPC(false), + HasSDWAOutModsVOPC(false), HasDPP(false), FlatAddressSpace(false), FlatInstOffsets(false), diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index 2b16289c723e..22cede59086a 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -153,7 +153,7 @@ protected: bool HasSDWAScalar; bool HasSDWASdst; bool HasSDWAMac; - bool HasSDWAClampVOPC; + bool HasSDWAOutModsVOPC; bool HasDPP; bool FlatAddressSpace; bool FlatInstOffsets; @@ -452,8 +452,8 @@ public: return HasSDWAMac; } - bool hasSDWAClampVOPC() const { - return HasSDWAClampVOPC; + bool hasSDWAOutModsVOPC() const { + return HasSDWAOutModsVOPC; } /// \brief Returns the offset in bytes from the start of the input buffer diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 04fe9f689806..425fd35d47de 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -720,7 +720,6 @@ bool GCNPassConfig::addPreISel() { addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions } addPass(createSinkingPass()); - addPass(createSITypeRewriter()); addPass(createAMDGPUAnnotateUniformValues()); if (!LateCFGStructurize) { addPass(createSIAnnotateControlFlowPass()); diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 88245b01683a..89a03902dc69 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -63,7 +63,7 @@ static bool dependsOnLocalPhi(const Loop *L, const Value *Cond, return false; } -void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, +void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP) { UP.Threshold = 300; // Twice the default. 
UP.MaxCount = UINT_MAX; diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 485e20411ab4..9a320bdfcc3d 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -68,7 +68,8 @@ public: bool hasBranchDivergence() { return true; } - void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); + void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, + TTI::UnrollingPreferences &UP); TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) { assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt index e30844f082cd..917d9cfa6905 100644 --- a/lib/Target/AMDGPU/CMakeLists.txt +++ b/lib/Target/AMDGPU/CMakeLists.txt @@ -96,7 +96,6 @@ add_llvm_target(AMDGPUCodeGen SIPeepholeSDWA.cpp SIRegisterInfo.cpp SIShrinkInstructions.cpp - SITypeRewriter.cpp SIWholeQuadMode.cpp GCNIterativeScheduler.cpp GCNMinRegStrategy.cpp diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 04308fb3aaf6..f26e49295e69 100644 --- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -626,7 +626,9 @@ MCOperand AMDGPUDisassembler::decodeSDWASrc(const OpWidthTy Width, using namespace AMDGPU::SDWA; if (STI.getFeatureBits()[AMDGPU::FeatureGFX9]) { - if (SDWA9EncValues::SRC_VGPR_MIN <= Val && + // XXX: static_cast<int> is needed to avoid stupid warning: + // compare with unsigned is always true + if (SDWA9EncValues::SRC_VGPR_MIN <= static_cast<int>(Val) && Val <= SDWA9EncValues::SRC_VGPR_MAX) { return createRegOperand(getVgprClassId(Width), Val - SDWA9EncValues::SRC_VGPR_MIN); diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index d0f4e00994de..d39b345bdf03 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -4314,6 +4314,23 @@ SDValue SITargetLowering::splitBinaryBitConstantOp( return SDValue(); } +// Returns true if argument is a boolean value which is not serialized into +// memory or argument and does not require v_cmdmask_b32 to be deserialized. 
+static bool isBoolSGPR(SDValue V) { + if (V.getValueType() != MVT::i1) + return false; + switch (V.getOpcode()) { + default: break; + case ISD::SETCC: + case ISD::AND: + case ISD::OR: + case ISD::XOR: + case AMDGPUISD::FP_CLASS: + return true; + } + return false; +} + SDValue SITargetLowering::performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const { if (DCI.isBeforeLegalize()) @@ -4402,6 +4419,16 @@ SDValue SITargetLowering::performAndCombine(SDNode *N, } } + if (VT == MVT::i32 && + (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) { + // and x, (sext cc from i1) => select cc, x, 0 + if (RHS.getOpcode() != ISD::SIGN_EXTEND) + std::swap(LHS, RHS); + if (isBoolSGPR(RHS.getOperand(0))) + return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), + LHS, DAG.getConstant(0, SDLoc(N), MVT::i32)); + } + return SDValue(); } @@ -4941,8 +4968,7 @@ SDValue SITargetLowering::performAddCombine(SDNode *N, case ISD::SIGN_EXTEND: case ISD::ANY_EXTEND: { auto Cond = RHS.getOperand(0); - if (Cond.getOpcode() != ISD::SETCC && - Cond.getOpcode() != AMDGPUISD::FP_CLASS) + if (!isBoolSGPR(Cond)) break; SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1); SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond }; @@ -5109,6 +5135,35 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N, SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); EVT VT = LHS.getValueType(); + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); + + auto CRHS = dyn_cast<ConstantSDNode>(RHS); + if (!CRHS) { + CRHS = dyn_cast<ConstantSDNode>(LHS); + if (CRHS) { + std::swap(LHS, RHS); + CC = getSetCCSwappedOperands(CC); + } + } + + if (CRHS && VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND && + isBoolSGPR(LHS.getOperand(0))) { + // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1 + // setcc (sext from i1 cc), -1, eq|sle|uge) => cc + // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1 + // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc + if ((CRHS->isAllOnesValue() && + (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) || + (CRHS->isNullValue() && + (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE))) + return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0), + DAG.getConstant(-1, SL, MVT::i1)); + if ((CRHS->isAllOnesValue() && + (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) || + (CRHS->isNullValue() && + (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT))) + return LHS.getOperand(0); + } if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() && VT != MVT::f16)) @@ -5116,7 +5171,6 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N, // Match isinf pattern // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity)) - ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) { const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS); if (!CRHS) diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index c9b48fea7225..b6784ec14e9f 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -770,7 +770,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, if (ST.hasScalarStores()) { // m0 is used for offset to scalar stores if used to spill. 
- Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine); + Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead); } return; @@ -871,7 +871,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, if (ST.hasScalarStores()) { // m0 is used for offset to scalar stores if used to spill. - Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine); + Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead); } return; @@ -2444,8 +2444,6 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); - if ( DstIdx == -1) - DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::sdst); const int OpIndicies[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx }; @@ -2488,14 +2486,20 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI"; return false; } - } else if (!ST.hasSDWAClampVOPC()) { + } else if (!ST.hasSDWAOutModsVOPC()) { // No clamp allowed on GFX9 for VOPC const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); - if (Clamp != nullptr && - (!Clamp->isImm() || Clamp->getImm() != 0)) { + if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) { ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI"; return false; } + + // No omod allowed on GFX9 for VOPC + const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); + if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) { + ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI"; + return false; + } } } } diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index 3b4a8b5d1e81..4a81fb3b463a 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -336,6 +336,10 @@ def NegSubInlineConst16 : ImmLeaf<i16, [{ return Imm < -16 && Imm >= -64; }], NegateImm>; +def ShiftAmt32Imm : PatLeaf <(imm), [{ + return N->getZExtValue() < 32; +}]>; + //===----------------------------------------------------------------------===// // Custom Operands //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index 3b4bdc864253..bcc685015cf5 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -929,6 +929,14 @@ def : UMad24Pat<V_MAD_U32_U24>; defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>; def : ROTRPattern <V_ALIGNBIT_B32>; +def : Pat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))), + (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), + (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>; + +def : Pat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))), + (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), + (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>; + /********** ====================== **********/ /********** Indirect addressing **********/ /********** ====================== **********/ diff --git a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index 4ac23ef03cb3..e2ac6631d2f3 100644 --- a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -627,10 +627,13 @@ bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI, return false; } - if (!ST.hasSDWAClampVOPC() && TII->hasModifiersSet(MI, AMDGPU::OpName::clamp)) + if (!ST.hasSDWAOutModsVOPC() && + (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) || + TII->hasModifiersSet(MI, AMDGPU::OpName::omod))) return 
false; - } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) { + } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) || + !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) { return false; } @@ -649,25 +652,24 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(MI.getOpcode())); assert(SDWAOpcode != -1); - // Copy dst, if it is present in original then should also be present in SDWA - MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); - if (!Dst && !TII->isVOPC(MI)) - return false; - const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode); // Create SDWA version of instruction MI and initialize its operands MachineInstrBuilder SDWAInst = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc); + // Copy dst, if it is present in original then should also be present in SDWA + MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); if (Dst) { assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1); SDWAInst.add(*Dst); - } else { - Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst); + } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) { assert(Dst && AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1); SDWAInst.add(*Dst); + } else { + assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1); + SDWAInst.addReg(AMDGPU::VCC, RegState::Define); } // Copy src0, initialize src0_modifiers. All sdwa instructions has src0 and @@ -714,20 +716,22 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, } // Copy omod if present, initialize otherwise if needed - MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod); - if (OMod) { - assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1); - SDWAInst.add(*OMod); - } else if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1) { - SDWAInst.addImm(0); + if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1) { + MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod); + if (OMod) { + SDWAInst.add(*OMod); + } else { + SDWAInst.addImm(0); + } } - // Initialize dst_sel and dst_unused if present - if (Dst) { - assert( - AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1 && - AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1); + // Initialize dst_sel if present + if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1) { SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD); + } + + // Initialize dst_unused if present + if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1) { SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD); } diff --git a/lib/Target/AMDGPU/SITypeRewriter.cpp b/lib/Target/AMDGPU/SITypeRewriter.cpp deleted file mode 100644 index aad68537f779..000000000000 --- a/lib/Target/AMDGPU/SITypeRewriter.cpp +++ /dev/null @@ -1,156 +0,0 @@ -//===-- SITypeRewriter.cpp - Remove unwanted types ------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// This pass removes performs the following type substitution on all -/// non-compute shaders: -/// -/// v16i8 => i128 -/// - v16i8 is used for constant memory resource descriptors. 
This type is -/// legal for some compute APIs, and we don't want to declare it as legal -/// in the backend, because we want the legalizer to expand all v16i8 -/// operations. -/// v1* => * -/// - Having v1* types complicates the legalizer and we can easily replace -/// - them with the element type. -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "Utils/AMDGPUBaseInfo.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InstVisitor.h" - -using namespace llvm; - -namespace { - -class SITypeRewriter : public FunctionPass, - public InstVisitor<SITypeRewriter> { - - static char ID; - Module *Mod; - Type *v16i8; - Type *v4i32; - -public: - SITypeRewriter() : FunctionPass(ID) { } - bool doInitialization(Module &M) override; - bool runOnFunction(Function &F) override; - StringRef getPassName() const override { return "SI Type Rewriter"; } - void visitLoadInst(LoadInst &I); - void visitCallInst(CallInst &I); - void visitBitCast(BitCastInst &I); -}; - -} // End anonymous namespace - -char SITypeRewriter::ID = 0; - -bool SITypeRewriter::doInitialization(Module &M) { - Mod = &M; - v16i8 = VectorType::get(Type::getInt8Ty(M.getContext()), 16); - v4i32 = VectorType::get(Type::getInt32Ty(M.getContext()), 4); - return false; -} - -bool SITypeRewriter::runOnFunction(Function &F) { - if (!AMDGPU::isShader(F.getCallingConv())) - return false; - - visit(F); - visit(F); - - return false; -} - -void SITypeRewriter::visitLoadInst(LoadInst &I) { - Value *Ptr = I.getPointerOperand(); - Type *PtrTy = Ptr->getType(); - Type *ElemTy = PtrTy->getPointerElementType(); - IRBuilder<> Builder(&I); - if (ElemTy == v16i8) { - Value *BitCast = Builder.CreateBitCast(Ptr, - PointerType::get(v4i32,PtrTy->getPointerAddressSpace())); - LoadInst *Load = Builder.CreateLoad(BitCast); - SmallVector<std::pair<unsigned, MDNode *>, 8> MD; - I.getAllMetadataOtherThanDebugLoc(MD); - for (unsigned i = 0, e = MD.size(); i != e; ++i) { - Load->setMetadata(MD[i].first, MD[i].second); - } - Value *BitCastLoad = Builder.CreateBitCast(Load, I.getType()); - I.replaceAllUsesWith(BitCastLoad); - I.eraseFromParent(); - } -} - -void SITypeRewriter::visitCallInst(CallInst &I) { - IRBuilder<> Builder(&I); - - SmallVector <Value*, 8> Args; - SmallVector <Type*, 8> Types; - bool NeedToReplace = false; - Function *F = I.getCalledFunction(); - if (!F) - return; - - std::string Name = F->getName(); - for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) { - Value *Arg = I.getArgOperand(i); - if (Arg->getType() == v16i8) { - Args.push_back(Builder.CreateBitCast(Arg, v4i32)); - Types.push_back(v4i32); - NeedToReplace = true; - Name = Name + ".v4i32"; - } else if (Arg->getType()->isVectorTy() && - Arg->getType()->getVectorNumElements() == 1 && - Arg->getType()->getVectorElementType() == - Type::getInt32Ty(I.getContext())){ - Type *ElementTy = Arg->getType()->getVectorElementType(); - std::string TypeName = "i32"; - InsertElementInst *Def = cast<InsertElementInst>(Arg); - Args.push_back(Def->getOperand(1)); - Types.push_back(ElementTy); - std::string VecTypeName = "v1" + TypeName; - Name = Name.replace(Name.find(VecTypeName), VecTypeName.length(), TypeName); - NeedToReplace = true; - } else { - Args.push_back(Arg); - Types.push_back(Arg->getType()); - } - } - - if (!NeedToReplace) { - return; - } - Function *NewF = Mod->getFunction(Name); - if (!NewF) { - NewF = Function::Create(FunctionType::get(F->getReturnType(), Types, false), GlobalValue::ExternalLinkage, Name, Mod); - 
NewF->setAttributes(F->getAttributes()); - } - I.replaceAllUsesWith(Builder.CreateCall(NewF, Args)); - I.eraseFromParent(); -} - -void SITypeRewriter::visitBitCast(BitCastInst &I) { - IRBuilder<> Builder(&I); - if (I.getDestTy() != v4i32) { - return; - } - - if (BitCastInst *Op = dyn_cast<BitCastInst>(I.getOperand(0))) { - if (Op->getSrcTy() == v4i32) { - I.replaceAllUsesWith(Op->getOperand(0)); - I.eraseFromParent(); - } - } -} - -FunctionPass *llvm::createSITypeRewriter() { - return new SITypeRewriter(); -} diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index 6f67183df6a1..c40b4450a5b5 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -222,6 +222,13 @@ def FeatureAvoidMOVsShOp : SubtargetFeature<"avoid-movs-shop", def FeatureHasRetAddrStack : SubtargetFeature<"ret-addr-stack", "HasRetAddrStack", "true", "Has return address stack">; +// Some processors have no branch predictor, which changes the expected cost of +// taking a branch which affects the choice of whether to use predicated +// instructions. +def FeatureHasNoBranchPredictor : SubtargetFeature<"no-branch-predictor", + "HasBranchPredictor", "false", + "Has no branch predictor">; + /// DSP extension. def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true", "Supports DSP instructions in ARM and/or Thumb2">; @@ -262,6 +269,10 @@ def FeatureLongCalls : SubtargetFeature<"long-calls", "GenLongCalls", "true", "Generate calls via indirect call " "instructions">; +def FeatureExecuteOnly + : SubtargetFeature<"execute-only", "GenExecuteOnly", "true", + "Enable the generation of execute only code.">; + def FeatureReserveR9 : SubtargetFeature<"reserve-r9", "ReserveR9", "true", "Reserve R9, making it unavailable as " "GPR">; @@ -540,7 +551,7 @@ def ARMv7s : Architecture<"armv7s", "ARMv7a", [ARMv7a]>; // // Dummy CPU, used to target architectures -def : ProcNoItin<"generic", []>; +def : ProcessorModel<"generic", CortexA8Model, []>; def : ProcNoItin<"arm8", [ARMv4]>; def : ProcNoItin<"arm810", [ARMv4]>; @@ -756,13 +767,19 @@ def : ProcessorModel<"cortex-r8", CortexA8Model, [ARMv7r, FeatureHasSlowFPVMLx, FeatureAvoidPartialCPSR]>; -def : ProcNoItin<"cortex-m3", [ARMv7m, ProcM3]>; -def : ProcNoItin<"sc300", [ARMv7m, ProcM3]>; +def : ProcessorModel<"cortex-m3", CortexM3Model, [ARMv7m, + ProcM3, + FeatureHasNoBranchPredictor]>; + +def : ProcessorModel<"sc300", CortexM3Model, [ARMv7m, + ProcM3, + FeatureHasNoBranchPredictor]>; -def : ProcNoItin<"cortex-m4", [ARMv7em, +def : ProcessorModel<"cortex-m4", CortexM3Model, [ARMv7em, FeatureVFP4, FeatureVFPOnlySP, - FeatureD16]>; + FeatureD16, + FeatureHasNoBranchPredictor]>; def : ProcNoItin<"cortex-m7", [ARMv7em, FeatureFPARMv8, @@ -771,11 +788,12 @@ def : ProcNoItin<"cortex-m7", [ARMv7em, def : ProcNoItin<"cortex-m23", [ARMv8mBaseline, FeatureNoMovt]>; -def : ProcNoItin<"cortex-m33", [ARMv8mMainline, +def : ProcessorModel<"cortex-m33", CortexM3Model, [ARMv8mMainline, FeatureDSP, FeatureFPARMv8, FeatureD16, - FeatureVFPOnlySP]>; + FeatureVFPOnlySP, + FeatureHasNoBranchPredictor]>; def : ProcNoItin<"cortex-a32", [ARMv8a, FeatureHWDivThumb, diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index e0810c358f2d..1ec6b24b2ed6 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -1851,9 +1851,9 @@ isProfitableToIfCvt(MachineBasicBlock &MBB, } bool ARMBaseInstrInfo:: -isProfitableToIfCvt(MachineBasicBlock &, +isProfitableToIfCvt(MachineBasicBlock &TBB, unsigned TCycles, unsigned TExtra, - MachineBasicBlock 
&, + MachineBasicBlock &FBB, unsigned FCycles, unsigned FExtra, BranchProbability Probability) const { if (!TCycles) @@ -1863,14 +1863,43 @@ isProfitableToIfCvt(MachineBasicBlock &, // Here we scale up each component of UnpredCost to avoid precision issue when // scaling TCycles/FCycles by Probability. const unsigned ScalingUpFactor = 1024; - unsigned TUnpredCost = Probability.scale(TCycles * ScalingUpFactor); - unsigned FUnpredCost = + + unsigned PredCost = (TCycles + FCycles + TExtra + FExtra) * ScalingUpFactor; + unsigned UnpredCost; + if (!Subtarget.hasBranchPredictor()) { + // When we don't have a branch predictor it's always cheaper to not take a + // branch than take it, so we have to take that into account. + unsigned NotTakenBranchCost = 1; + unsigned TakenBranchCost = Subtarget.getMispredictionPenalty(); + unsigned TUnpredCycles, FUnpredCycles; + if (!FCycles) { + // Triangle: TBB is the fallthrough + TUnpredCycles = TCycles + NotTakenBranchCost; + FUnpredCycles = TakenBranchCost; + } else { + // Diamond: TBB is the block that is branched to, FBB is the fallthrough + TUnpredCycles = TCycles + TakenBranchCost; + FUnpredCycles = FCycles + NotTakenBranchCost; + } + // The total cost is the cost of each path scaled by their probabilites + unsigned TUnpredCost = Probability.scale(TUnpredCycles * ScalingUpFactor); + unsigned FUnpredCost = Probability.getCompl().scale(FUnpredCycles * ScalingUpFactor); + UnpredCost = TUnpredCost + FUnpredCost; + // When predicating assume that the first IT can be folded away but later + // ones cost one cycle each + if (Subtarget.isThumb2() && TCycles + FCycles > 4) { + PredCost += ((TCycles + FCycles - 4) / 4) * ScalingUpFactor; + } + } else { + unsigned TUnpredCost = Probability.scale(TCycles * ScalingUpFactor); + unsigned FUnpredCost = Probability.getCompl().scale(FCycles * ScalingUpFactor); - unsigned UnpredCost = TUnpredCost + FUnpredCost; - UnpredCost += 1 * ScalingUpFactor; // The branch itself - UnpredCost += Subtarget.getMispredictionPenalty() * ScalingUpFactor / 10; + UnpredCost = TUnpredCost + FUnpredCost; + UnpredCost += 1 * ScalingUpFactor; // The branch itself + UnpredCost += Subtarget.getMispredictionPenalty() * ScalingUpFactor / 10; + } - return (TCycles + FCycles + TExtra + FExtra) * ScalingUpFactor <= UnpredCost; + return PredCost <= UnpredCost; } bool diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 2bcc707e9fc3..e42514acd76f 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -7580,6 +7580,9 @@ static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) { SDValue VHi = DAG.getAnyExtOrTrunc( DAG.getNode(ISD::SRL, dl, MVT::i64, V, DAG.getConstant(32, dl, MVT::i32)), dl, MVT::i32); + bool isBigEndian = DAG.getDataLayout().isBigEndian(); + if (isBigEndian) + std::swap (VLo, VHi); SDValue RegClass = DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32); SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32); @@ -7607,10 +7610,14 @@ static void ReplaceCMP_SWAP_64Results(SDNode *N, MemOp[0] = cast<MemSDNode>(N)->getMemOperand(); cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1); - Results.push_back(DAG.getTargetExtractSubreg(ARM::gsub_0, SDLoc(N), MVT::i32, - SDValue(CmpSwap, 0))); - Results.push_back(DAG.getTargetExtractSubreg(ARM::gsub_1, SDLoc(N), MVT::i32, - SDValue(CmpSwap, 0))); + bool isBigEndian = DAG.getDataLayout().isBigEndian(); + + Results.push_back( + DAG.getTargetExtractSubreg(isBigEndian ? 
ARM::gsub_1 : ARM::gsub_0, + SDLoc(N), MVT::i32, SDValue(CmpSwap, 0))); + Results.push_back( + DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1, + SDLoc(N), MVT::i32, SDValue(CmpSwap, 0))); Results.push_back(SDValue(CmpSwap, 2)); } diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td index 423f97ccacd6..891a8f482f0a 100644 --- a/lib/Target/ARM/ARMInstrThumb.td +++ b/lib/Target/ARM/ARMInstrThumb.td @@ -1416,12 +1416,12 @@ def tLEApcrelJT : tPseudoInst<(outs tGPR:$Rd), let Size = 2, isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { def tTBB_JT : tPseudoInst<(outs), - (ins tGPR:$base, tGPR:$index, i32imm:$jt, i32imm:$pclbl), 0, IIC_Br, []>, - Sched<[WriteBr]>; + (ins tGPRwithpc:$base, tGPR:$index, i32imm:$jt, i32imm:$pclbl), 0, + IIC_Br, []>, Sched<[WriteBr]>; def tTBH_JT : tPseudoInst<(outs), - (ins tGPR:$base, tGPR:$index, i32imm:$jt, i32imm:$pclbl), 0, IIC_Br, []>, - Sched<[WriteBr]>; + (ins tGPRwithpc:$base, tGPR:$index, i32imm:$jt, i32imm:$pclbl), 0, + IIC_Br, []>, Sched<[WriteBr]>; } //===----------------------------------------------------------------------===// diff --git a/lib/Target/ARM/ARMInstructionSelector.cpp b/lib/Target/ARM/ARMInstructionSelector.cpp index 4cb0eca5ee5f..374176d1d737 100644 --- a/lib/Target/ARM/ARMInstructionSelector.cpp +++ b/lib/Target/ARM/ARMInstructionSelector.cpp @@ -46,6 +46,10 @@ private: MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI) const; + bool selectSelect(MachineInstrBuilder &MIB, const ARMBaseInstrInfo &TII, + MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, + const RegisterBankInfo &RBI) const; + const ARMBaseInstrInfo &TII; const ARMBaseRegisterInfo &TRI; const ARMBaseTargetMachine &TM; @@ -346,6 +350,50 @@ bool ARMInstructionSelector::selectICmp(MachineInstrBuilder &MIB, return true; } +bool ARMInstructionSelector::selectSelect(MachineInstrBuilder &MIB, + const ARMBaseInstrInfo &TII, + MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI, + const RegisterBankInfo &RBI) const { + auto &MBB = *MIB->getParent(); + auto InsertBefore = std::next(MIB->getIterator()); + auto &DebugLoc = MIB->getDebugLoc(); + + // Compare the condition to 0. + auto CondReg = MIB->getOperand(1).getReg(); + assert(MRI.getType(CondReg).getSizeInBits() == 1 && + RBI.getRegBank(CondReg, MRI, TRI)->getID() == ARM::GPRRegBankID && + "Unsupported types for select operation"); + auto CmpI = BuildMI(MBB, InsertBefore, DebugLoc, TII.get(ARM::CMPri)) + .addUse(CondReg) + .addImm(0) + .add(predOps(ARMCC::AL)); + if (!constrainSelectedInstRegOperands(*CmpI, TII, TRI, RBI)) + return false; + + // Move a value into the result register based on the result of the + // comparison. 
+ auto ResReg = MIB->getOperand(0).getReg(); + auto TrueReg = MIB->getOperand(2).getReg(); + auto FalseReg = MIB->getOperand(3).getReg(); + assert(MRI.getType(ResReg) == MRI.getType(TrueReg) && + MRI.getType(TrueReg) == MRI.getType(FalseReg) && + MRI.getType(FalseReg).getSizeInBits() == 32 && + RBI.getRegBank(TrueReg, MRI, TRI)->getID() == ARM::GPRRegBankID && + RBI.getRegBank(FalseReg, MRI, TRI)->getID() == ARM::GPRRegBankID && + "Unsupported types for select operation"); + auto Mov1I = BuildMI(MBB, InsertBefore, DebugLoc, TII.get(ARM::MOVCCr)) + .addDef(ResReg) + .addUse(TrueReg) + .addUse(FalseReg) + .add(predOps(ARMCC::EQ, ARM::CPSR)); + if (!constrainSelectedInstRegOperands(*Mov1I, TII, TRI, RBI)) + return false; + + MIB->eraseFromParent(); + return true; +} + bool ARMInstructionSelector::select(MachineInstr &I) const { assert(I.getParent() && "Instruction should be in a basic block!"); assert(I.getParent()->getParent() && "Instruction should be in a function!"); @@ -448,6 +496,8 @@ bool ARMInstructionSelector::select(MachineInstr &I) const { } case G_ICMP: return selectICmp(MIB, TII, MRI, TRI, RBI); + case G_SELECT: + return selectSelect(MIB, TII, MRI, TRI, RBI); case G_GEP: I.setDesc(TII.get(ARM::ADDrr)); MIB.add(predOps(ARMCC::AL)).add(condCodeOp()); diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp index 5873c7fb3872..f3e62d09cc30 100644 --- a/lib/Target/ARM/ARMLegalizerInfo.cpp +++ b/lib/Target/ARM/ARMLegalizerInfo.cpp @@ -55,10 +55,7 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { for (unsigned Op : {G_SDIV, G_UDIV}) { for (auto Ty : {s8, s16}) - // FIXME: We need WidenScalar here, but in the case of targets with - // software division we'll also need Libcall afterwards. Treat as Custom - // until we have better support for chaining legalization actions. - setAction({Op, Ty}, Custom); + setAction({Op, Ty}, WidenScalar); if (ST.hasDivideInARMMode()) setAction({Op, s32}, Legal); else @@ -84,6 +81,10 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { setAction({G_GEP, p0}, Legal); setAction({G_GEP, 1, s32}, Legal); + setAction({G_SELECT, s32}, Legal); + setAction({G_SELECT, p0}, Legal); + setAction({G_SELECT, 1, s1}, Legal); + setAction({G_CONSTANT, s32}, Legal); setAction({G_ICMP, s1}, Legal); @@ -118,40 +119,6 @@ bool ARMLegalizerInfo::legalizeCustom(MachineInstr &MI, switch (MI.getOpcode()) { default: return false; - case G_SDIV: - case G_UDIV: { - LLT Ty = MRI.getType(MI.getOperand(0).getReg()); - if (Ty != LLT::scalar(16) && Ty != LLT::scalar(8)) - return false; - - // We need to widen to 32 bits and then maybe, if the target requires, - // transform into a libcall. - LegalizerHelper Helper(MIRBuilder.getMF()); - - MachineInstr *NewMI = nullptr; - Helper.MIRBuilder.recordInsertions([&](MachineInstr *MI) { - // Store the new, 32-bit div instruction. 
- if (MI->getOpcode() == G_SDIV || MI->getOpcode() == G_UDIV) - NewMI = MI; - }); - - auto Result = Helper.widenScalar(MI, 0, LLT::scalar(32)); - Helper.MIRBuilder.stopRecordingInsertions(); - if (Result == LegalizerHelper::UnableToLegalize) { - return false; - } - assert(NewMI && "Couldn't find widened instruction"); - assert((NewMI->getOpcode() == G_SDIV || NewMI->getOpcode() == G_UDIV) && - "Unexpected widened instruction"); - assert(MRI.getType(NewMI->getOperand(0).getReg()).getSizeInBits() == 32 && - "Unexpected type for the widened instruction"); - - Result = Helper.legalizeInstrStep(*NewMI); - if (Result == LegalizerHelper::UnableToLegalize) { - return false; - } - return true; - } case G_SREM: case G_UREM: { unsigned OriginalResult = MI.getOperand(0).getReg(); diff --git a/lib/Target/ARM/ARMRegisterBankInfo.cpp b/lib/Target/ARM/ARMRegisterBankInfo.cpp index 2350d0c6ef69..11fb81a4f9fe 100644 --- a/lib/Target/ARM/ARMRegisterBankInfo.cpp +++ b/lib/Target/ARM/ARMRegisterBankInfo.cpp @@ -255,6 +255,18 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OperandsMapping = getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx], nullptr}); break; + case G_SELECT: { + LLT Ty2 = MRI.getType(MI.getOperand(1).getReg()); + (void)Ty2; + assert(Ty.getSizeInBits() == 32 && "Unsupported size for G_SELECT"); + assert(Ty2.getSizeInBits() == 1 && "Unsupported size for G_SELECT"); + OperandsMapping = + getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx], + &ARM::ValueMappings[ARM::GPR3OpsIdx], + &ARM::ValueMappings[ARM::GPR3OpsIdx], + &ARM::ValueMappings[ARM::GPR3OpsIdx]}); + break; + } case G_ICMP: { LLT Ty2 = MRI.getType(MI.getOperand(2).getReg()); (void)Ty2; diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td index 02cbfb1fa9f1..b10583bc7983 100644 --- a/lib/Target/ARM/ARMRegisterInfo.td +++ b/lib/Target/ARM/ARMRegisterInfo.td @@ -245,6 +245,10 @@ def rGPR : RegisterClass<"ARM", [i32], 32, (sub GPR, SP, PC)> { // the general GPR register class above (MOV, e.g.) def tGPR : RegisterClass<"ARM", [i32], 32, (trunc GPR, 8)>; +// Thumb registers R0-R7 and the PC. Some instructions like TBB or THH allow +// the PC to be used as a destination operand as well. +def tGPRwithpc : RegisterClass<"ARM", [i32], 32, (add tGPR, PC)>; + // The high registers in thumb mode, R8-R15. def hGPR : RegisterClass<"ARM", [i32], 32, (sub GPR, tGPR)>; diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td index 1c7902520f2d..53e012f13ee2 100644 --- a/lib/Target/ARM/ARMSchedule.td +++ b/lib/Target/ARM/ARMSchedule.td @@ -424,3 +424,4 @@ include "ARMScheduleA9.td" include "ARMScheduleSwift.td" include "ARMScheduleR52.td" include "ARMScheduleA57.td" +include "ARMScheduleM3.td" diff --git a/lib/Target/ARM/ARMScheduleM3.td b/lib/Target/ARM/ARMScheduleM3.td new file mode 100644 index 000000000000..93f8299f9bd0 --- /dev/null +++ b/lib/Target/ARM/ARMScheduleM3.td @@ -0,0 +1,21 @@ +//=- ARMScheduleM3.td - ARM Cortex-M3 Scheduling Definitions -*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for the ARM Cortex-M3 processor. 
+// +//===----------------------------------------------------------------------===// + +def CortexM3Model : SchedMachineModel { + let IssueWidth = 1; // Only IT can be dual-issued, so assume single-issue + let MicroOpBufferSize = 0; // In-order + let LoadLatency = 2; // Latency when not pipelined, not pc-relative + let MispredictPenalty = 2; // Best case branch taken cost + + let CompleteModel = 0; +} diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp index d9d0c27c6304..2c42a1336166 100644 --- a/lib/Target/ARM/ARMSubtarget.cpp +++ b/lib/Target/ARM/ARMSubtarget.cpp @@ -11,6 +11,13 @@ // //===----------------------------------------------------------------------===// +#include "ARM.h" + +#ifdef LLVM_BUILD_GLOBAL_ISEL +#include "ARMCallLowering.h" +#include "ARMLegalizerInfo.h" +#include "ARMRegisterBankInfo.h" +#endif #include "ARMSubtarget.h" #include "ARMFrameLowering.h" #include "ARMInstrInfo.h" @@ -23,6 +30,13 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" +#ifdef LLVM_BUILD_GLOBAL_ISEL +#include "llvm/CodeGen/GlobalISel/GISelAccessor.h" +#include "llvm/CodeGen/GlobalISel/IRTranslator.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelect.h" +#include "llvm/CodeGen/GlobalISel/Legalizer.h" +#include "llvm/CodeGen/GlobalISel/RegBankSelect.h" +#endif #include "llvm/CodeGen/MachineFunction.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" @@ -78,11 +92,6 @@ ARMSubtarget &ARMSubtarget::initializeSubtargetDependencies(StringRef CPU, return *this; } -/// EnableExecuteOnly - Enables the generation of execute-only code on supported -/// targets -static cl::opt<bool> -EnableExecuteOnly("arm-execute-only"); - ARMFrameLowering *ARMSubtarget::initializeFrameLowering(StringRef CPU, StringRef FS) { ARMSubtarget &STI = initializeSubtargetDependencies(CPU, FS); @@ -92,13 +101,41 @@ ARMFrameLowering *ARMSubtarget::initializeFrameLowering(StringRef CPU, return new ARMFrameLowering(STI); } +#ifdef LLVM_BUILD_GLOBAL_ISEL +namespace { + +struct ARMGISelActualAccessor : public GISelAccessor { + std::unique_ptr<CallLowering> CallLoweringInfo; + std::unique_ptr<InstructionSelector> InstSelector; + std::unique_ptr<LegalizerInfo> Legalizer; + std::unique_ptr<RegisterBankInfo> RegBankInfo; + + const CallLowering *getCallLowering() const override { + return CallLoweringInfo.get(); + } + + const InstructionSelector *getInstructionSelector() const override { + return InstSelector.get(); + } + + const LegalizerInfo *getLegalizerInfo() const override { + return Legalizer.get(); + } + + const RegisterBankInfo *getRegBankInfo() const override { + return RegBankInfo.get(); + } +}; + +} // end anonymous namespace +#endif + ARMSubtarget::ARMSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const ARMBaseTargetMachine &TM, bool IsLittle) : ARMGenSubtargetInfo(TT, CPU, FS), UseMulOps(UseFusedMulOps), - GenExecuteOnly(EnableExecuteOnly), CPUString(CPU), IsLittle(IsLittle), - TargetTriple(TT), Options(TM.Options), TM(TM), - FrameLowering(initializeFrameLowering(CPU, FS)), + CPUString(CPU), IsLittle(IsLittle), TargetTriple(TT), Options(TM.Options), + TM(TM), FrameLowering(initializeFrameLowering(CPU, FS)), // At this point initializeSubtargetDependencies has been called so // we can query directly. InstrInfo(isThumb1Only() @@ -106,7 +143,29 @@ ARMSubtarget::ARMSubtarget(const Triple &TT, const std::string &CPU, : !isThumb() ? 
(ARMBaseInstrInfo *)new ARMInstrInfo(*this) : (ARMBaseInstrInfo *)new Thumb2InstrInfo(*this)), - TLInfo(TM, *this) {} + TLInfo(TM, *this) { + assert((isThumb() || hasARMOps()) && + "Target must either be thumb or support ARM operations!"); + +#ifndef LLVM_BUILD_GLOBAL_ISEL + GISelAccessor *GISel = new GISelAccessor(); +#else + ARMGISelActualAccessor *GISel = new ARMGISelActualAccessor(); + GISel->CallLoweringInfo.reset(new ARMCallLowering(*getTargetLowering())); + GISel->Legalizer.reset(new ARMLegalizerInfo(*this)); + + auto *RBI = new ARMRegisterBankInfo(*getRegisterInfo()); + + // FIXME: At this point, we can't rely on Subtarget having RBI. + // It's awkward to mix passing RBI and the Subtarget; should we pass + // TII/TRI as well? + GISel->InstSelector.reset(createARMInstructionSelector( + *static_cast<const ARMBaseTargetMachine *>(&TM), *this, *RBI)); + + GISel->RegBankInfo.reset(RBI); +#endif + setGISelAccessor(*GISel); +} const CallLowering *ARMSubtarget::getCallLowering() const { assert(GISel && "Access to GlobalISel APIs not set"); diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index d890d0fa777e..e15b17512c96 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -246,6 +246,11 @@ protected: /// avoid issue "normal" call instructions to callees which do not return. bool HasRetAddrStack = false; + /// HasBranchPredictor - True if the subtarget has a branch predictor. Having + /// a branch predictor or not changes the expected cost of taking a branch + /// which affects the choice of whether to use predicated instructions. + bool HasBranchPredictor = true; + /// HasMPExtension - True if the subtarget supports Multiprocessing /// extension (ARMv7 only). bool HasMPExtension = false; @@ -554,6 +559,7 @@ public: bool cheapPredicableCPSRDef() const { return CheapPredicableCPSRDef; } bool avoidMOVsShifterOperand() const { return AvoidMOVsShifterOperand; } bool hasRetAddrStack() const { return HasRetAddrStack; } + bool hasBranchPredictor() const { return HasBranchPredictor; } bool hasMPExtension() const { return HasMPExtension; } bool hasDSP() const { return HasDSP; } bool useNaClTrap() const { return UseNaClTrap; } diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index eb71e557ec91..c323a1d368de 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -11,11 +11,6 @@ //===----------------------------------------------------------------------===// #include "ARM.h" -#include "ARMCallLowering.h" -#include "ARMLegalizerInfo.h" -#ifdef LLVM_BUILD_GLOBAL_ISEL -#include "ARMRegisterBankInfo.h" -#endif #include "ARMSubtarget.h" #include "ARMMacroFusion.h" #include "ARMTargetMachine.h" @@ -29,7 +24,6 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/ExecutionDepsFix.h" #include "llvm/CodeGen/GlobalISel/CallLowering.h" -#include "llvm/CodeGen/GlobalISel/GISelAccessor.h" #include "llvm/CodeGen/GlobalISel/IRTranslator.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" @@ -110,60 +104,20 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { static ARMBaseTargetMachine::ARMABI computeTargetABI(const Triple &TT, StringRef CPU, const TargetOptions &Options) { - if (Options.MCOptions.getABIName() == "aapcs16") + StringRef ABIName = Options.MCOptions.getABIName(); + + if (ABIName.empty()) + ABIName = ARM::computeDefaultTargetABI(TT, CPU); + + if (ABIName == 
"aapcs16") return ARMBaseTargetMachine::ARM_ABI_AAPCS16; - else if (Options.MCOptions.getABIName().startswith("aapcs")) + else if (ABIName.startswith("aapcs")) return ARMBaseTargetMachine::ARM_ABI_AAPCS; - else if (Options.MCOptions.getABIName().startswith("apcs")) + else if (ABIName.startswith("apcs")) return ARMBaseTargetMachine::ARM_ABI_APCS; - assert(Options.MCOptions.getABIName().empty() && - "Unknown target-abi option!"); - - ARMBaseTargetMachine::ARMABI TargetABI = - ARMBaseTargetMachine::ARM_ABI_UNKNOWN; - - unsigned ArchKind = ARM::parseCPUArch(CPU); - StringRef ArchName = ARM::getArchName(ArchKind); - // FIXME: This is duplicated code from the front end and should be unified. - if (TT.isOSBinFormatMachO()) { - if (TT.getEnvironment() == Triple::EABI || - (TT.getOS() == Triple::UnknownOS && TT.isOSBinFormatMachO()) || - ARM::parseArchProfile(ArchName) == ARM::PK_M) { - TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS; - } else if (TT.isWatchABI()) { - TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS16; - } else { - TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS; - } - } else if (TT.isOSWindows()) { - // FIXME: this is invalid for WindowsCE - TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS; - } else { - // Select the default based on the platform. - switch (TT.getEnvironment()) { - case Triple::Android: - case Triple::GNUEABI: - case Triple::GNUEABIHF: - case Triple::MuslEABI: - case Triple::MuslEABIHF: - case Triple::EABIHF: - case Triple::EABI: - TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS; - break; - case Triple::GNU: - TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS; - break; - default: - if (TT.isOSNetBSD()) - TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS; - else - TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS; - break; - } - } - - return TargetABI; + llvm_unreachable("Unhandled/unknown ABI Name!"); + return ARMBaseTargetMachine::ARM_ABI_UNKNOWN; } static std::string computeDataLayout(const Triple &TT, StringRef CPU, @@ -248,61 +202,39 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT, CPU, FS, Options, getEffectiveRelocModel(TT, RM), CM, OL), TargetABI(computeTargetABI(TT, CPU, Options)), - TLOF(createTLOF(getTargetTriple())), - Subtarget(TT, CPU, FS, *this, isLittle), isLittle(isLittle) { + TLOF(createTLOF(getTargetTriple())), isLittle(isLittle) { // Default to triple-appropriate float ABI - if (Options.FloatABIType == FloatABI::Default) - this->Options.FloatABIType = - Subtarget.isTargetHardFloat() ? 
FloatABI::Hard : FloatABI::Soft; + if (Options.FloatABIType == FloatABI::Default) { + if (TargetTriple.getEnvironment() == Triple::GNUEABIHF || + TargetTriple.getEnvironment() == Triple::MuslEABIHF || + TargetTriple.getEnvironment() == Triple::EABIHF || + TargetTriple.isOSWindows() || + TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS16) + this->Options.FloatABIType = FloatABI::Hard; + else + this->Options.FloatABIType = FloatABI::Soft; + } // Default to triple-appropriate EABI if (Options.EABIVersion == EABI::Default || Options.EABIVersion == EABI::Unknown) { // musl is compatible with glibc with regard to EABI version - if (Subtarget.isTargetGNUAEABI() || Subtarget.isTargetMuslAEABI()) + if ((TargetTriple.getEnvironment() == Triple::GNUEABI || + TargetTriple.getEnvironment() == Triple::GNUEABIHF || + TargetTriple.getEnvironment() == Triple::MuslEABI || + TargetTriple.getEnvironment() == Triple::MuslEABIHF) && + !(TargetTriple.isOSWindows() || TargetTriple.isOSDarwin())) this->Options.EABIVersion = EABI::GNU; else this->Options.EABIVersion = EABI::EABI5; } initAsmInfo(); - if (!Subtarget.isThumb() && !Subtarget.hasARMOps()) - report_fatal_error("CPU: '" + Subtarget.getCPUString() + "' does not " - "support ARM mode execution!"); } ARMBaseTargetMachine::~ARMBaseTargetMachine() = default; -#ifdef LLVM_BUILD_GLOBAL_ISEL -namespace { - -struct ARMGISelActualAccessor : public GISelAccessor { - std::unique_ptr<CallLowering> CallLoweringInfo; - std::unique_ptr<InstructionSelector> InstSelector; - std::unique_ptr<LegalizerInfo> Legalizer; - std::unique_ptr<RegisterBankInfo> RegBankInfo; - - const CallLowering *getCallLowering() const override { - return CallLoweringInfo.get(); - } - - const InstructionSelector *getInstructionSelector() const override { - return InstSelector.get(); - } - - const LegalizerInfo *getLegalizerInfo() const override { - return Legalizer.get(); - } - - const RegisterBankInfo *getRegBankInfo() const override { - return RegBankInfo.get(); - } -}; - -} // end anonymous namespace -#endif - const ARMSubtarget * ARMBaseTargetMachine::getSubtargetImpl(const Function &F) const { Attribute CPUAttr = F.getFnAttribute("target-cpu"); @@ -334,24 +266,6 @@ ARMBaseTargetMachine::getSubtargetImpl(const Function &F) const { // function that reside in TargetOptions. resetTargetOptions(F); I = llvm::make_unique<ARMSubtarget>(TargetTriple, CPU, FS, *this, isLittle); - -#ifndef LLVM_BUILD_GLOBAL_ISEL - GISelAccessor *GISel = new GISelAccessor(); -#else - ARMGISelActualAccessor *GISel = new ARMGISelActualAccessor(); - GISel->CallLoweringInfo.reset(new ARMCallLowering(*I->getTargetLowering())); - GISel->Legalizer.reset(new ARMLegalizerInfo(*I)); - - auto *RBI = new ARMRegisterBankInfo(*I->getRegisterInfo()); - - // FIXME: At this point, we can't rely on Subtarget having RBI. - // It's awkward to mix passing RBI and the Subtarget; should we pass - // TII/TRI as well? 
- GISel->InstSelector.reset(createARMInstructionSelector(*this, *I, *RBI)); - - GISel->RegBankInfo.reset(RBI); -#endif - I->setGISelAccessor(*GISel); } return I.get(); } diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h index 2fcee73228fe..f41da3e8e223 100644 --- a/lib/Target/ARM/ARMTargetMachine.h +++ b/lib/Target/ARM/ARMTargetMachine.h @@ -36,7 +36,6 @@ public: protected: std::unique_ptr<TargetLoweringObjectFile> TLOF; - ARMSubtarget Subtarget; bool isLittle; mutable StringMap<std::unique_ptr<ARMSubtarget>> SubtargetMap; @@ -47,8 +46,8 @@ public: CodeGenOpt::Level OL, bool isLittle); ~ARMBaseTargetMachine() override; - const ARMSubtarget *getSubtargetImpl() const { return &Subtarget; } const ARMSubtarget *getSubtargetImpl(const Function &F) const override; + const ARMSubtarget *getSubtargetImpl() const = delete; bool isLittleEndian() const { return isLittle; } /// \brief Get the TargetIRAnalysis for this target. diff --git a/lib/Target/ARM/ARMTargetObjectFile.cpp b/lib/Target/ARM/ARMTargetObjectFile.cpp index a5b27abeb27f..88bab64ffaf2 100644 --- a/lib/Target/ARM/ARMTargetObjectFile.cpp +++ b/lib/Target/ARM/ARMTargetObjectFile.cpp @@ -32,7 +32,7 @@ void ARMElfTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) { const ARMBaseTargetMachine &ARM_TM = static_cast<const ARMBaseTargetMachine &>(TM); bool isAAPCS_ABI = ARM_TM.TargetABI == ARMBaseTargetMachine::ARMABI::ARM_ABI_AAPCS; - genExecuteOnly = ARM_TM.getSubtargetImpl()->genExecuteOnly(); + // genExecuteOnly = ARM_TM.getSubtargetImpl()->genExecuteOnly(); TargetLoweringObjectFileELF::Initialize(Ctx, TM); InitializeELF(isAAPCS_ABI); @@ -43,16 +43,6 @@ void ARMElfTargetObjectFile::Initialize(MCContext &Ctx, AttributesSection = getContext().getELFSection(".ARM.attributes", ELF::SHT_ARM_ATTRIBUTES, 0); - - // Make code section unreadable when in execute-only mode - if (genExecuteOnly) { - unsigned Type = ELF::SHT_PROGBITS; - unsigned Flags = ELF::SHF_EXECINSTR | ELF::SHF_ALLOC | ELF::SHF_ARM_PURECODE; - // Since we cannot modify flags for an existing section, we create a new - // section with the right flags, and use 0 as the unique ID for - // execute-only text - TextSection = Ctx.getELFSection(".text", Type, Flags, 0, "", 0U); - } } const MCExpr *ARMElfTargetObjectFile::getTTypeGlobalReference( @@ -74,21 +64,27 @@ getDebugThreadLocalSymbol(const MCSymbol *Sym) const { getContext()); } -MCSection * -ARMElfTargetObjectFile::getExplicitSectionGlobal(const GlobalObject *GO, - SectionKind SK, const TargetMachine &TM) const { +static bool isExecuteOnlyFunction(const GlobalObject *GO, SectionKind SK, + const TargetMachine &TM) { + if (const Function *F = dyn_cast<Function>(GO)) + if (TM.getSubtarget<ARMSubtarget>(*F).genExecuteOnly() && SK.isText()) + return true; + return false; +} + +MCSection *ARMElfTargetObjectFile::getExplicitSectionGlobal( + const GlobalObject *GO, SectionKind SK, const TargetMachine &TM) const { // Set execute-only access for the explicit section - if (genExecuteOnly && SK.isText()) + if (isExecuteOnlyFunction(GO, SK, TM)) SK = SectionKind::getExecuteOnly(); return TargetLoweringObjectFileELF::getExplicitSectionGlobal(GO, SK, TM); } -MCSection * -ARMElfTargetObjectFile::SelectSectionForGlobal(const GlobalObject *GO, - SectionKind SK, const TargetMachine &TM) const { +MCSection *ARMElfTargetObjectFile::SelectSectionForGlobal( + const GlobalObject *GO, SectionKind SK, const TargetMachine &TM) const { // Place the global in the execute-only text section - if (genExecuteOnly 
&& SK.isText()) + if (isExecuteOnlyFunction(GO, SK, TM)) SK = SectionKind::getExecuteOnly(); return TargetLoweringObjectFileELF::SelectSectionForGlobal(GO, SK, TM); diff --git a/lib/Target/ARM/ARMTargetObjectFile.h b/lib/Target/ARM/ARMTargetObjectFile.h index dbb8128269dc..bd7aa1cfe02b 100644 --- a/lib/Target/ARM/ARMTargetObjectFile.h +++ b/lib/Target/ARM/ARMTargetObjectFile.h @@ -16,8 +16,6 @@ namespace llvm { class ARMElfTargetObjectFile : public TargetLoweringObjectFileELF { - mutable bool genExecuteOnly = false; - protected: const MCSection *AttributesSection = nullptr; diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index 585726208a8d..5ab236b7fd4c 100644 --- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -486,7 +486,7 @@ DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size, } } - Size = 0; + Size = 4; return MCDisassembler::Fail; } diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index 81760f03940a..22de728fe06e 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -738,13 +738,13 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm, } } -void ARMAsmBackend::processFixupValue(const MCAssembler &Asm, - const MCFixup &Fixup, - const MCValue &Target, bool &IsResolved) { +bool ARMAsmBackend::shouldForceRelocation(const MCAssembler &Asm, + const MCFixup &Fixup, + const MCValue &Target) { const MCSymbolRefExpr *A = Target.getSymA(); const MCSymbol *Sym = A ? &A->getSymbol() : nullptr; const unsigned FixupKind = Fixup.getKind() ; - if (IsResolved && (unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_bl) { + if ((unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_bl) { assert(Sym && "How did we resolve this?"); // If the symbol is external the linker will handle it. @@ -753,7 +753,7 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm, // If the symbol is out of range, produce a relocation and hope the // linker can handle it. GNU AS produces an error in this case. if (Sym->isExternal()) - IsResolved = false; + return true; } // Create relocations for unconditional branches to function symbols with // different execution mode in ELF binaries. @@ -761,12 +761,12 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm, unsigned Type = dyn_cast<MCSymbolELF>(Sym)->getType(); if ((Type == ELF::STT_FUNC || Type == ELF::STT_GNU_IFUNC)) { if (Asm.isThumbFunc(Sym) && (FixupKind == ARM::fixup_arm_uncondbranch)) - IsResolved = false; + return true; if (!Asm.isThumbFunc(Sym) && (FixupKind == ARM::fixup_arm_thumb_br || FixupKind == ARM::fixup_arm_thumb_bl || FixupKind == ARM::fixup_t2_condbranch || FixupKind == ARM::fixup_t2_uncondbranch)) - IsResolved = false; + return true; } } // We must always generate a relocation for BL/BLX instructions if we have @@ -776,7 +776,8 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm, FixupKind == ARM::fixup_arm_blx || FixupKind == ARM::fixup_arm_uncondbl || FixupKind == ARM::fixup_arm_condbl)) - IsResolved = false; + return true; + return false; } /// getFixupKindNumBytes - The number of bytes the fixup may change. 
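The ARMAsmBackend.cpp hunk above (and the AVR hunk further down) replaces the processFixupValue hook, which worked by clearing a mutable IsResolved flag, with shouldForceRelocation: the backend now only answers whether a given fixup must always be emitted as a relocation, and the decision to clear the resolved flag stays with the generic MC layer. A rough sketch of the new contract (illustrative only; MyAsmBackend and the fixup kind are made-up names, not part of this commit):

bool MyAsmBackend::shouldForceRelocation(const MCAssembler &Asm,
                                         const MCFixup &Fixup,
                                         const MCValue &Target) {
  // Return true for fixup kinds that must be left to the linker even when
  // the assembler could resolve them locally; return false to let the
  // generic layer decide from the symbol and layout as usual.
  switch ((unsigned)Fixup.getKind()) {
  default:
    return false;
  case MyTarget::fixup_call:   // hypothetical always-relocated fixup kind
    return true;
  }
}

Unlike the old hook, there is no Value or Layout parameter, so any adjustment of the fixup value now has to happen in adjustFixupValue/applyFixup, which is exactly the move the AVR hunk below makes.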
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h index 6a0ba2ed41c1..84b54bbb9a49 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h +++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h @@ -38,10 +38,8 @@ public: const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; - /// processFixupValue - Target hook to process the literal value of a fixup - /// if necessary. - void processFixupValue(const MCAssembler &Asm, const MCFixup &Fixup, - const MCValue &Target, bool &IsResolved) override; + bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target) override; unsigned adjustFixupValue(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, uint64_t Value, bool IsPCRel, diff --git a/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h b/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h index 9f6c5d7bf920..831589ba0581 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h +++ b/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h @@ -15,55 +15,47 @@ namespace llvm { namespace ARM { enum Fixups { - // fixup_arm_ldst_pcrel_12 - 12-bit PC relative relocation for symbol - // addresses + // 12-bit PC relative relocation for symbol addresses fixup_arm_ldst_pcrel_12 = FirstTargetFixupKind, - // fixup_t2_ldst_pcrel_12 - Equivalent to fixup_arm_ldst_pcrel_12, with - // the 16-bit halfwords reordered. + // Equivalent to fixup_arm_ldst_pcrel_12, with the 16-bit halfwords reordered. fixup_t2_ldst_pcrel_12, - // fixup_arm_pcrel_10_unscaled - 10-bit PC relative relocation for symbol - // addresses used in LDRD/LDRH/LDRB/etc. instructions. All bits are encoded. + // 10-bit PC relative relocation for symbol addresses used in + // LDRD/LDRH/LDRB/etc. instructions. All bits are encoded. fixup_arm_pcrel_10_unscaled, - // fixup_arm_pcrel_10 - 10-bit PC relative relocation for symbol addresses - // used in VFP instructions where the lower 2 bits are not encoded - // (so it's encoded as an 8-bit immediate). + // 10-bit PC relative relocation for symbol addresses used in VFP instructions + // where the lower 2 bits are not encoded (so it's encoded as an 8-bit + // immediate). fixup_arm_pcrel_10, - // fixup_t2_pcrel_10 - Equivalent to fixup_arm_pcrel_10, accounting for - // the short-swapped encoding of Thumb2 instructions. + // Equivalent to fixup_arm_pcrel_10, accounting for the short-swapped encoding + // of Thumb2 instructions. fixup_t2_pcrel_10, - // fixup_arm_pcrel_9 - 9-bit PC relative relocation for symbol addresses - // used in VFP instructions where bit 0 not encoded (so it's encoded as an - // 8-bit immediate). + // 9-bit PC relative relocation for symbol addresses used in VFP instructions + // where bit 0 not encoded (so it's encoded as an 8-bit immediate). fixup_arm_pcrel_9, - // fixup_t2_pcrel_9 - Equivalent to fixup_arm_pcrel_9, accounting for - // the short-swapped encoding of Thumb2 instructions. + // Equivalent to fixup_arm_pcrel_9, accounting for the short-swapped encoding + // of Thumb2 instructions. fixup_t2_pcrel_9, - // fixup_thumb_adr_pcrel_10 - 10-bit PC relative relocation for symbol - // addresses where the lower 2 bits are not encoded (so it's encoded as an - // 8-bit immediate). + // 10-bit PC relative relocation for symbol addresses where the lower 2 bits + // are not encoded (so it's encoded as an 8-bit immediate). fixup_thumb_adr_pcrel_10, - // fixup_arm_adr_pcrel_12 - 12-bit PC relative relocation for the ADR - // instruction. 
+ // 12-bit PC relative relocation for the ADR instruction. fixup_arm_adr_pcrel_12, - // fixup_t2_adr_pcrel_12 - 12-bit PC relative relocation for the ADR - // instruction. + // 12-bit PC relative relocation for the ADR instruction. fixup_t2_adr_pcrel_12, - // fixup_arm_condbranch - 24-bit PC relative relocation for conditional branch - // instructions. + // 24-bit PC relative relocation for conditional branch instructions. fixup_arm_condbranch, - // fixup_arm_uncondbranch - 24-bit PC relative relocation for - // branch instructions. (unconditional) + // 24-bit PC relative relocation for branch instructions. (unconditional) fixup_arm_uncondbranch, - // fixup_t2_condbranch - 20-bit PC relative relocation for Thumb2 direct - // uconditional branch instructions. + // 20-bit PC relative relocation for Thumb2 direct uconditional branch + // instructions. fixup_t2_condbranch, - // fixup_t2_uncondbranch - 20-bit PC relative relocation for Thumb2 direct - // branch unconditional branch instructions. + // 20-bit PC relative relocation for Thumb2 direct branch unconditional branch + // instructions. fixup_t2_uncondbranch, - // fixup_arm_thumb_br - 12-bit fixup for Thumb B instructions. + // 12-bit fixup for Thumb B instructions. fixup_arm_thumb_br, // The following fixups handle the ARM BL instructions. These can be @@ -75,42 +67,41 @@ enum Fixups { // MachO does not draw a distinction between the two cases, so it will treat // fixup_arm_uncondbl and fixup_arm_condbl as identical fixups. - // fixup_arm_uncondbl - Fixup for unconditional ARM BL instructions. + // Fixup for unconditional ARM BL instructions. fixup_arm_uncondbl, - // fixup_arm_condbl - Fixup for ARM BL instructions with nontrivial - // conditionalisation. + // Fixup for ARM BL instructions with nontrivial conditionalisation. fixup_arm_condbl, - // fixup_arm_blx - Fixup for ARM BLX instructions. + // Fixup for ARM BLX instructions. fixup_arm_blx, - // fixup_arm_thumb_bl - Fixup for Thumb BL instructions. + // Fixup for Thumb BL instructions. fixup_arm_thumb_bl, - // fixup_arm_thumb_blx - Fixup for Thumb BLX instructions. + // Fixup for Thumb BLX instructions. fixup_arm_thumb_blx, - // fixup_arm_thumb_cb - Fixup for Thumb branch instructions. + // Fixup for Thumb branch instructions. fixup_arm_thumb_cb, - // fixup_arm_thumb_cp - Fixup for Thumb load/store from constant pool instrs. + // Fixup for Thumb load/store from constant pool instrs. fixup_arm_thumb_cp, - // fixup_arm_thumb_bcc - Fixup for Thumb conditional branching instructions. + // Fixup for Thumb conditional branching instructions. 
fixup_arm_thumb_bcc, // The next two are for the movt/movw pair // the 16bit imm field are split into imm{15-12} and imm{11-0} fixup_arm_movt_hi16, // :upper16: fixup_arm_movw_lo16, // :lower16: - fixup_t2_movt_hi16, // :upper16: - fixup_t2_movw_lo16, // :lower16: + fixup_t2_movt_hi16, // :upper16: + fixup_t2_movw_lo16, // :lower16: - // fixup_arm_mod_imm - Fixup for mod_imm + // Fixup for mod_imm fixup_arm_mod_imm, - // fixup_t2_so_imm - Fixup for Thumb2 8-bit rotated operand + // Fixup for Thumb2 8-bit rotated operand fixup_t2_so_imm, // Marker @@ -118,6 +109,6 @@ enum Fixups { NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind }; } -} +} // namespace llvm #endif diff --git a/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp b/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp index 5c3b45ac2328..d18298385adf 100644 --- a/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp +++ b/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp @@ -230,13 +230,25 @@ void ms8(unsigned Size, const MCFixup &Fixup, uint64_t &Value, namespace llvm { // Prepare value for the target space for it -void AVRAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t &Value, +void AVRAsmBackend::adjustFixupValue(const MCFixup &Fixup, + const MCValue &Target, + uint64_t &Value, MCContext *Ctx) const { // The size of the fixup in bits. uint64_t Size = AVRAsmBackend::getFixupKindInfo(Fixup.getKind()).TargetSize; unsigned Kind = Fixup.getKind(); + // Parsed LLVM-generated temporary labels are already + // adjusted for instruction size, but normal labels aren't. + // + // To handle both cases, we simply un-adjust the temporary label + // case so it acts like all other labels. + if (const MCSymbolRefExpr *A = Target.getSymA()) { + if (A->getSymbol().isTemporary()) + Value += 2; + } + switch (Kind) { default: llvm_unreachable("unhandled fixup"); @@ -333,9 +345,10 @@ MCObjectWriter *AVRAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const { MCELFObjectTargetWriter::getOSABI(OSType)); } -void AVRAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, - bool IsPCRel, MCContext &Ctx) const { +void AVRAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const { + adjustFixupValue(Fixup, Target, Value, &Asm.getContext()); if (Value == 0) return; // Doesn't change encoding. @@ -349,7 +362,7 @@ void AVRAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, Value <<= Info.TargetOffset; unsigned Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); + assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the // bits from the fixup value. @@ -436,30 +449,16 @@ bool AVRAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { return true; } -void AVRAsmBackend::processFixupValue(const MCAssembler &Asm, - const MCAsmLayout &Layout, - const MCFixup &Fixup, - const MCFragment *DF, - const MCValue &Target, uint64_t &Value, - bool &IsResolved) { +bool AVRAsmBackend::shouldForceRelocation(const MCAssembler &Asm, + const MCFixup &Fixup, + const MCValue &Target) { switch ((unsigned) Fixup.getKind()) { + default: return false; // Fixups which should always be recorded as relocations. 
case AVR::fixup_7_pcrel: case AVR::fixup_13_pcrel: case AVR::fixup_call: - IsResolved = false; - break; - default: - // Parsed LLVM-generated temporary labels are already - // adjusted for instruction size, but normal labels aren't. - // - // To handle both cases, we simply un-adjust the temporary label - // case so it acts like all other labels. - if (Target.getSymA()->getSymbol().isTemporary()) - Value += 2; - - adjustFixupValue(Fixup, Value, &Asm.getContext()); - break; + return true; } } diff --git a/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h b/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h index f2be2494684a..4a75e3b0d22d 100644 --- a/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h +++ b/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h @@ -35,13 +35,14 @@ public: AVRAsmBackend(Triple::OSType OSType) : MCAsmBackend(), OSType(OSType) {} - void adjustFixupValue(const MCFixup &Fixup, uint64_t &Value, - MCContext *Ctx = nullptr) const; + void adjustFixupValue(const MCFixup &Fixup, const MCValue &Target, + uint64_t &Value, MCContext *Ctx = nullptr) const; MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override; - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel, MCContext &Ctx) const override; + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const override; const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; @@ -63,10 +64,8 @@ public: bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override; - void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout, - const MCFixup &Fixup, const MCFragment *DF, - const MCValue &Target, uint64_t &Value, - bool &IsResolved) override; + bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target) override; private: Triple::OSType OSType; diff --git a/lib/Target/BPF/BPFISelDAGToDAG.cpp b/lib/Target/BPF/BPFISelDAGToDAG.cpp index c6ddd6bdad5e..f48429ee57b0 100644 --- a/lib/Target/BPF/BPFISelDAGToDAG.cpp +++ b/lib/Target/BPF/BPFISelDAGToDAG.cpp @@ -16,6 +16,7 @@ #include "BPFRegisterInfo.h" #include "BPFSubtarget.h" #include "BPFTargetMachine.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -57,6 +58,11 @@ private: bool SelectAddr(SDValue Addr, SDValue &Base, SDValue &Offset); bool SelectFIAddr(SDValue Addr, SDValue &Base, SDValue &Offset); + // Node preprocessing cases + void PreprocessLoad(SDNode *Node, SelectionDAG::allnodes_iterator I); + void PreprocessCopyToReg(SDNode *Node); + void PreprocessTrunc(SDNode *Node, SelectionDAG::allnodes_iterator I); + // Find constants from a constant structure typedef std::vector<unsigned char> val_vec_type; bool fillGenericConstant(const DataLayout &DL, const Constant *CV, @@ -69,9 +75,12 @@ private: val_vec_type &Vals, int Offset); bool getConstantFieldValue(const GlobalAddressSDNode *Node, uint64_t Offset, uint64_t Size, unsigned char *ByteSeq); + bool checkLoadDef(unsigned DefReg, unsigned match_load_op); // Mapping from ConstantStruct global value to corresponding byte-list values std::map<const void *, val_vec_type> cs_vals_; + // Mapping from vreg to load memory opcode + std::map<unsigned, unsigned> load_to_vreg_; }; } // namespace @@ -203,89 +212,110 @@ void BPFDAGToDAGISel::Select(SDNode *Node) { SelectCode(Node); } +void 
BPFDAGToDAGISel::PreprocessLoad(SDNode *Node, + SelectionDAG::allnodes_iterator I) { + union { + uint8_t c[8]; + uint16_t s; + uint32_t i; + uint64_t d; + } new_val; // hold up the constant values replacing loads. + bool to_replace = false; + SDLoc DL(Node); + const LoadSDNode *LD = cast<LoadSDNode>(Node); + uint64_t size = LD->getMemOperand()->getSize(); + + if (!size || size > 8 || (size & (size - 1))) + return; + + SDNode *LDAddrNode = LD->getOperand(1).getNode(); + // Match LDAddr against either global_addr or (global_addr + offset) + unsigned opcode = LDAddrNode->getOpcode(); + if (opcode == ISD::ADD) { + SDValue OP1 = LDAddrNode->getOperand(0); + SDValue OP2 = LDAddrNode->getOperand(1); + + // We want to find the pattern global_addr + offset + SDNode *OP1N = OP1.getNode(); + if (OP1N->getOpcode() <= ISD::BUILTIN_OP_END || OP1N->getNumOperands() == 0) + return; + + DEBUG(dbgs() << "Check candidate load: "; LD->dump(); dbgs() << '\n'); + + const GlobalAddressSDNode *GADN = + dyn_cast<GlobalAddressSDNode>(OP1N->getOperand(0).getNode()); + const ConstantSDNode *CDN = dyn_cast<ConstantSDNode>(OP2.getNode()); + if (GADN && CDN) + to_replace = + getConstantFieldValue(GADN, CDN->getZExtValue(), size, new_val.c); + } else if (LDAddrNode->getOpcode() > ISD::BUILTIN_OP_END && + LDAddrNode->getNumOperands() > 0) { + DEBUG(dbgs() << "Check candidate load: "; LD->dump(); dbgs() << '\n'); + + SDValue OP1 = LDAddrNode->getOperand(0); + if (const GlobalAddressSDNode *GADN = + dyn_cast<GlobalAddressSDNode>(OP1.getNode())) + to_replace = getConstantFieldValue(GADN, 0, size, new_val.c); + } + + if (!to_replace) + return; + + // replacing the old with a new value + uint64_t val; + if (size == 1) + val = new_val.c[0]; + else if (size == 2) + val = new_val.s; + else if (size == 4) + val = new_val.i; + else { + val = new_val.d; + } + + DEBUG(dbgs() << "Replacing load of size " << size << " with constant " << val + << '\n'); + SDValue NVal = CurDAG->getConstant(val, DL, MVT::i64); + + // After replacement, the current node is dead, we need to + // go backward one step to make iterator still work + I--; + SDValue From[] = {SDValue(Node, 0), SDValue(Node, 1)}; + SDValue To[] = {NVal, NVal}; + CurDAG->ReplaceAllUsesOfValuesWith(From, To, 2); + I++; + // It is safe to delete node now + CurDAG->DeleteNode(Node); +} + void BPFDAGToDAGISel::PreprocessISelDAG() { - // Iterate through all nodes, only interested in loads from ConstantStruct - // ConstantArray should have converted by IR->DAG processing + // Iterate through all nodes, interested in the following cases: + // + // . loads from ConstantStruct or ConstantArray of constructs + // which can be turns into constant itself, with this we can + // avoid reading from read-only section at runtime. + // + // . reg truncating is often the result of 8/16/32bit->64bit or + // 8/16bit->32bit conversion. If the reg value is loaded with + // masked byte width, the AND operation can be removed since + // BPF LOAD already has zero extension. + // + // This also solved a correctness issue. + // In BPF socket-related program, e.g., __sk_buff->{data, data_end} + // are 32-bit registers, but later on, kernel verifier will rewrite + // it with 64-bit value. Therefore, truncating the value after the + // load will result in incorrect code. 
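Before the node loop that follows, here is a rough sketch of the socket-filter idiom the comment above refers to; the __sk_buff declaration is abbreviated for illustration and is not taken from this commit:

struct __sk_buff { unsigned int data; unsigned int data_end; };  // trimmed

int filter(struct __sk_buff *skb)
{
  // Both fields are 32-bit in the program's view, so widening them to
  // pointers introduces a zero-extension. The kernel verifier rewrites the
  // loads to yield full 64-bit pointers, so an AND that masks off the upper
  // half (the truncation removed by PreprocessTrunc) would corrupt the
  // pointer; and since BPF loads already zero-extend, the AND is redundant
  // in the ordinary case as well.
  void *data = (void *)(long)skb->data;
  void *data_end = (void *)(long)skb->data_end;

  if ((char *)data + 14 > (char *)data_end)   // packet bounds check
    return 0;
  return 1;
}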
for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), E = CurDAG->allnodes_end(); I != E;) { SDNode *Node = &*I++; unsigned Opcode = Node->getOpcode(); - if (Opcode != ISD::LOAD) - continue; - - union { - uint8_t c[8]; - uint16_t s; - uint32_t i; - uint64_t d; - } new_val; // hold up the constant values replacing loads. - bool to_replace = false; - SDLoc DL(Node); - const LoadSDNode *LD = cast<LoadSDNode>(Node); - uint64_t size = LD->getMemOperand()->getSize(); - if (!size || size > 8 || (size & (size - 1))) - continue; - - SDNode *LDAddrNode = LD->getOperand(1).getNode(); - // Match LDAddr against either global_addr or (global_addr + offset) - unsigned opcode = LDAddrNode->getOpcode(); - if (opcode == ISD::ADD) { - SDValue OP1 = LDAddrNode->getOperand(0); - SDValue OP2 = LDAddrNode->getOperand(1); - - // We want to find the pattern global_addr + offset - SDNode *OP1N = OP1.getNode(); - if (OP1N->getOpcode() <= ISD::BUILTIN_OP_END || - OP1N->getNumOperands() == 0) - continue; - - DEBUG(dbgs() << "Check candidate load: "; LD->dump(); dbgs() << '\n'); - - const GlobalAddressSDNode *GADN = - dyn_cast<GlobalAddressSDNode>(OP1N->getOperand(0).getNode()); - const ConstantSDNode *CDN = dyn_cast<ConstantSDNode>(OP2.getNode()); - if (GADN && CDN) - to_replace = - getConstantFieldValue(GADN, CDN->getZExtValue(), size, new_val.c); - } else if (LDAddrNode->getOpcode() > ISD::BUILTIN_OP_END && - LDAddrNode->getNumOperands() > 0) { - DEBUG(dbgs() << "Check candidate load: "; LD->dump(); dbgs() << '\n'); - - SDValue OP1 = LDAddrNode->getOperand(0); - if (const GlobalAddressSDNode *GADN = - dyn_cast<GlobalAddressSDNode>(OP1.getNode())) - to_replace = getConstantFieldValue(GADN, 0, size, new_val.c); - } - - if (!to_replace) - continue; - - // replacing the old with a new value - uint64_t val; - if (size == 1) - val = new_val.c[0]; - else if (size == 2) - val = new_val.s; - else if (size == 4) - val = new_val.i; - else { - val = new_val.d; - } - - DEBUG(dbgs() << "Replacing load of size " << size << " with constant " - << val << '\n'); - SDValue NVal = CurDAG->getConstant(val, DL, MVT::i64); - - // After replacement, the current node is dead, we need to - // go backward one step to make iterator still work - I--; - SDValue From[] = {SDValue(Node, 0), SDValue(Node, 1)}; - SDValue To[] = {NVal, NVal}; - CurDAG->ReplaceAllUsesOfValuesWith(From, To, 2); - I++; - // It is safe to delete node now - CurDAG->DeleteNode(Node); + if (Opcode == ISD::LOAD) + PreprocessLoad(Node, I); + else if (Opcode == ISD::CopyToReg) + PreprocessCopyToReg(Node); + else if (Opcode == ISD::AND) + PreprocessTrunc(Node, I); } } @@ -415,6 +445,134 @@ bool BPFDAGToDAGISel::fillConstantStruct(const DataLayout &DL, return true; } +void BPFDAGToDAGISel::PreprocessCopyToReg(SDNode *Node) { + const RegisterSDNode *RegN = dyn_cast<RegisterSDNode>(Node->getOperand(1)); + if (!RegN || !TargetRegisterInfo::isVirtualRegister(RegN->getReg())) + return; + + const LoadSDNode *LD = dyn_cast<LoadSDNode>(Node->getOperand(2)); + if (!LD) + return; + + // Assign a load value to a virtual register. 
record its load width + unsigned mem_load_op = 0; + switch (LD->getMemOperand()->getSize()) { + default: + return; + case 4: + mem_load_op = BPF::LDW; + break; + case 2: + mem_load_op = BPF::LDH; + break; + case 1: + mem_load_op = BPF::LDB; + break; + } + + DEBUG(dbgs() << "Find Load Value to VReg " + << TargetRegisterInfo::virtReg2Index(RegN->getReg()) << '\n'); + load_to_vreg_[RegN->getReg()] = mem_load_op; +} + +void BPFDAGToDAGISel::PreprocessTrunc(SDNode *Node, + SelectionDAG::allnodes_iterator I) { + ConstantSDNode *MaskN = dyn_cast<ConstantSDNode>(Node->getOperand(1)); + if (!MaskN) + return; + + unsigned match_load_op = 0; + switch (MaskN->getZExtValue()) { + default: + return; + case 0xFFFFFFFF: + match_load_op = BPF::LDW; + break; + case 0xFFFF: + match_load_op = BPF::LDH; + break; + case 0xFF: + match_load_op = BPF::LDB; + break; + } + + // The Reg operand should be a virtual register, which is defined + // outside the current basic block. DAG combiner has done a pretty + // good job in removing truncating inside a single basic block. + SDValue BaseV = Node->getOperand(0); + if (BaseV.getOpcode() != ISD::CopyFromReg) + return; + + const RegisterSDNode *RegN = + dyn_cast<RegisterSDNode>(BaseV.getNode()->getOperand(1)); + if (!RegN || !TargetRegisterInfo::isVirtualRegister(RegN->getReg())) + return; + unsigned AndOpReg = RegN->getReg(); + DEBUG(dbgs() << "Examine %vreg" << TargetRegisterInfo::virtReg2Index(AndOpReg) + << '\n'); + + // Examine the PHI insns in the MachineBasicBlock to found out the + // definitions of this virtual register. At this stage (DAG2DAG + // transformation), only PHI machine insns are available in the machine basic + // block. + MachineBasicBlock *MBB = FuncInfo->MBB; + MachineInstr *MII = nullptr; + for (auto &MI : *MBB) { + for (unsigned i = 0; i < MI.getNumOperands(); ++i) { + const MachineOperand &MOP = MI.getOperand(i); + if (!MOP.isReg() || !MOP.isDef()) + continue; + unsigned Reg = MOP.getReg(); + if (TargetRegisterInfo::isVirtualRegister(Reg) && Reg == AndOpReg) { + MII = &MI; + break; + } + } + } + + if (MII == nullptr) { + // No phi definition in this block. + if (!checkLoadDef(AndOpReg, match_load_op)) + return; + } else { + // The PHI node looks like: + // %vreg2<def> = PHI %vreg0, <BB#1>, %vreg1, <BB#3> + // Trace each incoming definition, e.g., (%vreg0, BB#1) and (%vreg1, BB#3) + // The AND operation can be removed if both %vreg0 in BB#1 and %vreg1 in + // BB#3 are defined with with a load matching the MaskN. + DEBUG(dbgs() << "Check PHI Insn: "; MII->dump(); dbgs() << '\n'); + unsigned PrevReg = -1; + for (unsigned i = 0; i < MII->getNumOperands(); ++i) { + const MachineOperand &MOP = MII->getOperand(i); + if (MOP.isReg()) { + if (MOP.isDef()) + continue; + PrevReg = MOP.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(PrevReg)) + return; + if (!checkLoadDef(PrevReg, match_load_op)) + return; + } + } + } + + DEBUG(dbgs() << "Remove the redundant AND operation in: "; Node->dump(); + dbgs() << '\n'); + + I--; + CurDAG->ReplaceAllUsesWith(SDValue(Node, 0), BaseV); + I++; + CurDAG->DeleteNode(Node); +} + +bool BPFDAGToDAGISel::checkLoadDef(unsigned DefReg, unsigned match_load_op) { + auto it = load_to_vreg_.find(DefReg); + if (it == load_to_vreg_.end()) + return false; // The definition of register is not exported yet. 
+ + return it->second == match_load_op; +} + FunctionPass *llvm::createBPFISelDag(BPFTargetMachine &TM) { return new BPFDAGToDAGISel(TM); } diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp index 2b0ceaa66258..97a53dcbaed7 100644 --- a/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -178,8 +178,8 @@ static cl::opt<bool> EnableSaveRestoreLong("enable-save-restore-long", cl::Hidden, cl::desc("Enable long calls for save-restore stubs."), cl::init(false), cl::ZeroOrMore); -static cl::opt<bool> UseAllocframe("use-allocframe", cl::init(true), - cl::Hidden, cl::desc("Use allocframe more conservatively")); +static cl::opt<bool> EliminateFramePointer("hexagon-fp-elim", cl::init(true), + cl::Hidden, cl::desc("Refrain from using FP whenever possible")); static cl::opt<bool> OptimizeSpillSlots("hexagon-opt-spill", cl::Hidden, cl::init(true), cl::desc("Optimize spill slots")); @@ -550,7 +550,6 @@ void HexagonFrameLowering::insertPrologueInBlock(MachineBasicBlock &MBB, auto &HST = MF.getSubtarget<HexagonSubtarget>(); auto &HII = *HST.getInstrInfo(); auto &HRI = *HST.getRegisterInfo(); - DebugLoc dl; unsigned MaxAlign = std::max(MFI.getMaxAlignment(), getStackAlignment()); @@ -584,77 +583,56 @@ void HexagonFrameLowering::insertPrologueInBlock(MachineBasicBlock &MBB, MI->eraseFromParent(); } - if (!hasFP(MF)) - return; - - // Check for overflow. - // Hexagon_TODO: Ugh! hardcoding. Is there an API that can be used? - const unsigned int ALLOCFRAME_MAX = 16384; + DebugLoc dl = MBB.findDebugLoc(InsertPt); - // Create a dummy memory operand to avoid allocframe from being treated as - // a volatile memory reference. - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOStore, - 4, 4); - - if (NumBytes >= ALLOCFRAME_MAX) { - // Emit allocframe(#0). - BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::S2_allocframe)) - .addImm(0) - .addMemOperand(MMO); - - // Subtract offset from frame pointer. - // We use a caller-saved non-parameter register for that. - unsigned CallerSavedReg = HRI.getFirstCallerSavedNonParamReg(); - BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::CONST32), - CallerSavedReg).addImm(NumBytes); - BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_sub), SP) + if (hasFP(MF)) { + insertAllocframe(MBB, InsertPt, NumBytes); + if (AlignStack) { + BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_andir), SP) + .addReg(SP) + .addImm(-int64_t(MaxAlign)); + } + // If the stack-checking is enabled, and we spilled the callee-saved + // registers inline (i.e. did not use a spill function), then call + // the stack checker directly. + if (EnableStackOVFSanitizer && !PrologueStubs) + BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::PS_call_stk)) + .addExternalSymbol("__runtime_stack_check"); + } else if (NumBytes > 0) { + assert(alignTo(NumBytes, 8) == NumBytes); + BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_addi), SP) .addReg(SP) - .addReg(CallerSavedReg); - } else { - BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::S2_allocframe)) - .addImm(NumBytes) - .addMemOperand(MMO); + .addImm(-int(NumBytes)); } - - if (AlignStack) { - BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_andir), SP) - .addReg(SP) - .addImm(-int64_t(MaxAlign)); - } - - // If the stack-checking is enabled, and we spilled the callee-saved - // registers inline (i.e. did not use a spill function), then call - // the stack checker directly. 
- if (EnableStackOVFSanitizer && !PrologueStubs) - BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::PS_call_stk)) - .addExternalSymbol("__runtime_stack_check"); } void HexagonFrameLowering::insertEpilogueInBlock(MachineBasicBlock &MBB) const { MachineFunction &MF = *MBB.getParent(); - if (!hasFP(MF)) - return; - auto &HST = MF.getSubtarget<HexagonSubtarget>(); auto &HII = *HST.getInstrInfo(); auto &HRI = *HST.getRegisterInfo(); unsigned SP = HRI.getStackRegister(); + MachineBasicBlock::iterator InsertPt = MBB.getFirstTerminator(); + DebugLoc dl = MBB.findDebugLoc(InsertPt); + + if (!hasFP(MF)) { + MachineFrameInfo &MFI = MF.getFrameInfo(); + if (unsigned NumBytes = MFI.getStackSize()) { + BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_addi), SP) + .addReg(SP) + .addImm(NumBytes); + } + return; + } + MachineInstr *RetI = getReturn(MBB); unsigned RetOpc = RetI ? RetI->getOpcode() : 0; - MachineBasicBlock::iterator InsertPt = MBB.getFirstTerminator(); - DebugLoc DL; - if (InsertPt != MBB.end()) - DL = InsertPt->getDebugLoc(); - else if (!MBB.empty()) - DL = std::prev(MBB.end())->getDebugLoc(); - // Handle EH_RETURN. if (RetOpc == Hexagon::EH_RETURN_JMPR) { - BuildMI(MBB, InsertPt, DL, HII.get(Hexagon::L2_deallocframe)); - BuildMI(MBB, InsertPt, DL, HII.get(Hexagon::A2_add), SP) + BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::L2_deallocframe)); + BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_add), SP) .addReg(SP) .addReg(Hexagon::R28); return; @@ -699,16 +677,52 @@ void HexagonFrameLowering::insertEpilogueInBlock(MachineBasicBlock &MBB) const { // otherwise just add deallocframe. The function could be returning via a // tail call. if (RetOpc != Hexagon::PS_jmpret || DisableDeallocRet) { - BuildMI(MBB, InsertPt, DL, HII.get(Hexagon::L2_deallocframe)); + BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::L2_deallocframe)); return; } unsigned NewOpc = Hexagon::L4_return; - MachineInstr *NewI = BuildMI(MBB, RetI, DL, HII.get(NewOpc)); + MachineInstr *NewI = BuildMI(MBB, RetI, dl, HII.get(NewOpc)); // Transfer the function live-out registers. NewI->copyImplicitOps(MF, *RetI); MBB.erase(RetI); } +void HexagonFrameLowering::insertAllocframe(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertPt, unsigned NumBytes) const { + MachineFunction &MF = *MBB.getParent(); + auto &HST = MF.getSubtarget<HexagonSubtarget>(); + auto &HII = *HST.getInstrInfo(); + auto &HRI = *HST.getRegisterInfo(); + + // Check for overflow. + // Hexagon_TODO: Ugh! hardcoding. Is there an API that can be used? + const unsigned int ALLOCFRAME_MAX = 16384; + + // Create a dummy memory operand to avoid allocframe from being treated as + // a volatile memory reference. + auto *MMO = MF.getMachineMemOperand(MachinePointerInfo::getStack(MF, 0), + MachineMemOperand::MOStore, 4, 4); + + DebugLoc dl = MBB.findDebugLoc(InsertPt); + + if (NumBytes >= ALLOCFRAME_MAX) { + // Emit allocframe(#0). + BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::S2_allocframe)) + .addImm(0) + .addMemOperand(MMO); + + // Subtract the size from the stack pointer. 
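The new insertAllocframe helper keeps the old overflow handling: allocframe's immediate cannot encode frames of 16384 bytes or more, so large frames are allocated as allocframe(#0) followed by an explicit stack-pointer adjustment. A small standalone sketch of that decision; the printed mnemonics are illustrative only:

#include <cstdio>

// Sketch of the size check in insertAllocframe above.
constexpr unsigned AllocframeMax = 16384; // hardcoded limit from the patch

void emitAllocframe(unsigned NumBytes) {
  if (NumBytes >= AllocframeMax) {
    std::printf("allocframe(#0)\n");
    std::printf("r29 = add(r29, #-%u)\n", NumBytes); // explicit SP adjustment
  } else {
    std::printf("allocframe(#%u)\n", NumBytes);
  }
}

int main() {
  emitAllocframe(256);   // small frame: single allocframe
  emitAllocframe(20000); // large frame: allocframe(#0) + SP add
}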
+ unsigned SP = HRI.getStackRegister(); + BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_addi), SP) + .addReg(SP) + .addImm(-int(NumBytes)); + } else { + BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::S2_allocframe)) + .addImm(NumBytes) + .addMemOperand(MMO); + } +} + void HexagonFrameLowering::updateEntryPaths(MachineFunction &MF, MachineBasicBlock &SaveB) const { SetVector<unsigned> Worklist; @@ -928,12 +942,11 @@ void HexagonFrameLowering::insertCFIInstructionsAt(MachineBasicBlock &MBB, } bool HexagonFrameLowering::hasFP(const MachineFunction &MF) const { + if (MF.getFunction()->hasFnAttribute(Attribute::Naked)) + return false; + auto &MFI = MF.getFrameInfo(); auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo(); - - bool HasFixed = MFI.getNumFixedObjects(); - bool HasPrealloc = const_cast<MachineFrameInfo&>(MFI) - .getLocalFrameObjectCount(); bool HasExtraAlign = HRI.needsStackRealignment(MF); bool HasAlloca = MFI.hasVarSizedObjects(); @@ -947,18 +960,35 @@ bool HexagonFrameLowering::hasFP(const MachineFunction &MF) const { // By default we want to use SP (since it's always there). FP requires // some setup (i.e. ALLOCFRAME). - // Fixed and preallocated objects need FP if the distance from them to - // the SP is unknown (as is with alloca or aligna). - if ((HasFixed || HasPrealloc) && (HasAlloca || HasExtraAlign)) + // Both, alloca and stack alignment modify the stack pointer by an + // undetermined value, so we need to save it at the entry to the function + // (i.e. use allocframe). + if (HasAlloca || HasExtraAlign) return true; if (MFI.getStackSize() > 0) { - if (EnableStackOVFSanitizer || UseAllocframe) + // If FP-elimination is disabled, we have to use FP at this point. + const TargetMachine &TM = MF.getTarget(); + if (TM.Options.DisableFramePointerElim(MF) || !EliminateFramePointer) + return true; + if (EnableStackOVFSanitizer) return true; } - if (MFI.hasCalls() || - MF.getInfo<HexagonMachineFunctionInfo>()->hasClobberLR()) + const auto &HMFI = *MF.getInfo<HexagonMachineFunctionInfo>(); + if (MFI.hasCalls() || HMFI.hasClobberLR()) + return true; + + // Frame pointer elimination is a possiblility at this point, but + // to know if FP is necessary we need to know if spill/restore + // functions will be used (they require FP to be valid). + // This means that hasFP shouldn't really be called before CSI is + // calculated, and some measures are taken to make sure of that + // (e.g. default implementations of virtual functions that call it + // are overridden apropriately). + assert(MFI.isCalleeSavedInfoValid() && "Need to know CSI"); + const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo(); + if (useSpillFunction(MF, CSI) || useRestoreFunction(MF, CSI)) return true; return false; @@ -1051,9 +1081,10 @@ int HexagonFrameLowering::getFrameIndexReference(const MachineFunction &MF, bool HasExtraAlign = HRI.needsStackRealignment(MF); bool NoOpt = MF.getTarget().getOptLevel() == CodeGenOpt::None; - unsigned FrameSize = MFI.getStackSize(); - unsigned SP = HRI.getStackRegister(), FP = HRI.getFrameRegister(); auto &HMFI = *MF.getInfo<HexagonMachineFunctionInfo>(); + unsigned FrameSize = MFI.getStackSize(); + unsigned SP = HRI.getStackRegister(); + unsigned FP = HRI.getFrameRegister(); unsigned AP = HMFI.getStackAlignBasePhysReg(); // It may happen that AP will be absent even HasAlloca && HasExtraAlign // is true. 
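Condensed model of the frame-pointer policy that the reworked hasFP above implements, with plain booleans standing in for the MachineFunction and MachineFrameInfo queries; this is a sketch for orientation, not the real interface:

// Returns true when the Hexagon prologue must set up a frame pointer
// (i.e. emit allocframe) under the policy introduced in this patch.
bool needsFramePointer(bool IsNaked, bool HasAlloca, bool NeedsRealign,
                       unsigned StackSize, bool FPElimDisabled,
                       bool StackCheck, bool HasCallsOrClobbersLR,
                       bool UsesSpillOrRestoreFunctions) {
  if (IsNaked)
    return false;                     // naked functions get no frame setup
  if (HasAlloca || NeedsRealign)
    return true;                      // SP moves by an unknown amount
  if (StackSize > 0 && (FPElimDisabled || StackCheck))
    return true;                      // FP elimination disabled, or stack checker
  if (HasCallsOrClobbersLR)
    return true;                      // LR must be saved via allocframe
  return UsesSpillOrRestoreFunctions; // spill/restore routines require a valid FP
}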
HasExtraAlign may be set because of vector spills, without @@ -1135,7 +1166,7 @@ int HexagonFrameLowering::getFrameIndexReference(const MachineFunction &MF, // there will be no SP -= FrameSize), so the frame size should not be // added to the calculated offset. int RealOffset = Offset; - if (!UseFP && !UseAP && HasFP) + if (!UseFP && !UseAP) RealOffset = FrameSize+Offset; return RealOffset; } @@ -2402,7 +2433,7 @@ void HexagonFrameLowering::addCalleeSaveRegistersAsImpOperand(MachineInstr *MI, /// be generated via inline code. If this function returns "true", inline /// code will be generated. If this function returns "false", additional /// checks are performed, which may still lead to the inline code. -bool HexagonFrameLowering::shouldInlineCSR(MachineFunction &MF, +bool HexagonFrameLowering::shouldInlineCSR(const MachineFunction &MF, const CSIVect &CSI) const { if (MF.getInfo<HexagonMachineFunctionInfo>()->hasEHReturn()) return true; @@ -2432,7 +2463,7 @@ bool HexagonFrameLowering::shouldInlineCSR(MachineFunction &MF, return false; } -bool HexagonFrameLowering::useSpillFunction(MachineFunction &MF, +bool HexagonFrameLowering::useSpillFunction(const MachineFunction &MF, const CSIVect &CSI) const { if (shouldInlineCSR(MF, CSI)) return false; @@ -2445,7 +2476,7 @@ bool HexagonFrameLowering::useSpillFunction(MachineFunction &MF, return Threshold < NumCSI; } -bool HexagonFrameLowering::useRestoreFunction(MachineFunction &MF, +bool HexagonFrameLowering::useRestoreFunction(const MachineFunction &MF, const CSIVect &CSI) const { if (shouldInlineCSR(MF, CSI)) return false; diff --git a/lib/Target/Hexagon/HexagonFrameLowering.h b/lib/Target/Hexagon/HexagonFrameLowering.h index 529a61d4a5b5..f4d4e1b61a26 100644 --- a/lib/Target/Hexagon/HexagonFrameLowering.h +++ b/lib/Target/Hexagon/HexagonFrameLowering.h @@ -48,6 +48,15 @@ public: return true; } + bool hasReservedCallFrame(const MachineFunction &MF) const override { + // We always reserve call frame as a part of the initial stack allocation. + return true; + } + bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override { + // Override this function to avoid calling hasFP before CSI is set + // (the default implementation calls hasFP). 
+ return true; + } MachineBasicBlock::iterator eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const override; @@ -94,6 +103,8 @@ private: unsigned SP, unsigned CF) const; void insertPrologueInBlock(MachineBasicBlock &MBB, bool PrologueStubs) const; void insertEpilogueInBlock(MachineBasicBlock &MBB) const; + void insertAllocframe(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertPt, unsigned NumBytes) const; bool insertCSRSpillsInBlock(MachineBasicBlock &MBB, const CSIVect &CSI, const HexagonRegisterInfo &HRI, bool &PrologueStubs) const; bool insertCSRRestoresInBlock(MachineBasicBlock &MBB, const CSIVect &CSI, @@ -148,9 +159,9 @@ private: void addCalleeSaveRegistersAsImpOperand(MachineInstr *MI, const CSIVect &CSI, bool IsDef, bool IsKill) const; - bool shouldInlineCSR(MachineFunction &MF, const CSIVect &CSI) const; - bool useSpillFunction(MachineFunction &MF, const CSIVect &CSI) const; - bool useRestoreFunction(MachineFunction &MF, const CSIVect &CSI) const; + bool shouldInlineCSR(const MachineFunction &MF, const CSIVect &CSI) const; + bool useSpillFunction(const MachineFunction &MF, const CSIVect &CSI) const; + bool useRestoreFunction(const MachineFunction &MF, const CSIVect &CSI) const; bool mayOverflowFrameOffset(MachineFunction &MF) const; }; diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp index afed894cfb9a..2daacf795555 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -1002,51 +1002,46 @@ bool HexagonTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue HexagonTargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const { - SDNode *Node = Op.getNode(); MachineFunction &MF = DAG.getMachineFunction(); - auto &FuncInfo = *MF.getInfo<HexagonMachineFunctionInfo>(); - switch (Node->getOpcode()) { - case ISD::INLINEASM: { - unsigned NumOps = Node->getNumOperands(); - if (Node->getOperand(NumOps-1).getValueType() == MVT::Glue) - --NumOps; // Ignore the flag operand. - - for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) { - if (FuncInfo.hasClobberLR()) - break; - unsigned Flags = - cast<ConstantSDNode>(Node->getOperand(i))->getZExtValue(); - unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags); - ++i; // Skip the ID value. - - switch (InlineAsm::getKind(Flags)) { - default: llvm_unreachable("Bad flags!"); - case InlineAsm::Kind_RegDef: - case InlineAsm::Kind_RegUse: - case InlineAsm::Kind_Imm: - case InlineAsm::Kind_Clobber: - case InlineAsm::Kind_Mem: { - for (; NumVals; --NumVals, ++i) {} - break; - } - case InlineAsm::Kind_RegDefEarlyClobber: { - for (; NumVals; --NumVals, ++i) { - unsigned Reg = - cast<RegisterSDNode>(Node->getOperand(i))->getReg(); - - // Check it to be lr - const HexagonRegisterInfo *QRI = Subtarget.getRegisterInfo(); - if (Reg == QRI->getRARegister()) { - FuncInfo.setHasClobberLR(true); - break; - } - } - break; - } + auto &HMFI = *MF.getInfo<HexagonMachineFunctionInfo>(); + const HexagonRegisterInfo &HRI = *Subtarget.getRegisterInfo(); + unsigned LR = HRI.getRARegister(); + + if (Op.getOpcode() != ISD::INLINEASM || HMFI.hasClobberLR()) + return Op; + + unsigned NumOps = Op.getNumOperands(); + if (Op.getOperand(NumOps-1).getValueType() == MVT::Glue) + --NumOps; // Ignore the flag operand. 
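With hasReservedCallFrame returning true, the ADJCALLSTACKDOWN/ADJCALLSTACKUP markers carry no stack adjustment of their own. A minimal sketch of what eliminateCallFramePseudoInstr commonly reduces to in that situation; this assumes the usual LLVM CodeGen headers and a hypothetical MyFrameLowering class, and is not copied from the Hexagon sources:

MachineBasicBlock::iterator
MyFrameLowering::eliminateCallFramePseudoInstr(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator I) const {
  // The call frame was reserved as part of the initial stack allocation,
  // so the pseudo marker needs no code and is simply removed.
  return MBB.erase(I);
}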
+ + for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) { + unsigned Flags = cast<ConstantSDNode>(Op.getOperand(i))->getZExtValue(); + unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags); + ++i; // Skip the ID value. + + switch (InlineAsm::getKind(Flags)) { + default: + llvm_unreachable("Bad flags!"); + case InlineAsm::Kind_RegUse: + case InlineAsm::Kind_Imm: + case InlineAsm::Kind_Mem: + i += NumVals; + break; + case InlineAsm::Kind_Clobber: + case InlineAsm::Kind_RegDef: + case InlineAsm::Kind_RegDefEarlyClobber: { + for (; NumVals; --NumVals, ++i) { + unsigned Reg = cast<RegisterSDNode>(Op.getOperand(i))->getReg(); + if (Reg != LR) + continue; + HMFI.setHasClobberLR(true); + return Op; } + break; } } - } // Node->getOpcode + } + return Op; } diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp index fec2dc5ce306..1eac2d3dd8e2 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -1253,10 +1253,16 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { LivePhysRegs LiveAtMI(HRI); getLiveRegsAt(LiveAtMI, MI); bool IsDestLive = !LiveAtMI.available(MRI, Op0.getReg()); + unsigned PReg = Op1.getReg(); + assert(Op1.getSubReg() == 0); + unsigned PState = getRegState(Op1); + if (Op0.getReg() != Op2.getReg()) { + unsigned S = Op0.getReg() != Op3.getReg() ? PState & ~RegState::Kill + : PState; auto T = BuildMI(MBB, MI, DL, get(Hexagon::V6_vcmov)) .add(Op0) - .add(Op1) + .addReg(PReg, S) .add(Op2); if (IsDestLive) T.addReg(Op0.getReg(), RegState::Implicit); @@ -1265,7 +1271,7 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { if (Op0.getReg() != Op3.getReg()) { auto T = BuildMI(MBB, MI, DL, get(Hexagon::V6_vncmov)) .add(Op0) - .add(Op1) + .addReg(PReg, PState) .add(Op3); if (IsDestLive) T.addReg(Op0.getReg(), RegState::Implicit); @@ -1282,12 +1288,18 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { LivePhysRegs LiveAtMI(HRI); getLiveRegsAt(LiveAtMI, MI); bool IsDestLive = !LiveAtMI.available(MRI, Op0.getReg()); + unsigned PReg = Op1.getReg(); + assert(Op1.getSubReg() == 0); + unsigned PState = getRegState(Op1); if (Op0.getReg() != Op2.getReg()) { + unsigned S = Op0.getReg() != Op3.getReg() ? 
PState & ~RegState::Kill + : PState; unsigned SrcLo = HRI.getSubReg(Op2.getReg(), Hexagon::vsub_lo); unsigned SrcHi = HRI.getSubReg(Op2.getReg(), Hexagon::vsub_hi); auto T = BuildMI(MBB, MI, DL, get(Hexagon::V6_vccombine)) .add(Op0) + .addReg(PReg, S) .add(Op1) .addReg(SrcHi) .addReg(SrcLo); @@ -1300,7 +1312,7 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { unsigned SrcHi = HRI.getSubReg(Op3.getReg(), Hexagon::vsub_hi); auto T = BuildMI(MBB, MI, DL, get(Hexagon::V6_vnccombine)) .add(Op0) - .add(Op1) + .addReg(PReg, PState) .addReg(SrcHi) .addReg(SrcLo); if (IsDestLive) diff --git a/lib/Target/Hexagon/HexagonNewValueJump.cpp b/lib/Target/Hexagon/HexagonNewValueJump.cpp index de6b203015d8..e93f075f4ccd 100644 --- a/lib/Target/Hexagon/HexagonNewValueJump.cpp +++ b/lib/Target/Hexagon/HexagonNewValueJump.cpp @@ -69,9 +69,7 @@ namespace { public: static char ID; - HexagonNewValueJump() : MachineFunctionPass(ID) { - initializeHexagonNewValueJumpPass(*PassRegistry::getPassRegistry()); - } + HexagonNewValueJump() : MachineFunctionPass(ID) {} void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<MachineBranchProbabilityInfo>(); @@ -445,8 +443,6 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) { unsigned predReg = 0; // predicate reg of the jump. unsigned cmpReg1 = 0; int cmpOp2 = 0; - bool MO1IsKill = false; - bool MO2IsKill = false; MachineBasicBlock::iterator jmpPos; MachineBasicBlock::iterator cmpPos; MachineInstr *cmpInstr = nullptr, *jmpInstr = nullptr; @@ -548,14 +544,10 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) { // We need cmpReg1 and cmpOp2(imm or reg) while building // new value jump instruction. cmpReg1 = MI.getOperand(1).getReg(); - if (MI.getOperand(1).isKill()) - MO1IsKill = true; - if (isSecondOpReg) { + if (isSecondOpReg) cmpOp2 = MI.getOperand(2).getReg(); - if (MI.getOperand(2).isKill()) - MO2IsKill = true; - } else + else cmpOp2 = MI.getOperand(2).getImm(); continue; } @@ -605,11 +597,8 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) { if ((COp == Hexagon::C2_cmpeq || COp == Hexagon::C4_cmpneq) && (feederReg == (unsigned) cmpOp2)) { unsigned tmp = cmpReg1; - bool tmpIsKill = MO1IsKill; cmpReg1 = cmpOp2; - MO1IsKill = MO2IsKill; cmpOp2 = tmp; - MO2IsKill = tmpIsKill; } // Now we have swapped the operands, all we need to check is, @@ -623,31 +612,33 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) { // make sure we are respecting the kill values of // the operands of the feeder. 
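The rewritten LowerINLINEASM walks the INLINEASM operand list group by group: each group starts with a flag word giving its kind and register count, and only def, early-clobber def, and clobber groups can mark LR as clobbered. A plain-data model of that scan; the GroupKind values here are simplified stand-ins for the real InlineAsm flag-word encoding:

#include <vector>

enum GroupKind { Use, Imm, Mem, Def, EarlyClobberDef, Clobber };

struct Group {
  GroupKind Kind;
  std::vector<unsigned> Regs; // registers belonging to this operand group
};

// Returns true if any group that writes or clobbers registers mentions LR.
static bool asmClobbersLinkReg(const std::vector<Group> &Groups, unsigned LR) {
  for (const Group &G : Groups) {
    if (G.Kind != Def && G.Kind != EarlyClobberDef && G.Kind != Clobber)
      continue; // uses, immediates and memory operands cannot clobber LR
    for (unsigned Reg : G.Regs)
      if (Reg == LR)
        return true;
  }
  return false;
}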
- bool updatedIsKill = false; - for (unsigned i = 0; i < MI.getNumOperands(); i++) { - MachineOperand &MO = MI.getOperand(i); - if (MO.isReg() && MO.isUse()) { - unsigned feederReg = MO.getReg(); - for (MachineBasicBlock::iterator localII = feederPos, - end = cmpInstr->getIterator(); localII != end; localII++) { - MachineInstr &localMI = *localII; - for (unsigned j = 0; j < localMI.getNumOperands(); j++) { - MachineOperand &localMO = localMI.getOperand(j); - if (localMO.isReg() && localMO.isUse() && - localMO.isKill() && feederReg == localMO.getReg()) { - // We found that there is kill of a use register - // Set up a kill flag on the register - localMO.setIsKill(false); - MO.setIsKill(); - updatedIsKill = true; - break; - } + auto TransferKills = [jmpPos,cmpPos] (MachineInstr &MI) { + for (MachineOperand &MO : MI.operands()) { + if (!MO.isReg() || !MO.isUse()) + continue; + unsigned UseR = MO.getReg(); + for (auto I = std::next(MI.getIterator()); I != jmpPos; ++I) { + if (I == cmpPos) + continue; + for (MachineOperand &Op : I->operands()) { + if (!Op.isReg() || !Op.isUse() || !Op.isKill()) + continue; + if (Op.getReg() != UseR) + continue; + // We found that there is kill of a use register + // Set up a kill flag on the register + Op.setIsKill(false); + MO.setIsKill(true); + return; } - if (updatedIsKill) break; } } - if (updatedIsKill) break; - } + }; + + TransferKills(*feederPos); + TransferKills(*cmpPos); + bool MO1IsKill = cmpPos->killsRegister(cmpReg1, QRI); + bool MO2IsKill = isSecondOpReg && cmpPos->killsRegister(cmpOp2, QRI); MBB->splice(jmpPos, MI.getParent(), MI); MBB->splice(jmpPos, MI.getParent(), cmpInstr); diff --git a/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/lib/Target/Hexagon/HexagonOptAddrMode.cpp index 27b40f134b1f..a331c978f59d 100644 --- a/lib/Target/Hexagon/HexagonOptAddrMode.cpp +++ b/lib/Target/Hexagon/HexagonOptAddrMode.cpp @@ -535,9 +535,9 @@ bool HexagonOptAddrMode::processBlock(NodeAddr<BlockNode *> BA) { !MI->getOperand(1).isGlobal()) continue; - DEBUG(dbgs() << "[Analyzing A2_tfrsi]: " << *MI << "\n"); - DEBUG(dbgs() << "\t[InstrNode]: " << Print<NodeAddr<InstrNode *>>(IA, *DFG) - << "\n"); + DEBUG(dbgs() << "[Analyzing " << HII->getName(MI->getOpcode()) << "]: " + << *MI << "\n\t[InstrNode]: " + << Print<NodeAddr<InstrNode *>>(IA, *DFG) << '\n'); NodeList UNodeList; getAllRealUses(SA, UNodeList); @@ -605,7 +605,9 @@ bool HexagonOptAddrMode::runOnMachineFunction(MachineFunction &MF) { const TargetOperandInfo TOI(*HII); DataFlowGraph G(MF, *HII, TRI, *MDT, MDF, TOI); - G.build(); + // Need to keep dead phis because we can propagate uses of registers into + // nodes dominated by those would-be phis. 
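Both the predicate-state handling in expandPostRAPseudo and the TransferKills lambda above apply the same rule: when a single reader of a register becomes several readers, only the last one may keep the kill flag. A one-line helper expressing that rule, assuming RegState from llvm/CodeGen/MachineInstrBuilder.h:

// Strip the kill flag from a register-use state if another use of the same
// register follows later; only the final use may kill the register.
static unsigned stateForUse(unsigned OrigState, bool AnotherUseFollows) {
  return AnotherUseFollows ? (OrigState & ~RegState::Kill) : OrigState;
}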
+ G.build(BuildOptions::KeepDeadPhis); DFG = &G; Liveness L(MRI, *DFG); diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp index 031a1bdefafb..76d9b31b005f 100644 --- a/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -113,6 +113,7 @@ namespace llvm { void initializeHexagonLoopIdiomRecognizePass(PassRegistry&); void initializeHexagonGenMuxPass(PassRegistry&); void initializeHexagonOptAddrModePass(PassRegistry&); + void initializeHexagonNewValueJumpPass(PassRegistry&); Pass *createHexagonLoopIdiomPass(); FunctionPass *createHexagonBitSimplify(); @@ -158,6 +159,7 @@ extern "C" void LLVMInitializeHexagonTarget() { initializeHexagonLoopIdiomRecognizePass(PR); initializeHexagonGenMuxPass(PR); initializeHexagonOptAddrModePass(PR); + initializeHexagonNewValueJumpPass(PR); } HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT, diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp index 4dacb1501392..34df2ebcc520 100644 --- a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp +++ b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp @@ -49,6 +49,10 @@ static cl::opt<bool> TraceGVPlacement("trace-gv-placement", cl::Hidden, cl::init(false), cl::desc("Trace global value placement")); +static cl::opt<bool> + EmitJtInText("hexagon-emit-jt-text", cl::Hidden, cl::init(false), + cl::desc("Emit hexagon jump tables in function section")); + // TraceGVPlacement controls messages for all builds. For builds with assertions // (debug or release), messages are also controlled by the usual debug flags // (e.g. -debug and -debug-only=globallayout) @@ -256,6 +260,11 @@ unsigned HexagonTargetObjectFile::getSmallDataSize() const { return SmallDataThreshold; } +bool HexagonTargetObjectFile::shouldPutJumpTableInFunctionSection( + bool UsesLabelDifference, const Function &F) const { + return EmitJtInText; +} + /// Descends any type down to "elementary" components, /// discovering the smallest addressable one. /// If zero is returned, declaration will not be modified. diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.h b/lib/Target/Hexagon/HexagonTargetObjectFile.h index 58dff2b95e19..373d850b53be 100644 --- a/lib/Target/Hexagon/HexagonTargetObjectFile.h +++ b/lib/Target/Hexagon/HexagonTargetObjectFile.h @@ -33,6 +33,9 @@ namespace llvm { unsigned getSmallDataSize() const; + bool shouldPutJumpTableInFunctionSection(bool UsesLabelDifference, + const Function &F) const override; + private: MCSectionELF *SmallDataSection; MCSectionELF *SmallBSSSection; diff --git a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp index d578bfab3658..aac810e29fe9 100644 --- a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -21,6 +21,10 @@ using namespace llvm; #define DEBUG_TYPE "hexagontti" +static cl::opt<bool> EmitLookupTables("hexagon-emit-lookup-tables", + cl::init(true), cl::Hidden, + cl::desc("Control lookup table emission on Hexagon target")); + TargetTransformInfo::PopcntSupportKind HexagonTTIImpl::getPopcntSupport(unsigned IntTyWidthInBit) const { // Return Fast Hardware support as every input < 64 bits will be promoted @@ -29,7 +33,7 @@ HexagonTTIImpl::getPopcntSupport(unsigned IntTyWidthInBit) const { } // The Hexagon target can unroll loops with run-time trip counts. 
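Both new Hexagon options are ordinary cl::opt flags, so they can be toggled from the llc command line without rebuilding; as an illustration (the flag names come from this patch, the rest of the invocation is only an example):

  llc -march=hexagon -hexagon-emit-jt-text foo.ll -o foo.s
  llc -march=hexagon -hexagon-emit-lookup-tables=false foo.ll -o foo.s

The first keeps switch jump tables in the function's text section (the default stays off); the second suppresses switch lookup-table formation for Hexagon (the default stays on).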
-void HexagonTTIImpl::getUnrollingPreferences(Loop *L, +void HexagonTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP) { UP.Runtime = UP.Partial = true; } @@ -46,8 +50,9 @@ unsigned HexagonTTIImpl::getCacheLineSize() const { return getST()->getL1CacheLineSize(); } -int HexagonTTIImpl::getUserCost(const User *U) { - auto isCastFoldedIntoLoad = [] (const CastInst *CI) -> bool { +int HexagonTTIImpl::getUserCost(const User *U, + ArrayRef<const Value *> Operands) { + auto isCastFoldedIntoLoad = [](const CastInst *CI) -> bool { if (!CI->isIntegerCast()) return false; const LoadInst *LI = dyn_cast<const LoadInst>(CI->getOperand(0)); @@ -67,5 +72,9 @@ int HexagonTTIImpl::getUserCost(const User *U) { if (const CastInst *CI = dyn_cast<const CastInst>(U)) if (isCastFoldedIntoLoad(CI)) return TargetTransformInfo::TCC_Free; - return BaseT::getUserCost(U); + return BaseT::getUserCost(U, Operands); +} + +bool HexagonTTIImpl::shouldBuildLookupTables() const { + return EmitLookupTables; } diff --git a/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/lib/Target/Hexagon/HexagonTargetTransformInfo.h index 8414bfc4e197..ab5a6e07d873 100644 --- a/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ b/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -46,7 +46,8 @@ public: TTI::PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const; // The Hexagon target can unroll loops with run-time trip counts. - void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); + void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, + TTI::UnrollingPreferences &UP); // L1 cache prefetch. unsigned getPrefetchDistance() const; @@ -61,7 +62,10 @@ public: /// @} - int getUserCost(const User *U); + int getUserCost(const User *U, ArrayRef<const Value *> Operands); + + // Hexagon specific decision to generate a lookup table. + bool shouldBuildLookupTables() const; }; } // end namespace llvm diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp index 093ce80bc2e3..34d0b55aa22a 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp @@ -199,11 +199,8 @@ public: return Infos[Kind - FirstTargetFixupKind]; } - /// processFixupValue - Target hook to adjust the literal value of a fixup - /// if necessary. IsResolved signals whether the caller believes a relocation - /// is needed; the target can modify the value. The default does nothing. - void processFixupValue(const MCAssembler &Asm, const MCFixup &Fixup, - const MCValue &Target, bool &IsResolved) override { + bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target) override { MCFixupKind Kind = Fixup.getKind(); switch((unsigned)Kind) { @@ -299,8 +296,7 @@ public: case fixup_Hexagon_LD_PLT_B22_PCREL_X: case fixup_Hexagon_LD_PLT_B32_PCREL_X: // These relocations should always have a relocation recorded - IsResolved = false; - return; + return true; case fixup_Hexagon_B22_PCREL: //IsResolved = false; @@ -317,7 +313,7 @@ public: case fixup_Hexagon_B7_PCREL: case fixup_Hexagon_B7_PCREL_X: if (DisableFixup) - IsResolved = false; + return true; break; case FK_Data_1: @@ -326,8 +322,9 @@ public: case FK_PCRel_4: case fixup_Hexagon_32: // Leave these relocations alone as they are used for EH. - return; + return false; } + return false; } /// getFixupKindNumBytes - The number of bytes the fixup may change. 
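shouldForceRelocation replaces the old processFixupValue hook: instead of clearing IsResolved, the backend returns true when a relocation must be recorded even though the assembler could resolve the fixup. A sketch of an override on a made-up backend; MyAsmBackend and MyTarget::fixup_plt_call are hypothetical names:

bool MyAsmBackend::shouldForceRelocation(const MCAssembler &Asm,
                                         const MCFixup &Fixup,
                                         const MCValue &Target) {
  switch ((unsigned)Fixup.getKind()) {
  case MyTarget::fixup_plt_call: // PLT/GOT-style fixups always need a reloc
    return true;
  default:
    return false;                // let the assembler resolve it if it can
  }
}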
diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 9d5c179a0fd9..69b1ba1528d0 100644 --- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -2789,6 +2789,7 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr, bool Is32BitSym, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { + // FIXME: These expansions do not respect -mxgot. MipsTargetStreamer &TOut = getTargetStreamer(); bool UseSrcReg = SrcReg != Mips::NoRegister; warnIfNoMacro(IDLoc); @@ -2808,8 +2809,12 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr, // symbol in the final relocation is external and not modified with a // constant then we must use R_MIPS_CALL16 instead of R_MIPS_GOT16. if ((DstReg == Mips::T9 || DstReg == Mips::T9_64) && !UseSrcReg && - Res.getConstant() == 0 && !Res.getSymA()->getSymbol().isInSection() && - !Res.getSymA()->getSymbol().isTemporary()) { + Res.getConstant() == 0 && + !(Res.getSymA()->getSymbol().isInSection() || + Res.getSymA()->getSymbol().isTemporary() || + (Res.getSymA()->getSymbol().isELF() && + cast<MCSymbolELF>(Res.getSymA()->getSymbol()).getBinding() == + ELF::STB_LOCAL))) { const MCExpr *CallExpr = MipsMCExpr::create(MipsMCExpr::MEK_GOT_CALL, SymExpr, getContext()); TOut.emitRRX(Mips::LW, DstReg, ABI.GetGlobalPtr(), @@ -2865,6 +2870,85 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr, return false; } + if (inPicMode() && ABI.ArePtrs64bit()) { + MCValue Res; + if (!SymExpr->evaluateAsRelocatable(Res, nullptr, nullptr)) { + Error(IDLoc, "expected relocatable expression"); + return true; + } + if (Res.getSymB() != nullptr) { + Error(IDLoc, "expected relocatable expression with only one symbol"); + return true; + } + + // The case where the result register is $25 is somewhat special. If the + // symbol in the final relocation is external and not modified with a + // constant then we must use R_MIPS_CALL16 instead of R_MIPS_GOT_DISP. + if ((DstReg == Mips::T9 || DstReg == Mips::T9_64) && !UseSrcReg && + Res.getConstant() == 0 && + !(Res.getSymA()->getSymbol().isInSection() || + Res.getSymA()->getSymbol().isTemporary() || + (Res.getSymA()->getSymbol().isELF() && + cast<MCSymbolELF>(Res.getSymA()->getSymbol()).getBinding() == + ELF::STB_LOCAL))) { + const MCExpr *CallExpr = + MipsMCExpr::create(MipsMCExpr::MEK_GOT_CALL, SymExpr, getContext()); + TOut.emitRRX(Mips::LD, DstReg, ABI.GetGlobalPtr(), + MCOperand::createExpr(CallExpr), IDLoc, STI); + return false; + } + + // The remaining cases are: + // Small offset: ld $tmp, %got_disp(symbol)($gp) + // >daddiu $tmp, $tmp, offset + // >daddu $rd, $tmp, $rs + // The daddiu's marked with a '>' may be omitted if they are redundant. If + // this happens then the last instruction must use $rd as the result + // register. + const MipsMCExpr *GotExpr = MipsMCExpr::create(MipsMCExpr::MEK_GOT_DISP, + Res.getSymA(), + getContext()); + const MCExpr *LoExpr = nullptr; + if (Res.getConstant() != 0) { + // Symbols fully resolve with just the %got_disp(symbol) but we + // must still account for any offset to the symbol for + // expressions like symbol+8. + LoExpr = MCConstantExpr::create(Res.getConstant(), getContext()); + + // FIXME: Offsets greater than 16 bits are not yet implemented. + // FIXME: The correct range is a 32-bit sign-extended number. 
+ if (Res.getConstant() < -0x8000 || Res.getConstant() > 0x7fff) { + Error(IDLoc, "macro instruction uses large offset, which is not " + "currently supported"); + return true; + } + } + + unsigned TmpReg = DstReg; + if (UseSrcReg && + getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg, + SrcReg)) { + // If $rs is the same as $rd, we need to use AT. + // If it is not available we exit. + unsigned ATReg = getATReg(IDLoc); + if (!ATReg) + return true; + TmpReg = ATReg; + } + + TOut.emitRRX(Mips::LD, TmpReg, ABI.GetGlobalPtr(), + MCOperand::createExpr(GotExpr), IDLoc, STI); + + if (LoExpr) + TOut.emitRRX(Mips::DADDiu, TmpReg, TmpReg, MCOperand::createExpr(LoExpr), + IDLoc, STI); + + if (UseSrcReg) + TOut.emitRRR(Mips::DADDu, DstReg, TmpReg, SrcReg, IDLoc, STI); + + return false; + } + const MipsMCExpr *HiExpr = MipsMCExpr::create(MipsMCExpr::MEK_HI, SymExpr, getContext()); const MipsMCExpr *LoExpr = diff --git a/lib/Target/Mips/MicroMips64r6InstrInfo.td b/lib/Target/Mips/MicroMips64r6InstrInfo.td index 6b7f39e9dd79..38b09d105ddd 100644 --- a/lib/Target/Mips/MicroMips64r6InstrInfo.td +++ b/lib/Target/Mips/MicroMips64r6InstrInfo.td @@ -548,3 +548,15 @@ def : MipsInstAlias<"dnegu $rt, $rs", def : MipsInstAlias<"dnegu $rt", (DSUBU_MM64R6 GPR64Opnd:$rt, ZERO_64, GPR64Opnd:$rt), 1>, ISA_MICROMIPS64R6; +def : MipsInstAlias<"dsll $rd, $rt, $rs", + (DSLLV_MM64R6 GPR64Opnd:$rd, GPR64Opnd:$rt, + GPR32Opnd:$rs), 0>, ISA_MICROMIPS64R6; +def : MipsInstAlias<"dsrl $rd, $rt, $rs", + (DSRLV_MM64R6 GPR64Opnd:$rd, GPR64Opnd:$rt, + GPR32Opnd:$rs), 0>, ISA_MICROMIPS64R6; +def : MipsInstAlias<"dsrl $rd, $rt", + (DSRLV_MM64R6 GPR64Opnd:$rd, GPR64Opnd:$rd, + GPR32Opnd:$rt), 0>, ISA_MICROMIPS64R6; +def : MipsInstAlias<"dsll $rd, $rt", + (DSLLV_MM64R6 GPR64Opnd:$rd, GPR64Opnd:$rd, + GPR32Opnd:$rt), 0>, ISA_MICROMIPS64R6; diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td index 99025fe1341d..3dba7ce30cad 100644 --- a/lib/Target/Mips/Mips64InstrInfo.td +++ b/lib/Target/Mips/Mips64InstrInfo.td @@ -748,9 +748,6 @@ let AdditionalPredicates = [NotInMicroMips] in { defm : OneOrTwoOperandMacroImmediateAlias<"xor", XORi64, GPR64Opnd, imm64>, GPR_64; } -def : MipsInstAlias<"dsll $rd, $rt, $rs", - (DSLLV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>, - ISA_MIPS3; let AdditionalPredicates = [NotInMicroMips] in { def : MipsInstAlias<"dneg $rt, $rs", (DSUB GPR64Opnd:$rt, ZERO_64, GPR64Opnd:$rs), 1>, @@ -793,9 +790,18 @@ def : MipsInstAlias<"dsra $rd, $rt, $rs", (DSRAV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>, ISA_MIPS3; let AdditionalPredicates = [NotInMicroMips] in { + def : MipsInstAlias<"dsll $rd, $rt, $rs", + (DSLLV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>, + ISA_MIPS3; def : MipsInstAlias<"dsrl $rd, $rt, $rs", (DSRLV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>, ISA_MIPS3; + def : MipsInstAlias<"dsrl $rd, $rt", + (DSRLV GPR64Opnd:$rd, GPR64Opnd:$rd, GPR32Opnd:$rt), 0>, + ISA_MIPS3; + def : MipsInstAlias<"dsll $rd, $rt", + (DSLLV GPR64Opnd:$rd, GPR64Opnd:$rd, GPR32Opnd:$rt), 0>, + ISA_MIPS3; // Two operand (implicit 0 selector) versions: def : MipsInstAlias<"dmtc0 $rt, $rd", diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp index 5d82571ff94f..4a34e3101cb8 100644 --- a/lib/Target/Mips/MipsDelaySlotFiller.cpp +++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp @@ -564,7 +564,7 @@ Iter Filler::replaceWithCompactBranch(MachineBasicBlock &MBB, Iter Branch, // For given opcode returns opcode of corresponding instruction with 
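For N64 PIC, the expansion added above turns a la/dla macro into a %got_disp load plus optional offset and index additions. A standalone sketch that just prints the shape of that expansion for dla $4, sym+8($5); it ignores the $25/%call16 special case and the $at fallback, and the register names are only an example:

#include <cstdio>

void expandDla(const char *Rd, const char *Sym, int Offset, const char *Rs) {
  std::printf("ld     %s, %%got_disp(%s)($gp)\n", Rd, Sym);
  if (Offset != 0)                            // offset must fit in 16 bits
    std::printf("daddiu %s, %s, %d\n", Rd, Rd, Offset);
  if (Rs)                                     // base-register variant
    std::printf("daddu  %s, %s, %s\n", Rd, Rd, Rs);
}

int main() { expandDla("$4", "sym", 8, "$5"); }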
short // delay slot. -// For the pseudo TAILCALL*_MM instrunctions return the short delay slot +// For the pseudo TAILCALL*_MM instructions return the short delay slot // form. Unfortunately, TAILCALL<->b16 is denied as b16 has a limited range // that is too short to make use of for tail calls. static int getEquivalentCallShort(int Opcode) { diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index 02102d6b22f4..a6ec9fb2e598 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -364,18 +364,6 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, setOperationAction(ISD::UDIV, MVT::i64, Expand); setOperationAction(ISD::UREM, MVT::i64, Expand); - if (!(Subtarget.hasDSP() && Subtarget.hasMips32r2())) { - setOperationAction(ISD::ADDC, MVT::i32, Expand); - setOperationAction(ISD::ADDE, MVT::i32, Expand); - } - - setOperationAction(ISD::ADDC, MVT::i64, Expand); - setOperationAction(ISD::ADDE, MVT::i64, Expand); - setOperationAction(ISD::SUBC, MVT::i32, Expand); - setOperationAction(ISD::SUBE, MVT::i32, Expand); - setOperationAction(ISD::SUBC, MVT::i64, Expand); - setOperationAction(ISD::SUBE, MVT::i64, Expand); - // Operations not directly supported by Mips. setOperationAction(ISD::BR_CC, MVT::f32, Expand); setOperationAction(ISD::BR_CC, MVT::f64, Expand); @@ -481,7 +469,6 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, setTargetDAGCombine(ISD::AND); setTargetDAGCombine(ISD::OR); setTargetDAGCombine(ISD::ADD); - setTargetDAGCombine(ISD::SUB); setTargetDAGCombine(ISD::AssertZext); setTargetDAGCombine(ISD::SHL); @@ -936,130 +923,14 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG, } } -static SDValue performMADD_MSUBCombine(SDNode *ROOTNode, SelectionDAG &CurDAG, - const MipsSubtarget &Subtarget) { - // ROOTNode must have a multiplication as an operand for the match to be - // successful. - if (ROOTNode->getOperand(0).getOpcode() != ISD::MUL && - ROOTNode->getOperand(1).getOpcode() != ISD::MUL) - return SDValue(); - - // We don't handle vector types here. - if (ROOTNode->getValueType(0).isVector()) - return SDValue(); - - // For MIPS64, madd / msub instructions are inefficent to use with 64 bit - // arithmetic. E.g. - // (add (mul a b) c) => - // let res = (madd (mthi (drotr c 32))x(mtlo c) a b) in - // MIPS64: (or (dsll (mfhi res) 32) (dsrl (dsll (mflo res) 32) 32) - // or - // MIPS64R2: (dins (mflo res) (mfhi res) 32 32) - // - // The overhead of setting up the Hi/Lo registers and reassembling the - // result makes this a dubious optimzation for MIPS64. The core of the - // problem is that Hi/Lo contain the upper and lower 32 bits of the - // operand and result. - // - // It requires a chain of 4 add/mul for MIPS64R2 to get better code - // density than doing it naively, 5 for MIPS64. Additionally, using - // madd/msub on MIPS64 requires the operands actually be 32 bit sign - // extended operands, not true 64 bit values. - // - // FIXME: For the moment, disable this completely for MIPS64. - if (Subtarget.hasMips64()) - return SDValue(); - - SDValue Mult = ROOTNode->getOperand(0).getOpcode() == ISD::MUL - ? ROOTNode->getOperand(0) - : ROOTNode->getOperand(1); - - SDValue AddOperand = ROOTNode->getOperand(0).getOpcode() == ISD::MUL - ? ROOTNode->getOperand(1) - : ROOTNode->getOperand(0); - - // Transform this to a MADD only if the user of this node is the add. - // If there are other users of the mul, this function returns here. 
- if (!Mult.hasOneUse()) - return SDValue(); - - // maddu and madd are unusual instructions in that on MIPS64 bits 63..31 - // must be in canonical form, i.e. sign extended. For MIPS32, the operands - // of the multiply must have 32 or more sign bits, otherwise we cannot - // perform this optimization. We have to check this here as we're performing - // this optimization pre-legalization. - SDValue MultLHS = Mult->getOperand(0); - SDValue MultRHS = Mult->getOperand(1); - unsigned LHSSB = CurDAG.ComputeNumSignBits(MultLHS); - unsigned RHSSB = CurDAG.ComputeNumSignBits(MultRHS); - - if (LHSSB < 32 || RHSSB < 32) - return SDValue(); - - APInt HighMask = - APInt::getHighBitsSet(Mult->getValueType(0).getScalarSizeInBits(), 32); - bool IsUnsigned = CurDAG.MaskedValueIsZero(Mult->getOperand(0), HighMask) && - CurDAG.MaskedValueIsZero(Mult->getOperand(1), HighMask) && - CurDAG.MaskedValueIsZero(AddOperand, HighMask); - - // Initialize accumulator. - SDLoc DL(ROOTNode); - SDValue TopHalf; - SDValue BottomHalf; - BottomHalf = CurDAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, AddOperand, - CurDAG.getIntPtrConstant(0, DL)); - - TopHalf = CurDAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, AddOperand, - CurDAG.getIntPtrConstant(1, DL)); - SDValue ACCIn = CurDAG.getNode(MipsISD::MTLOHI, DL, MVT::Untyped, - BottomHalf, - TopHalf); - - // Create MipsMAdd(u) / MipsMSub(u) node. - bool IsAdd = ROOTNode->getOpcode() == ISD::ADD; - unsigned Opcode = IsAdd ? (IsUnsigned ? MipsISD::MAddu : MipsISD::MAdd) - : (IsUnsigned ? MipsISD::MSubu : MipsISD::MSub); - SDValue MAddOps[3] = { - CurDAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mult->getOperand(0)), - CurDAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mult->getOperand(1)), ACCIn}; - EVT VTs[2] = {MVT::i32, MVT::i32}; - SDValue MAdd = CurDAG.getNode(Opcode, DL, VTs, MAddOps); - - SDValue ResLo = CurDAG.getNode(MipsISD::MFLO, DL, MVT::i32, MAdd); - SDValue ResHi = CurDAG.getNode(MipsISD::MFHI, DL, MVT::i32, MAdd); - SDValue Combined = - CurDAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResLo, ResHi); - return Combined; -} - -static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const MipsSubtarget &Subtarget) { - // (sub v0 (mul v1, v2)) => (msub v1, v2, v0) - if (DCI.isBeforeLegalizeOps()) { - if (Subtarget.hasMips32() && !Subtarget.hasMips32r6() && - !Subtarget.inMips16Mode() && N->getValueType(0) == MVT::i64) - return performMADD_MSUBCombine(N, DAG, Subtarget); - - return SDValue(); - } - - return SDValue(); -} - static SDValue performADDCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const MipsSubtarget &Subtarget) { - // (add v0 (mul v1, v2)) => (madd v1, v2, v0) - if (DCI.isBeforeLegalizeOps()) { - if (Subtarget.hasMips32() && !Subtarget.hasMips32r6() && - !Subtarget.inMips16Mode() && N->getValueType(0) == MVT::i64) - return performMADD_MSUBCombine(N, DAG, Subtarget); + // (add v0, (add v1, abs_lo(tjt))) => (add (add v0, v1), abs_lo(tjt)) + if (DCI.isBeforeLegalizeOps()) return SDValue(); - } - // (add v0, (add v1, abs_lo(tjt))) => (add (add v0, v1), abs_lo(tjt)) SDValue Add = N->getOperand(1); if (Add.getOpcode() != ISD::ADD) @@ -1187,8 +1058,6 @@ SDValue MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) return performAssertZextCombine(N, DAG, DCI, Subtarget); case ISD::SHL: return performSHLCombine(N, DAG, DCI, Subtarget); - case ISD::SUB: - return performSUBCombine(N, DAG, DCI, Subtarget); } return SDValue(); diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp 
b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp index 4be26dd25dc0..49ae6dd4cd39 100644 --- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp +++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp @@ -245,64 +245,46 @@ void MipsSEDAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) { } } -void MipsSEDAGToDAGISel::selectAddE(SDNode *Node, const SDLoc &DL) const { - SDValue InFlag = Node->getOperand(2); - unsigned Opc = InFlag.getOpcode(); +void MipsSEDAGToDAGISel::selectAddESubE(unsigned MOp, SDValue InFlag, + SDValue CmpLHS, const SDLoc &DL, + SDNode *Node) const { + unsigned Opc = InFlag.getOpcode(); (void)Opc; + + assert(((Opc == ISD::ADDC || Opc == ISD::ADDE) || + (Opc == ISD::SUBC || Opc == ISD::SUBE)) && + "(ADD|SUB)E flag operand must come from (ADD|SUB)C/E insn"); + + unsigned SLTuOp = Mips::SLTu, ADDuOp = Mips::ADDu; + if (Subtarget->isGP64bit()) { + SLTuOp = Mips::SLTu64; + ADDuOp = Mips::DADDu; + } + + SDValue Ops[] = { CmpLHS, InFlag.getOperand(1) }; SDValue LHS = Node->getOperand(0), RHS = Node->getOperand(1); EVT VT = LHS.getValueType(); - // In the base case, we can rely on the carry bit from the addsc - // instruction. - if (Opc == ISD::ADDC) { - SDValue Ops[3] = {LHS, RHS, InFlag}; - CurDAG->SelectNodeTo(Node, Mips::ADDWC, VT, MVT::Glue, Ops); - return; + SDNode *Carry = CurDAG->getMachineNode(SLTuOp, DL, VT, Ops); + + if (Subtarget->isGP64bit()) { + // On 64-bit targets, sltu produces an i64 but our backend currently says + // that SLTu64 produces an i32. We need to fix this in the long run but for + // now, just make the DAG type-correct by asserting the upper bits are zero. + Carry = CurDAG->getMachineNode(Mips::SUBREG_TO_REG, DL, VT, + CurDAG->getTargetConstant(0, DL, VT), + SDValue(Carry, 0), + CurDAG->getTargetConstant(Mips::sub_32, DL, + VT)); } - assert(Opc == ISD::ADDE && "ISD::ADDE not in a chain of ADDE nodes!"); - - // The more complex case is when there is a chain of ISD::ADDE nodes like: - // (adde (adde (adde (addc a b) c) d) e). - // - // The addwc instruction does not write to the carry bit, instead it writes - // to bit 20 of the dsp control register. To match this series of nodes, each - // intermediate adde node must be expanded to write the carry bit before the - // addition. - - // Start by reading the overflow field for addsc and moving the value to the - // carry field. The usage of 1 here with MipsISD::RDDSP / Mips::WRDSP - // corresponds to reading/writing the entire control register to/from a GPR. - - SDValue CstOne = CurDAG->getTargetConstant(1, DL, MVT::i32); - - SDValue OuFlag = CurDAG->getTargetConstant(20, DL, MVT::i32); - - SDNode *DSPCtrlField = - CurDAG->getMachineNode(Mips::RDDSP, DL, MVT::i32, MVT::Glue, CstOne, InFlag); - - SDNode *Carry = CurDAG->getMachineNode( - Mips::EXT, DL, MVT::i32, SDValue(DSPCtrlField, 0), OuFlag, CstOne); + // Generate a second addition only if we know that RHS is not a + // constant-zero node. + SDNode *AddCarry = Carry; + ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS); + if (!C || C->getZExtValue()) + AddCarry = CurDAG->getMachineNode(ADDuOp, DL, VT, SDValue(Carry, 0), RHS); - SDValue Ops[4] = {SDValue(DSPCtrlField, 0), - CurDAG->getTargetConstant(6, DL, MVT::i32), CstOne, - SDValue(Carry, 0)}; - SDNode *DSPCFWithCarry = CurDAG->getMachineNode(Mips::INS, DL, MVT::i32, Ops); - - // My reading of the the MIPS DSP 3.01 specification isn't as clear as I - // would like about whether bit 20 always gets overwritten by addwc. - // Hence take an extremely conservative view and presume it's sticky. 
We - // therefore need to clear it. - - SDValue Zero = CurDAG->getRegister(Mips::ZERO, MVT::i32); - - SDValue InsOps[4] = {Zero, OuFlag, CstOne, SDValue(DSPCFWithCarry, 0)}; - SDNode *DSPCtrlFinal = CurDAG->getMachineNode(Mips::INS, DL, MVT::i32, InsOps); - - SDNode *WrDSP = CurDAG->getMachineNode(Mips::WRDSP, DL, MVT::Glue, - SDValue(DSPCtrlFinal, 0), CstOne); - - SDValue Operands[3] = {LHS, RHS, SDValue(WrDSP, 0)}; - CurDAG->SelectNodeTo(Node, Mips::ADDWC, VT, MVT::Glue, Operands); + CurDAG->SelectNodeTo(Node, MOp, VT, MVT::Glue, LHS, SDValue(AddCarry, 0)); } /// Match frameindex @@ -783,8 +765,19 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) { switch(Opcode) { default: break; + case ISD::SUBE: { + SDValue InFlag = Node->getOperand(2); + unsigned Opc = Subtarget->isGP64bit() ? Mips::DSUBu : Mips::SUBu; + selectAddESubE(Opc, InFlag, InFlag.getOperand(0), DL, Node); + return true; + } + case ISD::ADDE: { - selectAddE(Node, DL); + if (Subtarget->hasDSP()) // Select DSP instructions, ADDSC and ADDWC. + break; + SDValue InFlag = Node->getOperand(2); + unsigned Opc = Subtarget->isGP64bit() ? Mips::DADDu : Mips::ADDu; + selectAddESubE(Opc, InFlag, InFlag.getValue(0), DL, Node); return true; } diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.h b/lib/Target/Mips/MipsSEISelDAGToDAG.h index 6f38289c5a45..f89a350cab04 100644 --- a/lib/Target/Mips/MipsSEISelDAGToDAG.h +++ b/lib/Target/Mips/MipsSEISelDAGToDAG.h @@ -41,7 +41,8 @@ private: const SDLoc &dl, EVT Ty, bool HasLo, bool HasHi); - void selectAddE(SDNode *Node, const SDLoc &DL) const; + void selectAddESubE(unsigned MOp, SDValue InFlag, SDValue CmpLHS, + const SDLoc &DL, SDNode *Node) const; bool selectAddrFrameIndex(SDValue Addr, SDValue &Base, SDValue &Offset) const; bool selectAddrFrameIndexOffset(SDValue Addr, SDValue &Base, SDValue &Offset, diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp index b57bceb3c837..06a97b9d123e 100644 --- a/lib/Target/Mips/MipsSEISelLowering.cpp +++ b/lib/Target/Mips/MipsSEISelLowering.cpp @@ -179,6 +179,8 @@ MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM, setOperationAction(ISD::LOAD, MVT::i32, Custom); setOperationAction(ISD::STORE, MVT::i32, Custom); + setTargetDAGCombine(ISD::ADDE); + setTargetDAGCombine(ISD::SUBE); setTargetDAGCombine(ISD::MUL); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); @@ -419,6 +421,163 @@ SDValue MipsSETargetLowering::LowerOperation(SDValue Op, return MipsTargetLowering::LowerOperation(Op, DAG); } +// selectMADD - +// Transforms a subgraph in CurDAG if the following pattern is found: +// (addc multLo, Lo0), (adde multHi, Hi0), +// where, +// multHi/Lo: product of multiplication +// Lo0: initial value of Lo register +// Hi0: initial value of Hi register +// Return true if pattern matching was successful. +static bool selectMADD(SDNode *ADDENode, SelectionDAG *CurDAG) { + // ADDENode's second operand must be a flag output of an ADDC node in order + // for the matching to be successful. + SDNode *ADDCNode = ADDENode->getOperand(2).getNode(); + + if (ADDCNode->getOpcode() != ISD::ADDC) + return false; + + SDValue MultHi = ADDENode->getOperand(0); + SDValue MultLo = ADDCNode->getOperand(0); + SDNode *MultNode = MultHi.getNode(); + unsigned MultOpc = MultHi.getOpcode(); + + // MultHi and MultLo must be generated by the same node, + if (MultLo.getNode() != MultNode) + return false; + + // and it must be a multiplication. 
+ if (MultOpc != ISD::SMUL_LOHI && MultOpc != ISD::UMUL_LOHI) + return false; + + // MultLo amd MultHi must be the first and second output of MultNode + // respectively. + if (MultHi.getResNo() != 1 || MultLo.getResNo() != 0) + return false; + + // Transform this to a MADD only if ADDENode and ADDCNode are the only users + // of the values of MultNode, in which case MultNode will be removed in later + // phases. + // If there exist users other than ADDENode or ADDCNode, this function returns + // here, which will result in MultNode being mapped to a single MULT + // instruction node rather than a pair of MULT and MADD instructions being + // produced. + if (!MultHi.hasOneUse() || !MultLo.hasOneUse()) + return false; + + SDLoc DL(ADDENode); + + // Initialize accumulator. + SDValue ACCIn = CurDAG->getNode(MipsISD::MTLOHI, DL, MVT::Untyped, + ADDCNode->getOperand(1), + ADDENode->getOperand(1)); + + // create MipsMAdd(u) node + MultOpc = MultOpc == ISD::UMUL_LOHI ? MipsISD::MAddu : MipsISD::MAdd; + + SDValue MAdd = CurDAG->getNode(MultOpc, DL, MVT::Untyped, + MultNode->getOperand(0),// Factor 0 + MultNode->getOperand(1),// Factor 1 + ACCIn); + + // replace uses of adde and addc here + if (!SDValue(ADDCNode, 0).use_empty()) { + SDValue LoOut = CurDAG->getNode(MipsISD::MFLO, DL, MVT::i32, MAdd); + CurDAG->ReplaceAllUsesOfValueWith(SDValue(ADDCNode, 0), LoOut); + } + if (!SDValue(ADDENode, 0).use_empty()) { + SDValue HiOut = CurDAG->getNode(MipsISD::MFHI, DL, MVT::i32, MAdd); + CurDAG->ReplaceAllUsesOfValueWith(SDValue(ADDENode, 0), HiOut); + } + + return true; +} + +// selectMSUB - +// Transforms a subgraph in CurDAG if the following pattern is found: +// (addc Lo0, multLo), (sube Hi0, multHi), +// where, +// multHi/Lo: product of multiplication +// Lo0: initial value of Lo register +// Hi0: initial value of Hi register +// Return true if pattern matching was successful. +static bool selectMSUB(SDNode *SUBENode, SelectionDAG *CurDAG) { + // SUBENode's second operand must be a flag output of an SUBC node in order + // for the matching to be successful. + SDNode *SUBCNode = SUBENode->getOperand(2).getNode(); + + if (SUBCNode->getOpcode() != ISD::SUBC) + return false; + + SDValue MultHi = SUBENode->getOperand(1); + SDValue MultLo = SUBCNode->getOperand(1); + SDNode *MultNode = MultHi.getNode(); + unsigned MultOpc = MultHi.getOpcode(); + + // MultHi and MultLo must be generated by the same node, + if (MultLo.getNode() != MultNode) + return false; + + // and it must be a multiplication. + if (MultOpc != ISD::SMUL_LOHI && MultOpc != ISD::UMUL_LOHI) + return false; + + // MultLo amd MultHi must be the first and second output of MultNode + // respectively. + if (MultHi.getResNo() != 1 || MultLo.getResNo() != 0) + return false; + + // Transform this to a MSUB only if SUBENode and SUBCNode are the only users + // of the values of MultNode, in which case MultNode will be removed in later + // phases. + // If there exist users other than SUBENode or SUBCNode, this function returns + // here, which will result in MultNode being mapped to a single MULT + // instruction node rather than a pair of MULT and MSUB instructions being + // produced. + if (!MultHi.hasOneUse() || !MultLo.hasOneUse()) + return false; + + SDLoc DL(SUBENode); + + // Initialize accumulator. + SDValue ACCIn = CurDAG->getNode(MipsISD::MTLOHI, DL, MVT::Untyped, + SUBCNode->getOperand(0), + SUBENode->getOperand(0)); + + // create MipsSub(u) node + MultOpc = MultOpc == ISD::UMUL_LOHI ? 
MipsISD::MSubu : MipsISD::MSub; + + SDValue MSub = CurDAG->getNode(MultOpc, DL, MVT::Glue, + MultNode->getOperand(0),// Factor 0 + MultNode->getOperand(1),// Factor 1 + ACCIn); + + // replace uses of sube and subc here + if (!SDValue(SUBCNode, 0).use_empty()) { + SDValue LoOut = CurDAG->getNode(MipsISD::MFLO, DL, MVT::i32, MSub); + CurDAG->ReplaceAllUsesOfValueWith(SDValue(SUBCNode, 0), LoOut); + } + if (!SDValue(SUBENode, 0).use_empty()) { + SDValue HiOut = CurDAG->getNode(MipsISD::MFHI, DL, MVT::i32, MSub); + CurDAG->ReplaceAllUsesOfValueWith(SDValue(SUBENode, 0), HiOut); + } + + return true; +} + +static SDValue performADDECombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const MipsSubtarget &Subtarget) { + if (DCI.isBeforeLegalize()) + return SDValue(); + + if (Subtarget.hasMips32() && !Subtarget.hasMips32r6() && + N->getValueType(0) == MVT::i32 && selectMADD(N, &DAG)) + return SDValue(N, 0); + + return SDValue(); +} + // Fold zero extensions into MipsISD::VEXTRACT_[SZ]EXT_ELT // // Performs the following transformations: @@ -661,6 +820,19 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue performSUBECombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const MipsSubtarget &Subtarget) { + if (DCI.isBeforeLegalize()) + return SDValue(); + + if (Subtarget.hasMips32() && N->getValueType(0) == MVT::i32 && + selectMSUB(N, &DAG)) + return SDValue(N, 0); + + return SDValue(); +} + static SDValue genConstMult(SDValue X, uint64_t C, const SDLoc &DL, EVT VT, EVT ShiftTy, SelectionDAG &DAG) { // Clear the upper (64 - VT.sizeInBits) bits. @@ -938,12 +1110,16 @@ MipsSETargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SDValue Val; switch (N->getOpcode()) { + case ISD::ADDE: + return performADDECombine(N, DAG, DCI, Subtarget); case ISD::AND: Val = performANDCombine(N, DAG, DCI, Subtarget); break; case ISD::OR: Val = performORCombine(N, DAG, DCI, Subtarget); break; + case ISD::SUBE: + return performSUBECombine(N, DAG, DCI, Subtarget); case ISD::MUL: return performMULCombine(N, DAG, DCI, this); case ISD::SHL: diff --git a/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index dd7707084948..a64d95512a4a 100644 --- a/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -141,9 +141,9 @@ int NVPTXTTIImpl::getArithmeticInstrCost( } } -void NVPTXTTIImpl::getUnrollingPreferences(Loop *L, +void NVPTXTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP) { - BaseT::getUnrollingPreferences(L, UP); + BaseT::getUnrollingPreferences(L, SE, UP); // Enable partial unrolling and runtime unrolling, but reduce the // threshold. 
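With the DSP path removed, ADDE is now selected as a compare-and-add sequence: the carry is recomputed with sltu from the low-word sum and then added into the high word. In plain integer terms, the selected pattern for a 64-bit add built from 32-bit halves is the following standalone model (not the SelectionDAG code itself):

#include <cstdint>

void add64(uint32_t ALo, uint32_t AHi, uint32_t BLo, uint32_t BHi,
           uint32_t &Lo, uint32_t &Hi) {
  Lo = ALo + BLo;            // ADDu (the ADDC half)
  uint32_t Carry = Lo < ALo; // SLTu recovers the carry from the low word
  Hi = AHi + BHi + Carry;    // ADDu + ADDu (the ADDE half)
}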
This partially unrolls small loops which are often diff --git a/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/lib/Target/NVPTX/NVPTXTargetTransformInfo.h index 03075b550429..f987892ba675 100644 --- a/lib/Target/NVPTX/NVPTXTargetTransformInfo.h +++ b/lib/Target/NVPTX/NVPTXTargetTransformInfo.h @@ -61,7 +61,8 @@ public: TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, ArrayRef<const Value *> Args = ArrayRef<const Value *>()); - void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); + void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, + TTI::UnrollingPreferences &UP); }; } // end namespace llvm diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp index 6d7eb786a683..7393f3d7a08a 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp @@ -131,10 +131,11 @@ public: } } - void processFixupValue(const MCAssembler &Asm, const MCFixup &Fixup, - const MCValue &Target, bool &IsResolved) override { + bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target) override { switch ((PPC::Fixups)Fixup.getKind()) { - default: break; + default: + return false; case PPC::fixup_ppc_br24: case PPC::fixup_ppc_br24abs: // If the target symbol has a local entry point we must not attempt @@ -147,10 +148,10 @@ public: // and thus the shift to pack it. unsigned Other = S->getOther() << 2; if ((Other & ELF::STO_PPC64_LOCAL_MASK) != 0) - IsResolved = false; + return true; } } - break; + return false; } } diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h b/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h index ae43e59d3cb1..dce443997ea5 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h +++ b/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h @@ -17,35 +17,31 @@ namespace llvm { namespace PPC { enum Fixups { - // fixup_ppc_br24 - 24-bit PC relative relocation for direct branches like 'b' - // and 'bl'. + // 24-bit PC relative relocation for direct branches like 'b' and 'bl'. fixup_ppc_br24 = FirstTargetFixupKind, - - /// fixup_ppc_brcond14 - 14-bit PC relative relocation for conditional - /// branches. + + /// 14-bit PC relative relocation for conditional branches. fixup_ppc_brcond14, - - /// fixup_ppc_br24abs - 24-bit absolute relocation for direct branches - /// like 'ba' and 'bla'. + + /// 24-bit absolute relocation for direct branches like 'ba' and 'bla'. fixup_ppc_br24abs, - /// fixup_ppc_brcond14abs - 14-bit absolute relocation for conditional - /// branches. + /// 14-bit absolute relocation for conditional branches. fixup_ppc_brcond14abs, - /// fixup_ppc_half16 - A 16-bit fixup corresponding to lo16(_foo) - /// or ha16(_foo) for instrs like 'li' or 'addis'. + /// A 16-bit fixup corresponding to lo16(_foo) or ha16(_foo) for instrs like + /// 'li' or 'addis'. fixup_ppc_half16, - - /// fixup_ppc_half16ds - A 14-bit fixup corresponding to lo16(_foo) with - /// implied 2 zero bits for instrs like 'std'. + + /// A 14-bit fixup corresponding to lo16(_foo) with implied 2 zero bits for + /// instrs like 'std'. fixup_ppc_half16ds, - /// fixup_ppc_nofixup - Not a true fixup, but ties a symbol to a call - /// to __tls_get_addr for the TLS general and local dynamic models, - /// or inserts the thread-pointer register number. + /// Not a true fixup, but ties a symbol to a call to __tls_get_addr for the + /// TLS general and local dynamic models, or inserts the thread-pointer + /// register number. 
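getUnrollingPreferences now also receives ScalarEvolution, as the Hexagon and NVPTX overrides above show. A sketch of a target override using the new signature; MyTTIImpl is a hypothetical class, LLVM headers are assumed, and the policy shown is only an example:

void MyTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                        TTI::UnrollingPreferences &UP) {
  UP.Partial = true;
  // Only bother with runtime unrolling when the trip count is not a
  // compile-time constant.
  if (SE.getSmallConstantTripCount(L) == 0)
    UP.Runtime = true;
}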
fixup_ppc_nofixup, - + // Marker LastTargetFixupKind, NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp index 6d591ca964a6..d5506277ca88 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp @@ -219,11 +219,11 @@ bool PPCMachObjectWriter::recordScatteredRelocation( const MCSymbol *SB = &B->getSymbol(); if (!SB->getFragment()) - report_fatal_error("symbol '" + B->getSymbol().getName() + + report_fatal_error("symbol '" + SB->getName() + "' can not be undefined in a subtraction expression"); // FIXME: is Type correct? see include/llvm/BinaryFormat/MachO.h - Value2 = Writer->getSymbolAddress(B->getSymbol(), Layout); + Value2 = Writer->getSymbolAddress(*SB, Layout); FixedValue -= Writer->getSectionAddress(SB->getFragment()->getParent()); } // FIXME: does FixedValue get used?? diff --git a/lib/Target/PowerPC/PPC.h b/lib/Target/PowerPC/PPC.h index 07c9c1f9f84c..ad92ac8ce120 100644 --- a/lib/Target/PowerPC/PPC.h +++ b/lib/Target/PowerPC/PPC.h @@ -15,6 +15,7 @@ #ifndef LLVM_LIB_TARGET_POWERPC_PPC_H #define LLVM_LIB_TARGET_POWERPC_PPC_H +#include "llvm/Support/CodeGen.h" #include "MCTargetDesc/PPCMCTargetDesc.h" // GCC #defines PPC on Linux but we use it as our namespace name @@ -28,7 +29,7 @@ namespace llvm { class AsmPrinter; class MCInst; - FunctionPass *createPPCCTRLoops(PPCTargetMachine &TM); + FunctionPass *createPPCCTRLoops(); #ifndef NDEBUG FunctionPass *createPPCCTRLoopsVerify(); #endif @@ -41,7 +42,7 @@ namespace llvm { FunctionPass *createPPCMIPeepholePass(); FunctionPass *createPPCBranchSelectionPass(); FunctionPass *createPPCQPXLoadSplatPass(); - FunctionPass *createPPCISelDag(PPCTargetMachine &TM); + FunctionPass *createPPCISelDag(PPCTargetMachine &TM, CodeGenOpt::Level OL); FunctionPass *createPPCTLSDynamicCallPass(); FunctionPass *createPPCBoolRetToIntPass(); FunctionPass *createPPCExpandISELPass(); @@ -51,6 +52,7 @@ namespace llvm { void initializePPCVSXFMAMutatePass(PassRegistry&); void initializePPCBoolRetToIntPass(PassRegistry&); void initializePPCExpandISELPass(PassRegistry &); + void initializePPCTLSDynamicCallPass(PassRegistry &); extern char &PPCVSXFMAMutateID; namespace PPCII { diff --git a/lib/Target/PowerPC/PPCCTRLoops.cpp b/lib/Target/PowerPC/PPCCTRLoops.cpp index 24bc027f8106..094d3e6a61b5 100644 --- a/lib/Target/PowerPC/PPCCTRLoops.cpp +++ b/lib/Target/PowerPC/PPCCTRLoops.cpp @@ -24,12 +24,14 @@ //===----------------------------------------------------------------------===// #include "PPC.h" +#include "PPCSubtarget.h" #include "PPCTargetMachine.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" @@ -81,10 +83,7 @@ namespace { public: static char ID; - PPCCTRLoops() : FunctionPass(ID), TM(nullptr) { - initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry()); - } - PPCCTRLoops(PPCTargetMachine &TM) : FunctionPass(ID), TM(&TM) { + PPCCTRLoops() : FunctionPass(ID) { initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry()); } @@ -99,16 +98,18 @@ namespace { } private: - bool mightUseCTR(const Triple &TT, BasicBlock *BB); + bool mightUseCTR(BasicBlock *BB); bool 
convertToCTRLoop(Loop *L); private: - PPCTargetMachine *TM; + const PPCTargetMachine *TM; + const PPCSubtarget *STI; + const PPCTargetLowering *TLI; + const DataLayout *DL; + const TargetLibraryInfo *LibInfo; LoopInfo *LI; ScalarEvolution *SE; - const DataLayout *DL; DominatorTree *DT; - const TargetLibraryInfo *LibInfo; bool PreserveLCSSA; }; @@ -149,9 +150,7 @@ INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_END(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops", false, false) -FunctionPass *llvm::createPPCCTRLoops(PPCTargetMachine &TM) { - return new PPCCTRLoops(TM); -} +FunctionPass *llvm::createPPCCTRLoops() { return new PPCCTRLoops(); } #ifndef NDEBUG INITIALIZE_PASS_BEGIN(PPCCTRLoopsVerify, "ppc-ctr-loops-verify", @@ -169,6 +168,14 @@ bool PPCCTRLoops::runOnFunction(Function &F) { if (skipFunction(F)) return false; + auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); + if (!TPC) + return false; + + TM = &TPC->getTM<PPCTargetMachine>(); + STI = TM->getSubtargetImpl(F); + TLI = STI->getTargetLowering(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); @@ -198,8 +205,7 @@ static bool isLargeIntegerTy(bool Is32Bit, Type *Ty) { // Determining the address of a TLS variable results in a function call in // certain TLS models. -static bool memAddrUsesCTR(const PPCTargetMachine *TM, - const Value *MemAddr) { +static bool memAddrUsesCTR(const PPCTargetMachine &TM, const Value *MemAddr) { const auto *GV = dyn_cast<GlobalValue>(MemAddr); if (!GV) { // Recurse to check for constants that refer to TLS global variables. @@ -213,35 +219,35 @@ static bool memAddrUsesCTR(const PPCTargetMachine *TM, if (!GV->isThreadLocal()) return false; - if (!TM) - return true; - TLSModel::Model Model = TM->getTLSModel(GV); + TLSModel::Model Model = TM.getTLSModel(GV); return Model == TLSModel::GeneralDynamic || Model == TLSModel::LocalDynamic; } -bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) { +// Loop through the inline asm constraints and look for something that clobbers +// ctr. +static bool asmClobbersCTR(InlineAsm *IA) { + InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints(); + for (unsigned i = 0, ie = CIV.size(); i < ie; ++i) { + InlineAsm::ConstraintInfo &C = CIV[i]; + if (C.Type != InlineAsm::isInput) + for (unsigned j = 0, je = C.Codes.size(); j < je; ++j) + if (StringRef(C.Codes[j]).equals_lower("{ctr}")) + return true; + } + return false; +} + +bool PPCCTRLoops::mightUseCTR(BasicBlock *BB) { for (BasicBlock::iterator J = BB->begin(), JE = BB->end(); J != JE; ++J) { if (CallInst *CI = dyn_cast<CallInst>(J)) { + // Inline ASM is okay, unless it clobbers the ctr register. if (InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue())) { - // Inline ASM is okay, unless it clobbers the ctr register. - InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints(); - for (unsigned i = 0, ie = CIV.size(); i < ie; ++i) { - InlineAsm::ConstraintInfo &C = CIV[i]; - if (C.Type != InlineAsm::isInput) - for (unsigned j = 0, je = C.Codes.size(); j < je; ++j) - if (StringRef(C.Codes[j]).equals_lower("{ctr}")) - return true; - } - + if (asmClobbersCTR(IA)) + return true; continue; } - if (!TM) - return true; - const TargetLowering *TLI = - TM->getSubtargetImpl(*BB->getParent())->getTargetLowering(); - if (Function *F = CI->getCalledFunction()) { // Most intrinsics don't become function calls, but some might. 
// sin, cos, exp and log are always calls. @@ -380,9 +386,8 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) { } if (Opcode) { - auto &DL = CI->getModule()->getDataLayout(); - MVT VTy = TLI->getSimpleValueType(DL, CI->getArgOperand(0)->getType(), - true); + MVT VTy = TLI->getSimpleValueType( + *DL, CI->getArgOperand(0)->getType(), true); if (VTy == MVT::Other) return true; @@ -406,17 +411,17 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) { CastInst *CI = cast<CastInst>(J); if (CI->getSrcTy()->getScalarType()->isPPC_FP128Ty() || CI->getDestTy()->getScalarType()->isPPC_FP128Ty() || - isLargeIntegerTy(TT.isArch32Bit(), CI->getSrcTy()->getScalarType()) || - isLargeIntegerTy(TT.isArch32Bit(), CI->getDestTy()->getScalarType())) + isLargeIntegerTy(!TM->isPPC64(), CI->getSrcTy()->getScalarType()) || + isLargeIntegerTy(!TM->isPPC64(), CI->getDestTy()->getScalarType())) return true; - } else if (isLargeIntegerTy(TT.isArch32Bit(), + } else if (isLargeIntegerTy(!TM->isPPC64(), J->getType()->getScalarType()) && (J->getOpcode() == Instruction::UDiv || J->getOpcode() == Instruction::SDiv || J->getOpcode() == Instruction::URem || J->getOpcode() == Instruction::SRem)) { return true; - } else if (TT.isArch32Bit() && + } else if (!TM->isPPC64() && isLargeIntegerTy(false, J->getType()->getScalarType()) && (J->getOpcode() == Instruction::Shl || J->getOpcode() == Instruction::AShr || @@ -428,16 +433,11 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) { // On PowerPC, indirect jumps use the counter register. return true; } else if (SwitchInst *SI = dyn_cast<SwitchInst>(J)) { - if (!TM) - return true; - const TargetLowering *TLI = - TM->getSubtargetImpl(*BB->getParent())->getTargetLowering(); - if (SI->getNumCases() + 1 >= (unsigned)TLI->getMinimumJumpTableEntries()) return true; } - if (TM->getSubtargetImpl(*BB->getParent())->getTargetLowering()->useSoftFloat()) { + if (STI->useSoftFloat()) { switch(J->getOpcode()) { case Instruction::FAdd: case Instruction::FSub: @@ -456,7 +456,7 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) { } for (Value *Operand : J->operands()) - if (memAddrUsesCTR(TM, Operand)) + if (memAddrUsesCTR(*TM, Operand)) return true; } @@ -466,11 +466,6 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) { bool PPCCTRLoops::convertToCTRLoop(Loop *L) { bool MadeChange = false; - const Triple TT = - Triple(L->getHeader()->getParent()->getParent()->getTargetTriple()); - if (!TT.isArch32Bit() && !TT.isArch64Bit()) - return MadeChange; // Unknown arch. type. - // Process nested loops first. for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) { MadeChange |= convertToCTRLoop(*I); @@ -495,7 +490,7 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) { // want to use the counter register if the loop contains calls. for (Loop::block_iterator I = L->block_begin(), IE = L->block_end(); I != IE; ++I) - if (mightUseCTR(TT, *I)) + if (mightUseCTR(*I)) return MadeChange; SmallVector<BasicBlock*, 4> ExitingBlocks; @@ -517,7 +512,7 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) { } else if (!SE->isLoopInvariant(EC, L)) continue; - if (SE->getTypeSizeInBits(EC->getType()) > (TT.isArch64Bit() ? 64 : 32)) + if (SE->getTypeSizeInBits(EC->getType()) > (TM->isPPC64() ? 
64 : 32)) continue; // We now have a loop-invariant count of loop iterations (which is not the @@ -571,7 +566,7 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) { // preheader, then we can use it (except if the preheader contains a use of // the CTR register because some such uses might be reordered by the // selection DAG after the mtctr instruction). - if (!Preheader || mightUseCTR(TT, Preheader)) + if (!Preheader || mightUseCTR(Preheader)) Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA); if (!Preheader) return MadeChange; @@ -582,10 +577,9 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) { // selected branch. MadeChange = true; - SCEVExpander SCEVE(*SE, Preheader->getModule()->getDataLayout(), "loopcnt"); + SCEVExpander SCEVE(*SE, *DL, "loopcnt"); LLVMContext &C = SE->getContext(); - Type *CountType = TT.isArch64Bit() ? Type::getInt64Ty(C) : - Type::getInt32Ty(C); + Type *CountType = TM->isPPC64() ? Type::getInt64Ty(C) : Type::getInt32Ty(C); if (!ExitCount->getType()->isPointerTy() && ExitCount->getType() != CountType) ExitCount = SE->getZeroExtendExpr(ExitCount, CountType); diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index afd2e87078a9..535b9deaefac 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -114,8 +114,8 @@ namespace { unsigned GlobalBaseReg; public: - explicit PPCDAGToDAGISel(PPCTargetMachine &tm) - : SelectionDAGISel(tm), TM(tm) {} + explicit PPCDAGToDAGISel(PPCTargetMachine &tm, CodeGenOpt::Level OptLevel) + : SelectionDAGISel(tm, OptLevel), TM(tm) {} bool runOnMachineFunction(MachineFunction &MF) override { // Make sure we re-emit a set of the global base reg if necessary @@ -5116,6 +5116,7 @@ void PPCDAGToDAGISel::PeepholePPC64() { /// createPPCISelDag - This pass converts a legalized DAG into a /// PowerPC-specific DAG, ready for instruction scheduling. /// -FunctionPass *llvm::createPPCISelDag(PPCTargetMachine &TM) { - return new PPCDAGToDAGISel(TM); +FunctionPass *llvm::createPPCISelDag(PPCTargetMachine &TM, + CodeGenOpt::Level OptLevel) { + return new PPCDAGToDAGISel(TM, OptLevel); } diff --git a/lib/Target/PowerPC/PPCTLSDynamicCall.cpp b/lib/Target/PowerPC/PPCTLSDynamicCall.cpp index 31c50785c2ee..5f8085f4626e 100644 --- a/lib/Target/PowerPC/PPCTLSDynamicCall.cpp +++ b/lib/Target/PowerPC/PPCTLSDynamicCall.cpp @@ -52,6 +52,7 @@ namespace { protected: bool processBlock(MachineBasicBlock &MBB) { bool Changed = false; + bool NeedFence = true; bool Is64Bit = MBB.getParent()->getSubtarget<PPCSubtarget>().isPPC64(); for (MachineBasicBlock::iterator I = MBB.begin(), IE = MBB.end(); @@ -62,6 +63,16 @@ protected: MI.getOpcode() != PPC::ADDItlsldLADDR && MI.getOpcode() != PPC::ADDItlsgdLADDR32 && MI.getOpcode() != PPC::ADDItlsldLADDR32) { + + // Although we create ADJCALLSTACKDOWN and ADJCALLSTACKUP + // as scheduling fences, we skip creating fences if we already + // have existing ADJCALLSTACKDOWN/UP to avoid nesting, + // which causes verification error with -verify-machineinstrs. + if (MI.getOpcode() == PPC::ADJCALLSTACKDOWN) + NeedFence = false; + else if (MI.getOpcode() == PPC::ADJCALLSTACKUP) + NeedFence = true; + ++I; continue; } @@ -96,11 +107,15 @@ protected: break; } - // Don't really need to save data to the stack - the clobbered + // We create ADJCALLSTACKUP and ADJCALLSTACKDOWN around _tls_get_addr + // as schduling fence to avoid it is scheduled before + // mflr in the prologue and the address in LR is clobbered (PR25839). 
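The NeedFence flag introduced in the PPCTLSDynamicCall hunk above skips emitting ADJCALLSTACKDOWN/UP around the expanded __tls_get_addr call when an enclosing pair is already present, so the scheduling fences never nest. A minimal standalone sketch of that bookkeeping, with a made-up opcode enum standing in for the real PPC pseudo-instructions:

#include <cassert>
#include <vector>

// Hypothetical stand-ins for the PPC opcodes; only the fence-tracking
// pattern from the hunk above is illustrated.
enum Opcode { AdjCallStackDown, AdjCallStackUp, TlsPseudo, Other };

// Returns how many DOWN/UP fence pairs would be emitted for the block.
static int countEmittedFences(const std::vector<Opcode> &Block) {
  bool NeedFence = true; // no enclosing ADJCALLSTACKDOWN/UP seen yet
  int Emitted = 0;
  for (Opcode Op : Block) {
    if (Op == AdjCallStackDown)
      NeedFence = false;          // already inside a fenced region
    else if (Op == AdjCallStackUp)
      NeedFence = true;           // left the fenced region
    else if (Op == TlsPseudo && NeedFence)
      ++Emitted;                  // would wrap the expanded call in DOWN/UP
  }
  return Emitted;
}

int main() {
  // A TLS pseudo inside an existing DOWN/UP pair gets no extra fence...
  assert(countEmittedFences({AdjCallStackDown, TlsPseudo, AdjCallStackUp}) == 0);
  // ...while a bare TLS pseudo does.
  assert(countEmittedFences({Other, TlsPseudo, Other}) == 1);
  return 0;
}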
+ // We don't really need to save data to the stack - the clobbered // registers are already saved when the SDNode (e.g. PPCaddiTlsgdLAddr) // gets translated to the pseudo instruction (e.g. ADDItlsgdLADDR). - BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKDOWN)).addImm(0) - .addImm(0); + if (NeedFence) + BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKDOWN)).addImm(0) + .addImm(0); // Expand into two ops built prior to the existing instruction. MachineInstr *Addi = BuildMI(MBB, I, DL, TII->get(Opc1), GPR3) @@ -116,7 +131,8 @@ protected: .addReg(GPR3)); Call->addOperand(MI.getOperand(3)); - BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKUP)).addImm(0).addImm(0); + if (NeedFence) + BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKUP)).addImm(0).addImm(0); BuildMI(MBB, I, DL, TII->get(TargetOpcode::COPY), OutReg) .addReg(GPR3); diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp index a88a6541e8d0..fe092cc3b858 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -93,6 +93,7 @@ extern "C" void LLVMInitializePowerPCTarget() { PassRegistry &PR = *PassRegistry::getPassRegistry(); initializePPCBoolRetToIntPass(PR); initializePPCExpandISELPass(PR); + initializePPCTLSDynamicCallPass(PR); } /// Return the datalayout string of a subtarget. @@ -336,7 +337,7 @@ bool PPCPassConfig::addPreISel() { addPass(createPPCLoopPreIncPrepPass(getPPCTargetMachine())); if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None) - addPass(createPPCCTRLoops(getPPCTargetMachine())); + addPass(createPPCCTRLoops()); return false; } @@ -352,7 +353,7 @@ bool PPCPassConfig::addILPOpts() { bool PPCPassConfig::addInstSelector() { // Install an instruction selector. - addPass(createPPCISelDag(getPPCTargetMachine())); + addPass(createPPCISelDag(getPPCTargetMachine(), getOptLevel())); #ifndef NDEBUG if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None) diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h index 5eb6ba785d1b..2dc3828334ac 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.h +++ b/lib/Target/PowerPC/PPCTargetMachine.h @@ -41,6 +41,7 @@ public: ~PPCTargetMachine() override; const PPCSubtarget *getSubtargetImpl(const Function &F) const override; + const PPCSubtarget *getSubtargetImpl() const = delete; // Pass Pipeline Configuration TargetPassConfig *createPassConfig(PassManagerBase &PM) override; diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index 3dbd5f5b9a92..6110706b01b9 100644 --- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -189,7 +189,7 @@ int PPCTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, return PPCTTIImpl::getIntImmCost(Imm, Ty); } -void PPCTTIImpl::getUnrollingPreferences(Loop *L, +void PPCTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP) { if (ST->getDarwinDirective() == PPC::DIR_A2) { // The A2 is in-order with a deep pipeline, and concatenation unrolling @@ -201,7 +201,7 @@ void PPCTTIImpl::getUnrollingPreferences(Loop *L, UP.AllowExpensiveTripCount = true; } - BaseT::getUnrollingPreferences(L, UP); + BaseT::getUnrollingPreferences(L, SE, UP); } bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) { diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.h b/lib/Target/PowerPC/PPCTargetTransformInfo.h index 758c335def08..99ca6394d1be 100644 --- 
a/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -52,7 +52,8 @@ public: Type *Ty); TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth); - void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); + void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, + TTI::UnrollingPreferences &UP); /// @} diff --git a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp index c72b47b09085..d4454c271f5a 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp +++ b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp @@ -203,13 +203,14 @@ namespace { return InfosBE[Kind - FirstTargetFixupKind]; } - void processFixupValue(const MCAssembler &Asm, const MCFixup &Fixup, - const MCValue &Target, bool &IsResolved) override { + bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target) override { switch ((Sparc::Fixups)Fixup.getKind()) { - default: break; + default: + return false; case Sparc::fixup_sparc_wplt30: if (Target.getSymA()->getSymbol().isTemporary()) - return; + return false; case Sparc::fixup_sparc_tls_gd_hi22: case Sparc::fixup_sparc_tls_gd_lo10: case Sparc::fixup_sparc_tls_gd_add: @@ -227,7 +228,8 @@ namespace { case Sparc::fixup_sparc_tls_ie_ldx: case Sparc::fixup_sparc_tls_ie_add: case Sparc::fixup_sparc_tls_le_hix22: - case Sparc::fixup_sparc_tls_le_lox10: IsResolved = false; break; + case Sparc::fixup_sparc_tls_le_lox10: + return true; } } diff --git a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp index ad05779a9f64..ee23692ad1db 100644 --- a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp +++ b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp @@ -61,6 +61,7 @@ enum RegisterKind { VR64Reg, VR128Reg, AR32Reg, + CR64Reg, }; enum MemoryKind { @@ -343,6 +344,7 @@ public: bool isVF128() const { return false; } bool isVR128() const { return isReg(VR128Reg); } bool isAR32() const { return isReg(AR32Reg); } + bool isCR64() const { return isReg(CR64Reg); } bool isAnyReg() const { return (isReg() || isImm(0, 15)); } bool isBDAddr32Disp12() const { return isMemDisp12(BDMem, ADDR32Reg); } bool isBDAddr32Disp20() const { return isMemDisp20(BDMem, ADDR32Reg); } @@ -379,7 +381,8 @@ private: RegGR, RegFP, RegV, - RegAR + RegAR, + RegCR }; struct Register { RegisterGroup Group; @@ -487,6 +490,9 @@ public: OperandMatchResultTy parseAR32(OperandVector &Operands) { return parseRegister(Operands, RegAR, SystemZMC::AR32Regs, AR32Reg); } + OperandMatchResultTy parseCR64(OperandVector &Operands) { + return parseRegister(Operands, RegCR, SystemZMC::CR64Regs, CR64Reg); + } OperandMatchResultTy parseAnyReg(OperandVector &Operands) { return parseAnyRegister(Operands); } @@ -648,6 +654,8 @@ bool SystemZAsmParser::parseRegister(Register &Reg) { Reg.Group = RegV; else if (Prefix == 'a' && Reg.Num < 16) Reg.Group = RegAR; + else if (Prefix == 'c' && Reg.Num < 16) + Reg.Group = RegCR; else return Error(Reg.StartLoc, "invalid register"); @@ -741,6 +749,10 @@ SystemZAsmParser::parseAnyRegister(OperandVector &Operands) { Kind = AR32Reg; RegNo = SystemZMC::AR32Regs[Reg.Num]; } + else if (Reg.Group == RegCR) { + Kind = CR64Reg; + RegNo = SystemZMC::CR64Regs[Reg.Num]; + } else { return MatchOperand_ParseFail; } @@ -1056,6 +1068,8 @@ bool SystemZAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, RegNo = SystemZMC::VR128Regs[Reg.Num]; else if (Reg.Group == RegAR) RegNo = SystemZMC::AR32Regs[Reg.Num]; + else 
if (Reg.Group == RegCR) + RegNo = SystemZMC::CR64Regs[Reg.Num]; StartLoc = Reg.StartLoc; EndLoc = Reg.EndLoc; return false; diff --git a/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp index 27fd70bc6092..8903b57ffd0b 100644 --- a/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp +++ b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp @@ -162,6 +162,12 @@ static DecodeStatus DecodeAR32BitRegisterClass(MCInst &Inst, uint64_t RegNo, return decodeRegisterClass(Inst, RegNo, SystemZMC::AR32Regs, 16); } +static DecodeStatus DecodeCR64BitRegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + return decodeRegisterClass(Inst, RegNo, SystemZMC::CR64Regs, 16); +} + template<unsigned N> static DecodeStatus decodeUImmOperand(MCInst &Inst, uint64_t Imm) { if (!isUInt<N>(Imm)) diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp index dfea7e33fa15..727ab921daf9 100644 --- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp +++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp @@ -116,6 +116,13 @@ const unsigned SystemZMC::AR32Regs[16] = { SystemZ::A12, SystemZ::A13, SystemZ::A14, SystemZ::A15 }; +const unsigned SystemZMC::CR64Regs[16] = { + SystemZ::C0, SystemZ::C1, SystemZ::C2, SystemZ::C3, + SystemZ::C4, SystemZ::C5, SystemZ::C6, SystemZ::C7, + SystemZ::C8, SystemZ::C9, SystemZ::C10, SystemZ::C11, + SystemZ::C12, SystemZ::C13, SystemZ::C14, SystemZ::C15 +}; + unsigned SystemZMC::getFirstReg(unsigned Reg) { static unsigned Map[SystemZ::NUM_TARGET_REGS]; static bool Initialized = false; diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h index d9926c7e4986..dbca3485290a 100644 --- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h +++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h @@ -55,6 +55,7 @@ extern const unsigned VR32Regs[32]; extern const unsigned VR64Regs[32]; extern const unsigned VR128Regs[32]; extern const unsigned AR32Regs[16]; +extern const unsigned CR64Regs[16]; // Return the 0-based number of the first architectural register that // contains the given LLVM register. E.g. R1D -> 1. diff --git a/lib/Target/SystemZ/README.txt b/lib/Target/SystemZ/README.txt index 74cf653b9d95..9b714157550d 100644 --- a/lib/Target/SystemZ/README.txt +++ b/lib/Target/SystemZ/README.txt @@ -67,6 +67,11 @@ We don't use ICM, STCM, or CLM. -- +We don't use ADD (LOGICAL) HIGH, SUBTRACT (LOGICAL) HIGH, +or COMPARE (LOGICAL) HIGH yet. + +-- + DAGCombiner doesn't yet fold truncations of extended loads. 
Functions like: unsigned long f (unsigned long x, unsigned short *y) diff --git a/lib/Target/SystemZ/SystemZ.td b/lib/Target/SystemZ/SystemZ.td index c5f324418da5..41300a1b6295 100644 --- a/lib/Target/SystemZ/SystemZ.td +++ b/lib/Target/SystemZ/SystemZ.td @@ -56,6 +56,7 @@ include "SystemZInstrVector.td" include "SystemZInstrFP.td" include "SystemZInstrHFP.td" include "SystemZInstrDFP.td" +include "SystemZInstrSystem.td" def SystemZInstrInfo : InstrInfo {} diff --git a/lib/Target/SystemZ/SystemZFeatures.td b/lib/Target/SystemZ/SystemZFeatures.td index ffb0b8d1c861..c5faa0d62881 100644 --- a/lib/Target/SystemZ/SystemZFeatures.td +++ b/lib/Target/SystemZ/SystemZFeatures.td @@ -68,11 +68,21 @@ def FeaturePopulationCount : SystemZFeature< "Assume that the population-count facility is installed" >; +def FeatureMessageSecurityAssist3 : SystemZFeature< + "message-security-assist-extension3", "MessageSecurityAssist3", + "Assume that the message-security-assist extension facility 3 is installed" +>; + def FeatureMessageSecurityAssist4 : SystemZFeature< "message-security-assist-extension4", "MessageSecurityAssist4", "Assume that the message-security-assist extension facility 4 is installed" >; +def FeatureResetReferenceBitsMultiple : SystemZFeature< + "reset-reference-bits-multiple", "ResetReferenceBitsMultiple", + "Assume that the reset-reference-bits-multiple facility is installed" +>; + def Arch9NewFeatures : SystemZFeatureList<[ FeatureDistinctOps, FeatureFastSerialization, @@ -81,7 +91,9 @@ def Arch9NewFeatures : SystemZFeatureList<[ FeatureInterlockedAccess1, FeatureLoadStoreOnCond, FeaturePopulationCount, - FeatureMessageSecurityAssist4 + FeatureMessageSecurityAssist3, + FeatureMessageSecurityAssist4, + FeatureResetReferenceBitsMultiple ]>; //===----------------------------------------------------------------------===// @@ -120,13 +132,19 @@ def FeatureDFPZonedConversion : SystemZFeature< "Assume that the DFP zoned-conversion facility is installed" >; +def FeatureEnhancedDAT2 : SystemZFeature< + "enhanced-dat-2", "EnhancedDAT2", + "Assume that the enhanced-DAT facility 2 is installed" +>; + def Arch10NewFeatures : SystemZFeatureList<[ FeatureExecutionHint, FeatureLoadAndTrap, FeatureMiscellaneousExtensions, FeatureProcessorAssist, FeatureTransactionalExecution, - FeatureDFPZonedConversion + FeatureDFPZonedConversion, + FeatureEnhancedDAT2 ]>; //===----------------------------------------------------------------------===// diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td index 5f6115ed86a4..7620e06ccbc9 100644 --- a/lib/Target/SystemZ/SystemZInstrFormats.td +++ b/lib/Target/SystemZ/SystemZInstrFormats.td @@ -2468,6 +2468,14 @@ class UnaryRRE<string mnemonic, bits<16> opcode, SDPatternOperator operator, let OpType = "reg"; } +class UnaryTiedRRE<string mnemonic, bits<16> opcode, RegisterOperand cls> + : InstRRE<opcode, (outs cls:$R1), (ins cls:$R1src), + mnemonic#"\t$R1", []> { + let Constraints = "$R1 = $R1src"; + let DisableEncoding = "$R1src"; + let R2 = 0; +} + class UnaryMemRRFc<string mnemonic, bits<16> opcode, RegisterOperand cls1, RegisterOperand cls2> : InstRRFc<opcode, (outs cls2:$R2, cls1:$R1), (ins cls1:$R1src), @@ -2702,6 +2710,26 @@ class SideEffectBinaryRILPC<string mnemonic, bits<12> opcode, let AddedComplexity = 7; } +class SideEffectBinaryRRE<string mnemonic, bits<16> opcode, + RegisterOperand cls1, RegisterOperand cls2> + : InstRRE<opcode, (outs), (ins cls1:$R1, cls2:$R2), + mnemonic#"\t$R1, $R2", []>; + +class 
SideEffectBinaryRRFa<string mnemonic, bits<16> opcode, + RegisterOperand cls1, RegisterOperand cls2> + : InstRRFa<opcode, (outs), (ins cls1:$R1, cls2:$R2), + mnemonic#"\t$R1, $R2", []> { + let R3 = 0; + let M4 = 0; +} + +class SideEffectBinaryRRFc<string mnemonic, bits<16> opcode, + RegisterOperand cls1, RegisterOperand cls2> + : InstRRFc<opcode, (outs), (ins cls1:$R1, cls2:$R2), + mnemonic#"\t$R1, $R2", []> { + let M3 = 0; +} + class SideEffectBinaryIE<string mnemonic, bits<16> opcode, Immediate imm1, Immediate imm2> : InstIE<opcode, (outs), (ins imm1:$I1, imm2:$I2), @@ -2729,6 +2757,10 @@ class SideEffectBinarySSf<string mnemonic, bits<8> opcode> : InstSSf<opcode, (outs), (ins bdaddr12only:$BD1, bdladdr12onlylen8:$BDL2), mnemonic##"\t$BD1, $BDL2", []>; +class SideEffectBinarySSE<string mnemonic, bits<16> opcode> + : InstSSE<opcode, (outs), (ins bdaddr12only:$BD1, bdaddr12only:$BD2), + mnemonic#"\t$BD1, $BD2", []>; + class SideEffectBinaryMemMemRR<string mnemonic, bits<8> opcode, RegisterOperand cls1, RegisterOperand cls2> : InstRR<opcode, (outs cls1:$R1, cls2:$R2), (ins cls1:$R1src, cls2:$R2src), @@ -3612,6 +3644,22 @@ class SideEffectTernarySSc<string mnemonic, bits<8> opcode> shift12only:$BD2, imm32zx4:$I3), mnemonic##"\t$BDL1, $BD2, $I3", []>; +class SideEffectTernaryRRFa<string mnemonic, bits<16> opcode, + RegisterOperand cls1, RegisterOperand cls2, + RegisterOperand cls3> + : InstRRFa<opcode, (outs), (ins cls1:$R1, cls2:$R2, cls3:$R3), + mnemonic#"\t$R1, $R2, $R3", []> { + let M4 = 0; +} + +class SideEffectTernaryRRFb<string mnemonic, bits<16> opcode, + RegisterOperand cls1, RegisterOperand cls2, + RegisterOperand cls3> + : InstRRFb<opcode, (outs), (ins cls1:$R1, cls2:$R2, cls3:$R3), + mnemonic#"\t$R1, $R3, $R2", []> { + let M4 = 0; +} + class SideEffectTernaryMemMemMemRRFb<string mnemonic, bits<16> opcode, RegisterOperand cls1, RegisterOperand cls2, @@ -3630,6 +3678,13 @@ class SideEffectTernaryRRFc<string mnemonic, bits<16> opcode, : InstRRFc<opcode, (outs), (ins cls1:$R1, cls2:$R2, imm:$M3), mnemonic#"\t$R1, $R2, $M3", []>; +multiclass SideEffectTernaryRRFcOpt<string mnemonic, bits<16> opcode, + RegisterOperand cls1, + RegisterOperand cls2> { + def "" : SideEffectTernaryRRFc<mnemonic, opcode, cls1, cls2, imm32zx4>; + def Opt : SideEffectBinaryRRFc<mnemonic, opcode, cls1, cls2>; +} + class SideEffectTernaryMemMemRRFc<string mnemonic, bits<16> opcode, RegisterOperand cls1, RegisterOperand cls2, Immediate imm> @@ -3720,6 +3775,18 @@ multiclass TernaryRSPair<string mnemonic, bits<8> rsOpcode, bits<16> rsyOpcode, } } +class SideEffectTernaryRS<string mnemonic, bits<8> opcode, + RegisterOperand cls1, RegisterOperand cls2> + : InstRSa<opcode, (outs), + (ins cls1:$R1, cls2:$R3, bdaddr12only:$BD2), + mnemonic#"\t$R1, $R3, $BD2", []>; + +class SideEffectTernaryRSY<string mnemonic, bits<16> opcode, + RegisterOperand cls1, RegisterOperand cls2> + : InstRSYa<opcode, (outs), + (ins cls1:$R1, cls2:$R3, bdaddr20only:$BD2), + mnemonic#"\t$R1, $R3, $BD2", []>; + class SideEffectTernaryMemMemRS<string mnemonic, bits<8> opcode, RegisterOperand cls1, RegisterOperand cls2> : InstRSa<opcode, (outs cls1:$R1, cls2:$R3), @@ -3997,6 +4064,35 @@ multiclass QuaternaryOptVRRdSPairGeneric<string mnemonic, bits<16> opcode> { VR128:$V4, imm32zx4:$M5, 0)>; } +class SideEffectQuaternaryRRFa<string mnemonic, bits<16> opcode, + RegisterOperand cls1, RegisterOperand cls2, + RegisterOperand cls3> + : InstRRFa<opcode, (outs), (ins cls1:$R1, cls2:$R2, cls3:$R3, imm32zx4:$M4), + mnemonic#"\t$R1, $R2, $R3, $M4", []>; + 
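The *Opt multiclasses defined in this hunk (SideEffectTernaryRRFcOpt here, the quaternary variants just below) give each assembler-only system instruction one record per omitted trailing operand, with the corresponding M field forced to zero via "let M3 = 0;" and friends; SSKE, defined later in this patch, is one user. A small standalone sketch of the same idea, assuming a simplified operand list rather than the real TableGen records:

#include <cassert>
#include <vector>

// Simplified stand-in for an RRF-style instruction with an optional mask:
// when only two operands are written, the mask defaults to 0, mirroring the
// zeroed M field in the Opt variants above.
struct SystemInst {
  unsigned R1;
  unsigned R2;
  unsigned M3;
};

static SystemInst parseOperands(const std::vector<unsigned> &Ops) {
  assert(Ops.size() == 2 || Ops.size() == 3);
  SystemInst I;
  I.R1 = Ops[0];
  I.R2 = Ops[1];
  I.M3 = Ops.size() == 3 ? Ops[2] : 0; // omitted operand encodes as zero
  return I;
}

int main() {
  assert(parseOperands({1, 2}).M3 == 0);    // e.g. "sske %r1, %r2"
  assert(parseOperands({1, 2, 4}).M3 == 4); // e.g. "sske %r1, %r2, 4"
  return 0;
}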
+multiclass SideEffectQuaternaryRRFaOptOpt<string mnemonic, bits<16> opcode, + RegisterOperand cls1, + RegisterOperand cls2, + RegisterOperand cls3> { + def "" : SideEffectQuaternaryRRFa<mnemonic, opcode, cls1, cls2, cls3>; + def Opt : SideEffectTernaryRRFa<mnemonic, opcode, cls1, cls2, cls3>; + def OptOpt : SideEffectBinaryRRFa<mnemonic, opcode, cls1, cls2>; +} + +class SideEffectQuaternaryRRFb<string mnemonic, bits<16> opcode, + RegisterOperand cls1, RegisterOperand cls2, + RegisterOperand cls3> + : InstRRFb<opcode, (outs), (ins cls1:$R1, cls2:$R2, cls3:$R3, imm32zx4:$M4), + mnemonic#"\t$R1, $R3, $R2, $M4", []>; + +multiclass SideEffectQuaternaryRRFbOpt<string mnemonic, bits<16> opcode, + RegisterOperand cls1, + RegisterOperand cls2, + RegisterOperand cls3> { + def "" : SideEffectQuaternaryRRFb<mnemonic, opcode, cls1, cls2, cls3>; + def Opt : SideEffectTernaryRRFb<mnemonic, opcode, cls1, cls2, cls3>; +} + class SideEffectQuaternarySSe<string mnemonic, bits<8> opcode, RegisterOperand cls> : InstSSe<opcode, (outs), @@ -4012,6 +4108,16 @@ class LoadAndOpRSY<string mnemonic, bits<16> opcode, SDPatternOperator operator, let mayStore = 1; } +class CmpSwapRRE<string mnemonic, bits<16> opcode, + RegisterOperand cls1, RegisterOperand cls2> + : InstRRE<opcode, (outs cls1:$R1), (ins cls1:$R1src, cls2:$R2), + mnemonic#"\t$R1, $R2", []> { + let Constraints = "$R1 = $R1src"; + let DisableEncoding = "$R1src"; + let mayLoad = 1; + let mayStore = 1; +} + class CmpSwapRS<string mnemonic, bits<8> opcode, SDPatternOperator operator, RegisterOperand cls, AddressingMode mode = bdaddr12only> : InstRSa<opcode, (outs cls:$R1), (ins cls:$R1src, cls:$R3, mode:$BD2), diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td index 9f5e6288348e..98f66c29ae64 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/lib/Target/SystemZ/SystemZInstrInfo.td @@ -883,6 +883,12 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in { } def AGFR : BinaryRRE<"agfr", 0xB918, null_frag, GR64, GR32>; + // Addition to a high register. + def AHHHR : BinaryRRFa<"ahhhr", 0xB9C8, null_frag, GRH32, GRH32, GRH32>, + Requires<[FeatureHighWord]>; + def AHHLR : BinaryRRFa<"ahhlr", 0xB9D8, null_frag, GRH32, GRH32, GR32>, + Requires<[FeatureHighWord]>; + // Addition of signed 16-bit immediates. defm AHIMux : BinaryRIAndKPseudo<"ahimux", add, GRX32, imm32sx16>; defm AHI : BinaryRIAndK<"ahi", 0xA7A, 0xECD8, add, GR32, imm32sx16>; @@ -917,6 +923,12 @@ let Defs = [CC] in { } def ALGFR : BinaryRRE<"algfr", 0xB91A, null_frag, GR64, GR32>; + // Addition to a high register. + def ALHHHR : BinaryRRFa<"alhhhr", 0xB9CA, null_frag, GRH32, GRH32, GRH32>, + Requires<[FeatureHighWord]>; + def ALHHLR : BinaryRRFa<"alhhlr", 0xB9DA, null_frag, GRH32, GRH32, GR32>, + Requires<[FeatureHighWord]>; + // Addition of signed 16-bit immediates. def ALHSIK : BinaryRIE<"alhsik", 0xECDA, addc, GR32, imm32sx16>, Requires<[FeatureDistinctOps]>; @@ -927,6 +939,10 @@ let Defs = [CC] in { def ALFI : BinaryRIL<"alfi", 0xC2B, addc, GR32, uimm32>; def ALGFI : BinaryRIL<"algfi", 0xC2A, addc, GR64, imm64zx32>; + // Addition of signed 32-bit immediates. + def ALSIH : BinaryRIL<"alsih", 0xCCA, null_frag, GRH32, simm32>, + Requires<[FeatureHighWord]>; + // Addition of memory. 
defm AL : BinaryRXPair<"al", 0x5E, 0xE35E, addc, GR32, load, 4>; def ALGF : BinaryRXY<"algf", 0xE31A, addc, GR64, azextloadi32, 4>; @@ -949,6 +965,10 @@ let Defs = [CC], Uses = [CC] in { def ALCG : BinaryRXY<"alcg", 0xE388, adde, GR64, load, 8>; } +// Addition that does not modify the condition code. +def ALSIHN : BinaryRIL<"alsihn", 0xCCB, null_frag, GRH32, simm32>, + Requires<[FeatureHighWord]>; + //===----------------------------------------------------------------------===// // Subtraction //===----------------------------------------------------------------------===// @@ -961,6 +981,12 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in { def SGFR : BinaryRRE<"sgfr", 0xB919, null_frag, GR64, GR32>; defm SGR : BinaryRREAndK<"sgr", 0xB909, 0xB9E9, sub, GR64, GR64>; + // Subtraction from a high register. + def SHHHR : BinaryRRFa<"shhhr", 0xB9C9, null_frag, GRH32, GRH32, GRH32>, + Requires<[FeatureHighWord]>; + def SHHLR : BinaryRRFa<"shhlr", 0xB9D9, null_frag, GRH32, GRH32, GR32>, + Requires<[FeatureHighWord]>; + // Subtraction of memory. defm SH : BinaryRXPair<"sh", 0x4B, 0xE37B, sub, GR32, asextloadi16, 2>; defm S : BinaryRXPair<"s", 0x5B, 0xE35B, sub, GR32, load, 4>; @@ -976,6 +1002,12 @@ let Defs = [CC] in { def SLGFR : BinaryRRE<"slgfr", 0xB91B, null_frag, GR64, GR32>; defm SLGR : BinaryRREAndK<"slgr", 0xB90B, 0xB9EB, subc, GR64, GR64>; + // Subtraction from a high register. + def SLHHHR : BinaryRRFa<"slhhhr", 0xB9CB, null_frag, GRH32, GRH32, GRH32>, + Requires<[FeatureHighWord]>; + def SLHHLR : BinaryRRFa<"slhhlr", 0xB9DB, null_frag, GRH32, GRH32, GR32>, + Requires<[FeatureHighWord]>; + // Subtraction of unsigned 32-bit immediates. These don't match // subc because we prefer addc for constants. def SLFI : BinaryRIL<"slfi", 0xC25, null_frag, GR32, uimm32>; @@ -1298,6 +1330,12 @@ let Defs = [CC], CCValues = 0xE in { def CGFR : CompareRRE<"cgfr", 0xB930, null_frag, GR64, GR32>; def CGR : CompareRRE<"cgr", 0xB920, z_scmp, GR64, GR64>; + // Comparison with a high register. + def CHHR : CompareRRE<"chhr", 0xB9CD, null_frag, GRH32, GRH32>, + Requires<[FeatureHighWord]>; + def CHLR : CompareRRE<"chlr", 0xB9DD, null_frag, GRH32, GR32>, + Requires<[FeatureHighWord]>; + // Comparison with a signed 16-bit immediate. CHIMux expands to CHI or CIH, // depending on the choice of register. def CHIMux : CompareRIPseudo<z_scmp, GRX32, imm32sx16>, @@ -1344,6 +1382,12 @@ let Defs = [CC], CCValues = 0xE, IsLogical = 1 in { def CLGFR : CompareRRE<"clgfr", 0xB931, null_frag, GR64, GR32>; def CLGR : CompareRRE<"clgr", 0xB921, z_ucmp, GR64, GR64>; + // Comparison with a high register. + def CLHHR : CompareRRE<"clhhr", 0xB9CF, null_frag, GRH32, GRH32>, + Requires<[FeatureHighWord]>; + def CLHLR : CompareRRE<"clhlr", 0xB9DF, null_frag, GRH32, GR32>, + Requires<[FeatureHighWord]>; + // Comparison with an unsigned 32-bit immediate. CLFIMux expands to CLFI // or CLIH, depending on the choice of register. def CLFIMux : CompareRIPseudo<z_ucmp, GRX32, uimm32>, @@ -1888,54 +1932,12 @@ let mayLoad = 1, Defs = [CC] in let mayLoad = 1, mayStore = 1, Defs = [CC, R1D], Uses = [R0L, R1D] in def CMPSC : SideEffectBinaryMemMemRRE<"cmpsc", 0xB263, GR128, GR128>; -// Supervisor call. -let hasSideEffects = 1, isCall = 1, Defs = [CC] in - def SVC : SideEffectUnaryI<"svc", 0x0A, imm32zx8>; - -// Monitor call. -let hasSideEffects = 1, isCall = 1 in - def MC : SideEffectBinarySI<"mc", 0xAF, imm32zx8>; - -// Store clock. 
-let hasSideEffects = 1, Defs = [CC] in { - def STCK : StoreInherentS<"stck", 0xB205, null_frag, 8>; - def STCKF : StoreInherentS<"stckf", 0xB27C, null_frag, 8>; - def STCKE : StoreInherentS<"stcke", 0xB278, null_frag, 16>; -} - -// Store facility list. -let hasSideEffects = 1, Uses = [R0D], Defs = [R0D, CC] in - def STFLE : StoreInherentS<"stfle", 0xB2B0, null_frag, 0>; - -// Extract CPU attribute. -let hasSideEffects = 1 in - def ECAG : BinaryRSY<"ecag", 0xEB4C, null_frag, GR64>; - -// Extract CPU time. -let Defs = [R0D, R1D], hasSideEffects = 1, mayLoad = 1 in - def ECTG : SideEffectTernarySSF<"ectg", 0xC81, GR64>; - -// Extract PSW. -let hasSideEffects = 1, Uses = [CC] in - def EPSW : InherentDualRRE<"epsw", 0xB98D, GR32>; - // Execute. let hasSideEffects = 1 in { def EX : SideEffectBinaryRX<"ex", 0x44, GR64>; def EXRL : SideEffectBinaryRILPC<"exrl", 0xC60, GR64>; } -// Program return. -let hasSideEffects = 1, Defs = [CC] in - def PR : SideEffectInherentE<"pr", 0x0101>; - -// Move with key. -let mayLoad = 1, mayStore = 1, Defs = [CC] in - def MVCK : MemoryBinarySSd<"mvck", 0xD9, GR64>; - -// Store real address. -def STRAG : StoreSSE<"strag", 0xE502>; - //===----------------------------------------------------------------------===// // .insn directive instructions //===----------------------------------------------------------------------===// diff --git a/lib/Target/SystemZ/SystemZInstrSystem.td b/lib/Target/SystemZ/SystemZInstrSystem.td new file mode 100644 index 000000000000..a9803c2d83e9 --- /dev/null +++ b/lib/Target/SystemZ/SystemZInstrSystem.td @@ -0,0 +1,517 @@ +//==- SystemZInstrSystem.td - SystemZ system instructions -*- tblgen-*-----==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// The instructions in this file implement SystemZ system-level instructions. +// Most of these instructions are privileged or semi-privileged. They are +// not used for code generation, but are provided for use with the assembler +// and disassembler only. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Program-Status Word Instructions. +//===----------------------------------------------------------------------===// + +// Extract PSW. +let hasSideEffects = 1, Uses = [CC] in + def EPSW : InherentDualRRE<"epsw", 0xB98D, GR32>; + +// Load PSW (extended). +let hasSideEffects = 1, Defs = [CC], mayLoad = 1 in { + def LPSW : SideEffectUnaryS<"lpsw", 0x8200, null_frag, 8>; + def LPSWE : SideEffectUnaryS<"lpswe", 0xB2B2, null_frag, 16>; +} + +// Insert PSW key. +let Uses = [R2L], Defs = [R2L] in + def IPK : SideEffectInherentS<"ipk", 0xB20B, null_frag>; + +// Set PSW key from address. +let hasSideEffects = 1 in + def SPKA : SideEffectAddressS<"spka", 0xB20A, null_frag>; + +// Set system mask. +let hasSideEffects = 1, mayLoad = 1 in + def SSM : SideEffectUnaryS<"ssm", 0x8000, null_frag, 1>; + +// Store then AND/OR system mask. +let hasSideEffects = 1 in { + def STNSM : StoreSI<"stnsm", 0xAC, null_frag, imm32zx8>; + def STOSM : StoreSI<"stosm", 0xAD, null_frag, imm32zx8>; +} + +// Insert address space control. +let hasSideEffects = 1 in + def IAC : InherentRRE<"iac", 0xB224, GR32, null_frag>; + +// Set address space control (fast). 
+let hasSideEffects = 1 in { + def SAC : SideEffectAddressS<"sac", 0xB219, null_frag>; + def SACF : SideEffectAddressS<"sacf", 0xB279, null_frag>; +} + +//===----------------------------------------------------------------------===// +// Control Register Instructions. +//===----------------------------------------------------------------------===// + +// Load control. +def LCTL : LoadMultipleRS<"lctl", 0xB7, CR64>; +def LCTLG : LoadMultipleRSY<"lctlg", 0xEB2F, CR64>; + +// Store control. +def STCTL : StoreMultipleRS<"stctl", 0xB6, CR64>; +def STCTG : StoreMultipleRSY<"stctg", 0xEB25, CR64>; + +// Extract primary ASN (and instance). +let hasSideEffects = 1 in { + def EPAR : InherentRRE<"epar", 0xB226, GR32, null_frag>; + def EPAIR : InherentRRE<"epair", 0xB99A, GR64, null_frag>; +} + +// Extract secondary ASN (and instance). +let hasSideEffects = 1 in { + def ESAR : InherentRRE<"esar", 0xB227, GR32, null_frag>; + def ESAIR : InherentRRE<"esair", 0xB99B, GR64, null_frag>; +} + +// Set secondary ASN (and instance). +let hasSideEffects = 1 in { + def SSAR : SideEffectUnaryRRE<"ssar", 0xB225, GR32, null_frag>; + def SSAIR : SideEffectUnaryRRE<"ssair", 0xB99F, GR64, null_frag>; +} + +// Extract and set extended authority. +let hasSideEffects = 1 in + def ESEA : UnaryTiedRRE<"esea", 0xB99D, GR32>; + +//===----------------------------------------------------------------------===// +// Prefix-Register Instructions. +//===----------------------------------------------------------------------===// + +// Set prefix. +let hasSideEffects = 1 in + def SPX : SideEffectUnaryS<"spx", 0xB210, null_frag, 4>; + +// Store prefix. +let hasSideEffects = 1 in + def STPX : StoreInherentS<"stpx", 0xB211, null_frag, 4>; + +//===----------------------------------------------------------------------===// +// Storage-Key and Real Memory Instructions. +//===----------------------------------------------------------------------===// + +// Insert storage key extended. +let hasSideEffects = 1 in + def ISKE : BinaryRRE<"iske", 0xB229, null_frag, GR32, GR64>; + +// Insert virtual storage key. +let hasSideEffects = 1 in + def IVSK : BinaryRRE<"ivsk", 0xB223, null_frag, GR32, GR64>; + +// Set storage key extended. +let hasSideEffects = 1, Defs = [CC] in + defm SSKE : SideEffectTernaryRRFcOpt<"sske", 0xB22B, GR32, GR64>; + +// Reset reference bit extended. +let hasSideEffects = 1, Defs = [CC] in + def RRBE : SideEffectBinaryRRE<"rrbe", 0xB22A, GR32, GR64>; + +// Reset reference bits multiple. +let Predicates = [FeatureResetReferenceBitsMultiple], hasSideEffects = 1 in + def RRBM : UnaryRRE<"rrbm", 0xB9AE, null_frag, GR64, GR64>; + +// Perform frame management function. +let hasSideEffects = 1 in + def PFMF : SideEffectBinaryMemRRE<"pfmf", 0xB9AF, GR32, GR64>; + +// Test block. +let hasSideEffects = 1, mayStore = 1, Uses = [R0D], Defs = [R0D, CC] in + def TB : SideEffectBinaryRRE<"tb", 0xB22C, GR64, GR64>; + +// Page in / out. +let mayLoad = 1, mayStore = 1, Defs = [CC] in { + def PGIN : SideEffectBinaryRRE<"pgin", 0xB22E, GR64, GR64>; + def PGOUT : SideEffectBinaryRRE<"pgout", 0xB22F, GR64, GR64>; +} + +//===----------------------------------------------------------------------===// +// Dynamic-Address-Translation Instructions. +//===----------------------------------------------------------------------===// + +// Invalidate page table entry. +let hasSideEffects = 1 in + defm IPTE : SideEffectQuaternaryRRFaOptOpt<"ipte", 0xB221, GR64, GR32, GR32>; + +// Invalidate DAT table entry. 
+let hasSideEffects = 1 in + defm IDTE : SideEffectQuaternaryRRFbOpt<"idte", 0xB98E, GR64, GR64, GR64>; + +// Compare and replace DAT table entry. +let Predicates = [FeatureEnhancedDAT2], hasSideEffects = 1, Defs = [CC] in + defm CRDTE : SideEffectQuaternaryRRFbOpt<"crdte", 0xB98F, GR128, GR128, GR64>; + +// Purge TLB. +let hasSideEffects = 1 in + def PTLB : SideEffectInherentS<"ptlb", 0xB20D, null_frag>; + +// Compare and swap and purge. +let hasSideEffects = 1, Defs = [CC] in { + def CSP : CmpSwapRRE<"csp", 0xB250, GR128, GR64>; + def CSPG : CmpSwapRRE<"cspg", 0xB98A, GR128, GR64>; +} + +// Load page-table-entry address. +let hasSideEffects = 1, Defs = [CC] in + def LPTEA : TernaryRRFb<"lptea", 0xB9AA, GR64, GR64, GR64>; + +// Load real address. +let hasSideEffects = 1, Defs = [CC] in { + defm LRA : LoadAddressRXPair<"lra", 0xB1, 0xE313, null_frag>; + def LRAG : LoadAddressRXY<"lrag", 0xE303, null_frag, laaddr20pair>; +} + +// Store real address. +def STRAG : StoreSSE<"strag", 0xE502>; + +// Load using real address. +let mayLoad = 1 in { + def LURA : UnaryRRE<"lura", 0xB24B, null_frag, GR32, GR64>; + def LURAG : UnaryRRE<"lurag", 0xB905, null_frag, GR64, GR64>; +} + +// Store using real address. +let mayStore = 1 in { + def STURA : SideEffectBinaryRRE<"stura", 0xB246, GR32, GR64>; + def STURG : SideEffectBinaryRRE<"sturg", 0xB925, GR64, GR64>; +} + +// Test protection. +let hasSideEffects = 1, Defs = [CC] in + def TPROT : SideEffectBinarySSE<"tprot", 0xE501>; + +//===----------------------------------------------------------------------===// +// Memory-move Instructions. +//===----------------------------------------------------------------------===// + +// Move with key. +let mayLoad = 1, mayStore = 1, Defs = [CC] in + def MVCK : MemoryBinarySSd<"mvck", 0xD9, GR64>; + +// Move to primary / secondary. +let mayLoad = 1, mayStore = 1, Defs = [CC] in { + def MVCP : MemoryBinarySSd<"mvcp", 0xDA, GR64>; + def MVCS : MemoryBinarySSd<"mvcs", 0xDB, GR64>; +} + +// Move with source / destination key. +let mayLoad = 1, mayStore = 1, Uses = [R0L, R1L] in { + def MVCSK : SideEffectBinarySSE<"mvcsk", 0xE50E>; + def MVCDK : SideEffectBinarySSE<"mvcdk", 0xE50F>; +} + +// Move with optional specifications. +let mayLoad = 1, mayStore = 1, Uses = [R0L] in + def MVCOS : SideEffectTernarySSF<"mvcos", 0xC80, GR64>; + +// Move page. +let mayLoad = 1, mayStore = 1, Uses = [R0L], Defs = [CC] in + def MVPG : SideEffectBinaryRRE<"mvpg", 0xB254, GR64, GR64>; + +//===----------------------------------------------------------------------===// +// Address-Space Instructions. +//===----------------------------------------------------------------------===// + +// Load address space parameters. +let hasSideEffects = 1, Defs = [CC] in + def LASP : SideEffectBinarySSE<"lasp", 0xE500>; + +// Purge ALB. +let hasSideEffects = 1 in + def PALB : SideEffectInherentRRE<"palb", 0xB248>; + +// Program call. +let hasSideEffects = 1 in + def PC : SideEffectAddressS<"pc", 0xB218, null_frag>; + +// Program return. +let hasSideEffects = 1, Defs = [CC] in + def PR : SideEffectInherentE<"pr", 0x0101>; + +// Program transfer (with instance). +let hasSideEffects = 1 in { + def PT : SideEffectBinaryRRE<"pt", 0xB228, GR32, GR64>; + def PTI : SideEffectBinaryRRE<"pti", 0xB99E, GR64, GR64>; +} + +// Resume program. +let hasSideEffects = 1, Defs = [CC] in + def RP : SideEffectAddressS<"rp", 0xB277, null_frag>; + +// Branch in subspace group. 
+let hasSideEffects = 1 in + def BSG : UnaryRRE<"bsg", 0xB258, null_frag, GR64, GR64>; + +// Branch and set authority. +let hasSideEffects = 1 in + def BSA : UnaryRRE<"bsa", 0xB25A, null_frag, GR64, GR64>; + +// Test access. +let Defs = [CC] in + def TAR : SideEffectBinaryRRE<"tar", 0xB24C, AR32, GR32>; + +//===----------------------------------------------------------------------===// +// Linkage-Stack Instructions. +//===----------------------------------------------------------------------===// + +// Branch and stack. +let hasSideEffects = 1 in + def BAKR : SideEffectBinaryRRE<"bakr", 0xB240, GR64, GR64>; + +// Extract stacked registers. +let hasSideEffects = 1 in { + def EREG : SideEffectBinaryRRE<"ereg", 0xB249, GR32, GR32>; + def EREGG : SideEffectBinaryRRE<"eregg", 0xB90E, GR64, GR64>; +} + +// Extract stacked state. +let hasSideEffects = 1, Defs = [CC] in + def ESTA : UnaryRRE<"esta", 0xB24A, null_frag, GR128, GR32>; + +// Modify stacked state. +let hasSideEffects = 1 in + def MSTA : SideEffectUnaryRRE<"msta", 0xB247, GR128, null_frag>; + +//===----------------------------------------------------------------------===// +// Time-Related Instructions. +//===----------------------------------------------------------------------===// + +// Perform timing facility function. +let hasSideEffects = 1, mayLoad = 1, Uses = [R0L, R1D], Defs = [CC] in + def PTFF : SideEffectInherentE<"ptff", 0x0104>; + +// Set clock. +let hasSideEffects = 1, Defs = [CC] in + def SCK : SideEffectUnaryS<"sck", 0xB204, null_frag, 8>; + +// Set clock programmable field. +let hasSideEffects = 1, Uses = [R0L] in + def SCKPF : SideEffectInherentE<"sckpf", 0x0107>; + +// Set clock comparator. +let hasSideEffects = 1 in + def SCKC : SideEffectUnaryS<"sckc", 0xB206, null_frag, 8>; + +// Set CPU timer. +let hasSideEffects = 1 in + def SPT : SideEffectUnaryS<"spt", 0xB208, null_frag, 8>; + +// Store clock (fast / extended). +let hasSideEffects = 1, Defs = [CC] in { + def STCK : StoreInherentS<"stck", 0xB205, null_frag, 8>; + def STCKF : StoreInherentS<"stckf", 0xB27C, null_frag, 8>; + def STCKE : StoreInherentS<"stcke", 0xB278, null_frag, 16>; +} + +// Store clock comparator. +let hasSideEffects = 1 in + def STCKC : StoreInherentS<"stckc", 0xB207, null_frag, 8>; + +// Store CPU timer. +let hasSideEffects = 1 in + def STPT : StoreInherentS<"stpt", 0xB209, null_frag, 8>; + +//===----------------------------------------------------------------------===// +// CPU-Related Instructions. +//===----------------------------------------------------------------------===// + +// Store CPU address. +let hasSideEffects = 1 in + def STAP : StoreInherentS<"stap", 0xB212, null_frag, 2>; + +// Store CPU ID. +let hasSideEffects = 1 in + def STIDP : StoreInherentS<"stidp", 0xB202, null_frag, 8>; + +// Store system information. +let hasSideEffects = 1, Uses = [R0L, R1L], Defs = [R0L, CC] in + def STSI : StoreInherentS<"stsi", 0xB27D, null_frag, 0>; + +// Store facility list. +let hasSideEffects = 1 in + def STFL : StoreInherentS<"stfl", 0xB2B1, null_frag, 4>; + +// Store facility list extended. +let hasSideEffects = 1, Uses = [R0D], Defs = [R0D, CC] in + def STFLE : StoreInherentS<"stfle", 0xB2B0, null_frag, 0>; + +// Extract CPU attribute. +let hasSideEffects = 1 in + def ECAG : BinaryRSY<"ecag", 0xEB4C, null_frag, GR64>; + +// Extract CPU time. +let hasSideEffects = 1, mayLoad = 1, Defs = [R0D, R1D] in + def ECTG : SideEffectTernarySSF<"ectg", 0xC81, GR64>; + +// Perform topology function. 
+let hasSideEffects = 1 in + def PTF : UnaryTiedRRE<"ptf", 0xB9A2, GR64>; + +// Perform cryptographic key management operation. +let Predicates = [FeatureMessageSecurityAssist3], + hasSideEffects = 1, Uses = [R0L, R1D] in + def PCKMO : SideEffectInherentRRE<"pckmo", 0xB928>; + +//===----------------------------------------------------------------------===// +// Miscellaneous Instructions. +//===----------------------------------------------------------------------===// + +// Supervisor call. +let hasSideEffects = 1, isCall = 1, Defs = [CC] in + def SVC : SideEffectUnaryI<"svc", 0x0A, imm32zx8>; + +// Monitor call. +let hasSideEffects = 1, isCall = 1 in + def MC : SideEffectBinarySI<"mc", 0xAF, imm32zx8>; + +// Diagnose. +let hasSideEffects = 1, isCall = 1 in + def DIAG : SideEffectTernaryRS<"diag", 0x83, GR32, GR32>; + +// Trace. +let hasSideEffects = 1, mayLoad = 1 in { + def TRACE : SideEffectTernaryRS<"trace", 0x99, GR32, GR32>; + def TRACG : SideEffectTernaryRSY<"tracg", 0xEB0F, GR64, GR64>; +} + +// Trap. +let hasSideEffects = 1 in { + def TRAP2 : SideEffectInherentE<"trap2", 0x01FF>; + def TRAP4 : SideEffectAddressS<"trap4", 0xB2FF, null_frag>; +} + +// Signal processor. +let hasSideEffects = 1, Defs = [CC] in + def SIGP : SideEffectTernaryRS<"sigp", 0xAE, GR64, GR64>; + +// Signal adapter. +let hasSideEffects = 1, Uses = [R0D, R1D, R2D, R3D], Defs = [CC] in + def SIGA : SideEffectAddressS<"siga", 0xB274, null_frag>; + +// Start interpretive execution. +let hasSideEffects = 1, Defs = [CC] in + def SIE : SideEffectUnaryS<"sie", 0xB214, null_frag, 0>; + +//===----------------------------------------------------------------------===// +// CPU-Measurement Facility Instructions (SA23-2260). +//===----------------------------------------------------------------------===// + +// Load program parameter +let hasSideEffects = 1 in + def LPP : SideEffectUnaryS<"lpp", 0xB280, null_frag, 8>; + +// Extract coprocessor-group address. +let hasSideEffects = 1, Defs = [CC] in + def ECPGA : UnaryRRE<"ecpga", 0xB2ED, null_frag, GR32, GR64>; + +// Extract CPU counter. +let hasSideEffects = 1, Defs = [CC] in + def ECCTR : UnaryRRE<"ecctr", 0xB2E4, null_frag, GR64, GR64>; + +// Extract peripheral counter. +let hasSideEffects = 1, Defs = [CC] in + def EPCTR : UnaryRRE<"epctr", 0xB2E5, null_frag, GR64, GR64>; + +// Load CPU-counter-set controls. +let hasSideEffects = 1, Defs = [CC] in + def LCCTL : SideEffectUnaryS<"lcctl", 0xB284, null_frag, 8>; + +// Load peripheral-counter-set controls. +let hasSideEffects = 1, Defs = [CC] in + def LPCTL : SideEffectUnaryS<"lpctl", 0xB285, null_frag, 8>; + +// Load sampling controls. +let hasSideEffects = 1, Defs = [CC] in + def LSCTL : SideEffectUnaryS<"lsctl", 0xB287, null_frag, 0>; + +// Query sampling information. +let hasSideEffects = 1 in + def QSI : StoreInherentS<"qsi", 0xB286, null_frag, 0>; + +// Query counter information. +let hasSideEffects = 1 in + def QCTRI : StoreInherentS<"qctri", 0xB28E, null_frag, 0>; + +// Set CPU counter. +let hasSideEffects = 1, Defs = [CC] in + def SCCTR : SideEffectBinaryRRE<"scctr", 0xB2E0, GR64, GR64>; + +// Set peripheral counter. +let hasSideEffects = 1, Defs = [CC] in + def SPCTR : SideEffectBinaryRRE<"spctr", 0xB2E1, GR64, GR64>; + +//===----------------------------------------------------------------------===// +// I/O Instructions (Principles of Operation, Chapter 14). +//===----------------------------------------------------------------------===// + +// Clear subchannel. 
+let hasSideEffects = 1, Uses = [R1L], Defs = [CC] in + def CSCH : SideEffectInherentS<"csch", 0xB230, null_frag>; + +// Halt subchannel. +let hasSideEffects = 1, Uses = [R1L], Defs = [CC] in + def HSCH : SideEffectInherentS<"hsch", 0xB231, null_frag>; + +// Modify subchannel. +let hasSideEffects = 1, Uses = [R1L], Defs = [CC] in + def MSCH : SideEffectUnaryS<"msch", 0xB232, null_frag, 0>; + +// Resume subchannel. +let hasSideEffects = 1, Uses = [R1L], Defs = [CC] in + def RSCH : SideEffectInherentS<"rsch", 0xB238, null_frag>; + +// Start subchannel. +let hasSideEffects = 1, Uses = [R1L], Defs = [CC] in + def SSCH : SideEffectUnaryS<"ssch", 0xB233, null_frag, 0>; + +// Store subchannel. +let hasSideEffects = 1, Uses = [R1L], Defs = [CC] in + def STSCH : StoreInherentS<"stsch", 0xB234, null_frag, 0>; + +// Test subchannel. +let hasSideEffects = 1, Uses = [R1L], Defs = [CC] in + def TSCH : StoreInherentS<"tsch", 0xB235, null_frag, 0>; + +// Cancel subchannel. +let hasSideEffects = 1, Uses = [R1L], Defs = [CC] in + def XSCH : SideEffectInherentS<"xsch", 0xB276, null_frag>; + +// Reset channel path. +let hasSideEffects = 1, Uses = [R1L], Defs = [CC] in + def RCHP : SideEffectInherentS<"rchp", 0xB23B, null_frag>; + +// Set channel monitor. +let hasSideEffects = 1, mayLoad = 1, Uses = [R1L, R2D] in + def SCHM : SideEffectInherentS<"schm", 0xB23C, null_frag>; + +// Store channel path status. +let hasSideEffects = 1 in + def STCPS : StoreInherentS<"stcps", 0xB23A, null_frag, 0>; + +// Store channel report word. +let hasSideEffects = 1, Defs = [CC] in + def STCRW : StoreInherentS<"stcrw", 0xB239, null_frag, 0>; + +// Test pending interruption. +let hasSideEffects = 1, Defs = [CC] in + def TPI : StoreInherentS<"tpi", 0xB236, null_frag, 0>; + +// Set address limit. +let hasSideEffects = 1, Uses = [R1L] in + def SAL : SideEffectInherentS<"sal", 0xB237, null_frag>; + diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.td b/lib/Target/SystemZ/SystemZRegisterInfo.td index 47d2f75cc11a..36809ea81dc1 100644 --- a/lib/Target/SystemZ/SystemZRegisterInfo.td +++ b/lib/Target/SystemZ/SystemZRegisterInfo.td @@ -304,3 +304,13 @@ foreach I = 0-15 in { defm AR32 : SystemZRegClass<"AR32", [i32], 32, (add (sequence "A%u", 0, 15)), 0>; +// Control registers. 
+class CREG64<bits<16> num, string n> : SystemZReg<n> { + let HWEncoding = num; +} +foreach I = 0-15 in { + def C#I : CREG64<I, "c"#I>, DwarfRegNum<[!add(I, 32)]>; +} +defm CR64 : SystemZRegClass<"CR64", [i64], 64, + (add (sequence "C%u", 0, 15)), 0>; + diff --git a/lib/Target/SystemZ/SystemZScheduleZ13.td b/lib/Target/SystemZ/SystemZScheduleZ13.td index 5f5f2f690e58..adc9f2976f87 100644 --- a/lib/Target/SystemZ/SystemZScheduleZ13.td +++ b/lib/Target/SystemZ/SystemZScheduleZ13.td @@ -353,6 +353,9 @@ def : InstRW<[FXa], (instregex "ALGF(I|R)$")>; def : InstRW<[FXa], (instregex "ALGR(K)?$")>; def : InstRW<[FXa], (instregex "ALR(K)?$")>; def : InstRW<[FXa], (instregex "AR(K)?$")>; +def : InstRW<[FXa], (instregex "A(L)?HHHR$")>; +def : InstRW<[FXa, Lat2], (instregex "A(L)?HHLR$")>; +def : InstRW<[FXa], (instregex "ALSIH(N)?$")>; def : InstRW<[FXb, LSU, Lat5], (instregex "A(L)?(G)?SI$")>; // Logical addition with carry @@ -376,6 +379,8 @@ def : InstRW<[FXa], (instregex "SLGF(I|R)$")>; def : InstRW<[FXa], (instregex "SLGR(K)?$")>; def : InstRW<[FXa], (instregex "SLR(K)?$")>; def : InstRW<[FXa], (instregex "SR(K)?$")>; +def : InstRW<[FXa], (instregex "S(L)?HHHR$")>; +def : InstRW<[FXa, Lat2], (instregex "S(L)?HHLR$")>; // Subtraction with borrow def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "SLB(G)?$")>; @@ -506,6 +511,8 @@ def : InstRW<[FXb], (instregex "CLIH$")>; def : InstRW<[FXb, LSU, Lat5], (instregex "CLI(Y)?$")>; def : InstRW<[FXb], (instregex "CLR$")>; def : InstRW<[FXb, LSU, Lat5], (instregex "CLRL$")>; +def : InstRW<[FXb], (instregex "C(L)?HHR$")>; +def : InstRW<[FXb, Lat2], (instregex "C(L)?HLR$")>; // Compare halfword def : InstRW<[FXb, LSU, Lat6], (instregex "CH(Y|RL)?$")>; @@ -701,38 +708,9 @@ def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UPT$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CKSM$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CMPSC$")>; -// Move with key -def : InstRW<[FXa, FXa, FXb, LSU, Lat8, GroupAlone], (instregex "MVCK$")>; - -// Monitor call -def : InstRW<[FXb], (instregex "MC$")>; - -// Extract CPU attribute -def : InstRW<[FXb, Lat30], (instregex "ECAG$")>; - -// Extract CPU Time -def : InstRW<[FXa, Lat5, LSU], (instregex "ECTG$")>; - -// Extract PSW -def : InstRW<[FXb, Lat30], (instregex "EPSW$")>; - // Execute def : InstRW<[FXb, GroupAlone], (instregex "EX(RL)?$")>; -// Program return -def : InstRW<[FXb, Lat30], (instregex "PR$")>; - -// Inline assembly -def : InstRW<[LSU, LSU, LSU, FXa, FXa, FXb, Lat9, GroupAlone], - (instregex "STCK(F)?$")>; -def : InstRW<[LSU, LSU, LSU, LSU, FXa, FXa, FXb, FXb, Lat11, GroupAlone], - (instregex "STCKE$")>; -def : InstRW<[FXa, LSU, Lat5], (instregex "STFLE$")>; -def : InstRW<[FXb, Lat30], (instregex "SVC$")>; - -// Store real address -def : InstRW<[FXb, LSU, Lat5], (instregex "STRAG$")>; - //===----------------------------------------------------------------------===// // .insn directive instructions //===----------------------------------------------------------------------===// @@ -1364,5 +1342,162 @@ def : InstRW<[VecStr, Lat5], (instregex "VSTRC(B|F|H)S$")>; def : InstRW<[VecStr], (instregex "VSTRCZ(B|F|H)$")>; def : InstRW<[VecStr, Lat5], (instregex "VSTRCZ(B|F|H)S$")>; + +// -------------------------------- System ---------------------------------- // + +//===----------------------------------------------------------------------===// +// System: Program-Status Word Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, Lat30], 
(instregex "EPSW$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "LPSW(E)?$")>; +def : InstRW<[FXa, Lat3], (instregex "IPK$")>; +def : InstRW<[LSU], (instregex "SPKA$")>; +def : InstRW<[LSU], (instregex "SSM$")>; +def : InstRW<[FXb], (instregex "ST(N|O)SM$")>; +def : InstRW<[FXa, Lat3], (instregex "IAC$")>; +def : InstRW<[LSU], (instregex "SAC(F)?$")>; + +//===----------------------------------------------------------------------===// +// System: Control Register Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat30], (instregex "LCTL(G)?$")>; +def : InstRW<[LSU, Lat30], (instregex "STCT(L|G)$")>; +def : InstRW<[LSU], (instregex "E(P|S)A(I)?R$")>; +def : InstRW<[FXb, Lat30], (instregex "SSA(I)?R$")>; +def : InstRW<[FXb, Lat30], (instregex "ESEA$")>; + +//===----------------------------------------------------------------------===// +// System: Prefix-Register Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat30], (instregex "SPX$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STPX$")>; + +//===----------------------------------------------------------------------===// +// System: Storage-Key and Real Memory Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, Lat30], (instregex "ISKE$")>; +def : InstRW<[FXb, Lat30], (instregex "IVSK$")>; +def : InstRW<[FXb, Lat30], (instregex "SSKE(Opt)?$")>; +def : InstRW<[FXb, Lat30], (instregex "RRB(E|M)$")>; +def : InstRW<[FXb, Lat30], (instregex "PFMF$")>; +def : InstRW<[FXb, Lat30], (instregex "TB$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "PGIN$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "PGOUT$")>; + +//===----------------------------------------------------------------------===// +// System: Dynamic-Address-Translation Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat30], (instregex "IPTE(Opt)?(Opt)?$")>; +def : InstRW<[FXb, Lat30], (instregex "IDTE(Opt)?$")>; +def : InstRW<[FXb, Lat30], (instregex "CRDTE(Opt)?$")>; +def : InstRW<[FXb, Lat30], (instregex "PTLB$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "CSP(G)?$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "LPTEA$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "LRA(Y|G)?$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STRAG$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "LURA(G)?$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STUR(A|G)$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "TPROT$")>; + +//===----------------------------------------------------------------------===// +// System: Memory-move Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, FXa, FXb, LSU, Lat8, GroupAlone], (instregex "MVC(K|P|S)$")>; +def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "MVC(S|D)K$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "MVCOS$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "MVPG$")>; + +//===----------------------------------------------------------------------===// +// System: Address-Space Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat30], (instregex "LASP$")>; +def : InstRW<[LSU], (instregex "PALB$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "PC$")>; +def : InstRW<[FXb, Lat30], (instregex "PR$")>; +def : InstRW<[FXb, Lat30], 
(instregex "PT(I)?$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "RP$")>; +def : InstRW<[FXb, Lat30], (instregex "BS(G|A)$")>; +def : InstRW<[FXb, Lat20], (instregex "TAR$")>; + +//===----------------------------------------------------------------------===// +// System: Linkage-Stack Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, Lat30], (instregex "BAKR$")>; +def : InstRW<[FXb, Lat30], (instregex "EREG(G)?$")>; +def : InstRW<[FXb, Lat30], (instregex "(E|M)STA$")>; + +//===----------------------------------------------------------------------===// +// System: Time-Related Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, Lat30], (instregex "PTFF$")>; +def : InstRW<[FXb, LSU, Lat20], (instregex "SCK$")>; +def : InstRW<[FXb, Lat30], (instregex "SCKPF$")>; +def : InstRW<[FXb, LSU, Lat20], (instregex "SCKC$")>; +def : InstRW<[LSU, GroupAlone], (instregex "SPT$")>; +def : InstRW<[LSU, LSU, LSU, FXa, FXa, FXb, Lat9, GroupAlone], + (instregex "STCK(F)?$")>; +def : InstRW<[LSU, LSU, LSU, LSU, FXa, FXa, FXb, FXb, Lat11, GroupAlone], + (instregex "STCKE$")>; +def : InstRW<[FXb, LSU, Lat9], (instregex "STCKC$")>; +def : InstRW<[LSU, LSU, FXb, Lat3], (instregex "STPT$")>; + +//===----------------------------------------------------------------------===// +// System: CPU-Related Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat30], (instregex "STAP$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STIDP$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STSI$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STFL(E)?$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "ECAG$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "ECTG$")>; +def : InstRW<[FXb, Lat30], (instregex "PTF$")>; +def : InstRW<[FXb, Lat30], (instregex "PCKMO$")>; + +//===----------------------------------------------------------------------===// +// System: Miscellaneous Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, Lat30], (instregex "SVC$")>; +def : InstRW<[FXb], (instregex "MC$")>; +def : InstRW<[FXb, Lat30], (instregex "DIAG$")>; +def : InstRW<[FXb], (instregex "TRAC(E|G)$")>; +def : InstRW<[FXb, Lat30], (instregex "TRAP(2|4)$")>; +def : InstRW<[FXb, Lat30], (instregex "SIGP$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "SIGA$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "SIE$")>; + +//===----------------------------------------------------------------------===// +// System: CPU-Measurement Facility Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb], (instregex "LPP$")>; +def : InstRW<[FXb, Lat30], (instregex "ECPGA$")>; +def : InstRW<[FXb, Lat30], (instregex "E(C|P)CTR$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "L(C|P|S)CTL$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "Q(S|CTR)I$")>; +def : InstRW<[FXb, Lat30], (instregex "S(C|P)CTR$")>; + +//===----------------------------------------------------------------------===// +// System: I/O Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, Lat30], (instregex "(C|H|R|X)SCH$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "(M|S|ST|T)SCH$")>; +def : InstRW<[FXb, Lat30], (instregex "RCHP$")>; +def : InstRW<[FXb, Lat30], (instregex "SCHM$")>; +def : InstRW<[FXb, 
LSU, Lat30], (instregex "STC(PS|RW)$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "TPI$")>; +def : InstRW<[FXb, Lat30], (instregex "SAL$")>; + } diff --git a/lib/Target/SystemZ/SystemZScheduleZ196.td b/lib/Target/SystemZ/SystemZScheduleZ196.td index 126eac2e2072..128049a09086 100644 --- a/lib/Target/SystemZ/SystemZScheduleZ196.td +++ b/lib/Target/SystemZ/SystemZScheduleZ196.td @@ -310,6 +310,9 @@ def : InstRW<[FXU], (instregex "ALGF(I|R)$")>; def : InstRW<[FXU], (instregex "ALGR(K)?$")>; def : InstRW<[FXU], (instregex "ALR(K)?$")>; def : InstRW<[FXU], (instregex "AR(K)?$")>; +def : InstRW<[FXU], (instregex "A(L)?HHHR$")>; +def : InstRW<[FXU, FXU, Lat3], (instregex "A(L)?HHLR$")>; +def : InstRW<[FXU], (instregex "ALSIH(N)?$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?G(SI)?$")>; // Logical addition with carry @@ -333,6 +336,8 @@ def : InstRW<[FXU], (instregex "SLGF(I|R)$")>; def : InstRW<[FXU], (instregex "SLGR(K)?$")>; def : InstRW<[FXU], (instregex "SLR(K)?$")>; def : InstRW<[FXU], (instregex "SR(K)?$")>; +def : InstRW<[FXU], (instregex "S(L)?HHHR$")>; +def : InstRW<[FXU, FXU, Lat3], (instregex "S(L)?HHLR$")>; // Subtraction with borrow def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "SLB(G)?$")>; @@ -468,6 +473,8 @@ def : InstRW<[FXU], (instregex "CLIH$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "CLI(Y)?$")>; def : InstRW<[FXU], (instregex "CLR$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "CLRL$")>; +def : InstRW<[FXU], (instregex "C(L)?HHR$")>; +def : InstRW<[FXU, FXU, Lat3], (instregex "C(L)?HLR$")>; // Compare halfword def : InstRW<[FXU, LSU, FXU, Lat6, GroupAlone], (instregex "CH(Y|RL)?$")>; @@ -634,37 +641,9 @@ def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UPT$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CKSM$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CMPSC$")>; -// Move with key -def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVCK$")>; - -// Monitor call -def : InstRW<[FXU], (instregex "MC$")>; - -// Extract CPU attribute -def : InstRW<[FXU, Lat30], (instregex "ECAG$")>; - -// Extract CPU Time -def : InstRW<[FXU, Lat5, LSU], (instregex "ECTG$")>; - -// Extract PSW -def : InstRW<[FXU, Lat30], (instregex "EPSW$")>; - // Execute def : InstRW<[LSU, GroupAlone], (instregex "EX(RL)?$")>; -// Program return -def : InstRW<[FXU, Lat30], (instregex "PR$")>; - -// Inline assembly -def : InstRW<[FXU, LSU, Lat15], (instregex "STCK$")>; -def : InstRW<[FXU, LSU, Lat12], (instregex "STCKF$")>; -def : InstRW<[LSU, FXU, Lat5], (instregex "STCKE$")>; -def : InstRW<[FXU, LSU, Lat5], (instregex "STFLE$")>; -def : InstRW<[FXU, Lat30], (instregex "SVC$")>; - -// Store real address -def : InstRW<[FXU, LSU, Lat5], (instregex "STRAG$")>; - //===----------------------------------------------------------------------===// // .insn directive instructions //===----------------------------------------------------------------------===// @@ -1058,5 +1037,160 @@ def : InstRW<[DFU, Lat9], (instregex "CEXTR$")>; def : InstRW<[LSU, DFU, Lat15], (instregex "TD(C|G)(E|D)T$")>; def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>; + +// -------------------------------- System ---------------------------------- // + +//===----------------------------------------------------------------------===// +// System: Program-Status Word Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, Lat30], (instregex "EPSW$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "LPSW(E)?$")>; +def : 
InstRW<[FXU, Lat3], (instregex "IPK$")>; +def : InstRW<[LSU], (instregex "SPKA$")>; +def : InstRW<[LSU], (instregex "SSM$")>; +def : InstRW<[FXU], (instregex "ST(N|O)SM$")>; +def : InstRW<[FXU, Lat3], (instregex "IAC$")>; +def : InstRW<[LSU], (instregex "SAC(F)?$")>; + +//===----------------------------------------------------------------------===// +// System: Control Register Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, LSU, Lat30], (instregex "LCTL(G)?$")>; +def : InstRW<[LSU, Lat30], (instregex "STCT(L|G)$")>; +def : InstRW<[LSU], (instregex "E(P|S)A(I)?R$")>; +def : InstRW<[FXU, Lat30], (instregex "SSA(I)?R$")>; +def : InstRW<[FXU, Lat30], (instregex "ESEA$")>; + +//===----------------------------------------------------------------------===// +// System: Prefix-Register Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, LSU, Lat30], (instregex "SPX$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "STPX$")>; + +//===----------------------------------------------------------------------===// +// System: Storage-Key and Real Memory Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, Lat30], (instregex "ISKE$")>; +def : InstRW<[FXU, Lat30], (instregex "IVSK$")>; +def : InstRW<[FXU, Lat30], (instregex "SSKE(Opt)?$")>; +def : InstRW<[FXU, Lat30], (instregex "RRB(E|M)$")>; +def : InstRW<[FXU, Lat30], (instregex "PFMF$")>; +def : InstRW<[FXU, Lat30], (instregex "TB$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "PGIN$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "PGOUT$")>; + +//===----------------------------------------------------------------------===// +// System: Dynamic-Address-Translation Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, LSU, Lat30], (instregex "IPTE(Opt)?(Opt)?$")>; +def : InstRW<[FXU, Lat30], (instregex "IDTE(Opt)?$")>; +def : InstRW<[FXU, Lat30], (instregex "PTLB$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "CSP(G)?$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "LPTEA$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "LRA(Y|G)?$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "STRAG$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "LURA(G)?$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "STUR(A|G)$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "TPROT$")>; + +//===----------------------------------------------------------------------===// +// System: Memory-move Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVC(K|P|S)$")>; +def : InstRW<[LSU, Lat6, GroupAlone], (instregex "MVC(S|D)K$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "MVCOS$")>; +def : InstRW<[LSU, Lat30], (instregex "MVPG$")>; + +//===----------------------------------------------------------------------===// +// System: Address-Space Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, LSU, Lat30], (instregex "LASP$")>; +def : InstRW<[LSU], (instregex "PALB$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "PC$")>; +def : InstRW<[FXU, Lat30], (instregex "PR$")>; +def : InstRW<[FXU, Lat30], (instregex "PT(I)?$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "RP$")>; +def : InstRW<[FXU, Lat30], (instregex "BS(G|A)$")>; +def : InstRW<[FXU, Lat20], (instregex 
"TAR$")>; + +//===----------------------------------------------------------------------===// +// System: Linkage-Stack Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, LSU, Lat30], (instregex "BAKR$")>; +def : InstRW<[FXU, Lat30], (instregex "EREG(G)?$")>; +def : InstRW<[FXU, Lat30], (instregex "(E|M)STA$")>; + +//===----------------------------------------------------------------------===// +// System: Time-Related Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, Lat30], (instregex "PTFF$")>; +def : InstRW<[FXU, LSU, Lat20], (instregex "SCK$")>; +def : InstRW<[FXU, Lat30], (instregex "SCKPF$")>; +def : InstRW<[FXU, LSU, Lat20], (instregex "SCKC$")>; +def : InstRW<[FXU, LSU, Lat20], (instregex "SPT$")>; +def : InstRW<[FXU, LSU, Lat15], (instregex "STCK$")>; +def : InstRW<[FXU, LSU, Lat12], (instregex "STCKF$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "STCKE$")>; +def : InstRW<[FXU, LSU, Lat9], (instregex "STCKC$")>; +def : InstRW<[FXU, LSU, Lat8], (instregex "STPT$")>; + +//===----------------------------------------------------------------------===// +// System: CPU-Related Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, LSU, Lat30], (instregex "STAP$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "STIDP$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "STSI$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "STFL(E)?$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "ECAG$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "ECTG$")>; +def : InstRW<[FXU, Lat30], (instregex "PTF$")>; +def : InstRW<[FXU, Lat30], (instregex "PCKMO$")>; + +//===----------------------------------------------------------------------===// +// System: Miscellaneous Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, Lat30], (instregex "SVC$")>; +def : InstRW<[FXU], (instregex "MC$")>; +def : InstRW<[FXU, Lat30], (instregex "DIAG$")>; +def : InstRW<[FXU], (instregex "TRAC(E|G)$")>; +def : InstRW<[FXU, Lat30], (instregex "TRAP(2|4)$")>; +def : InstRW<[FXU, Lat30], (instregex "SIGP$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "SIGA$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "SIE$")>; + +//===----------------------------------------------------------------------===// +// System: CPU-Measurement Facility Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU], (instregex "LPP$")>; +def : InstRW<[FXU, Lat30], (instregex "ECPGA$")>; +def : InstRW<[FXU, Lat30], (instregex "E(C|P)CTR$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "L(C|P|S)CTL$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "Q(S|CTR)I$")>; +def : InstRW<[FXU, Lat30], (instregex "S(C|P)CTR$")>; + +//===----------------------------------------------------------------------===// +// System: I/O Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, Lat30], (instregex "(C|H|R|X)SCH$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "(M|S|ST|T)SCH$")>; +def : InstRW<[FXU, Lat30], (instregex "RCHP$")>; +def : InstRW<[FXU, Lat30], (instregex "SCHM$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "STC(PS|RW)$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "TPI$")>; +def : InstRW<[FXU, Lat30], (instregex "SAL$")>; + } diff --git 
a/lib/Target/SystemZ/SystemZScheduleZEC12.td b/lib/Target/SystemZ/SystemZScheduleZEC12.td index d38ca64d2e9b..76b378454631 100644 --- a/lib/Target/SystemZ/SystemZScheduleZEC12.td +++ b/lib/Target/SystemZ/SystemZScheduleZEC12.td @@ -320,6 +320,9 @@ def : InstRW<[FXU], (instregex "ALGF(I|R)$")>; def : InstRW<[FXU], (instregex "ALGR(K)?$")>; def : InstRW<[FXU], (instregex "ALR(K)?$")>; def : InstRW<[FXU], (instregex "AR(K)?$")>; +def : InstRW<[FXU], (instregex "A(L)?HHHR$")>; +def : InstRW<[FXU, Lat2], (instregex "A(L)?HHLR$")>; +def : InstRW<[FXU], (instregex "ALSIH(N)?$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?G(SI)?$")>; // Logical addition with carry @@ -343,6 +346,8 @@ def : InstRW<[FXU], (instregex "SLGF(I|R)$")>; def : InstRW<[FXU], (instregex "SLGR(K)?$")>; def : InstRW<[FXU], (instregex "SLR(K)?$")>; def : InstRW<[FXU], (instregex "SR(K)?$")>; +def : InstRW<[FXU], (instregex "S(L)?HHHR$")>; +def : InstRW<[FXU, Lat2], (instregex "S(L)?HHLR$")>; // Subtraction with borrow def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "SLB(G)?$")>; @@ -478,6 +483,8 @@ def : InstRW<[FXU], (instregex "CLIH$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "CLI(Y)?$")>; def : InstRW<[FXU], (instregex "CLR$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "CLRL$")>; +def : InstRW<[FXU], (instregex "C(L)?HHR$")>; +def : InstRW<[FXU, Lat2], (instregex "C(L)?HLR$")>; // Compare halfword def : InstRW<[FXU, LSU, Lat6], (instregex "CH(Y|RL)?$")>; @@ -672,37 +679,9 @@ def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UPT$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CKSM$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CMPSC$")>; -// Move with key -def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVCK$")>; - -// Monitor call -def : InstRW<[FXU], (instregex "MC$")>; - -// Extract CPU attribute -def : InstRW<[FXU, Lat30], (instregex "ECAG$")>; - -// Extract CPU Time -def : InstRW<[FXU, Lat5, LSU], (instregex "ECTG$")>; - -// Extract PSW -def : InstRW<[FXU, Lat30], (instregex "EPSW$")>; - // Execute def : InstRW<[LSU, GroupAlone], (instregex "EX(RL)?$")>; -// Program return -def : InstRW<[FXU, Lat30], (instregex "PR$")>; - -// Inline assembly -def : InstRW<[FXU, LSU, LSU, Lat9, GroupAlone], (instregex "STCK(F)?$")>; -def : InstRW<[LSU, LSU, LSU, LSU, FXU, FXU, Lat20, GroupAlone], - (instregex "STCKE$")>; -def : InstRW<[FXU, LSU, Lat5], (instregex "STFLE$")>; -def : InstRW<[FXU, Lat30], (instregex "SVC$")>; - -// Store real address -def : InstRW<[FXU, LSU, Lat5], (instregex "STRAG$")>; - //===----------------------------------------------------------------------===// // .insn directive instructions //===----------------------------------------------------------------------===// @@ -1102,5 +1081,161 @@ def : InstRW<[DFU, Lat9], (instregex "CEXTR$")>; def : InstRW<[LSU, DFU, Lat15], (instregex "TD(C|G)(E|D)T$")>; def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>; + +// -------------------------------- System ---------------------------------- // + +//===----------------------------------------------------------------------===// +// System: Program-Status Word Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, Lat30], (instregex "EPSW$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "LPSW(E)?$")>; +def : InstRW<[FXU, Lat3], (instregex "IPK$")>; +def : InstRW<[LSU], (instregex "SPKA$")>; +def : InstRW<[LSU], (instregex "SSM$")>; +def : InstRW<[FXU], (instregex "ST(N|O)SM$")>; +def : InstRW<[FXU, 
Lat3], (instregex "IAC$")>; +def : InstRW<[LSU], (instregex "SAC(F)?$")>; + +//===----------------------------------------------------------------------===// +// System: Control Register Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, LSU, Lat30], (instregex "LCTL(G)?$")>; +def : InstRW<[LSU, Lat30], (instregex "STCT(L|G)$")>; +def : InstRW<[LSU], (instregex "E(P|S)A(I)?R$")>; +def : InstRW<[FXU, Lat30], (instregex "SSA(I)?R$")>; +def : InstRW<[FXU, Lat30], (instregex "ESEA$")>; + +//===----------------------------------------------------------------------===// +// System: Prefix-Register Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, LSU, Lat30], (instregex "SPX$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "STPX$")>; + +//===----------------------------------------------------------------------===// +// System: Storage-Key and Real Memory Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, Lat30], (instregex "ISKE$")>; +def : InstRW<[FXU, Lat30], (instregex "IVSK$")>; +def : InstRW<[FXU, Lat30], (instregex "SSKE(Opt)?$")>; +def : InstRW<[FXU, Lat30], (instregex "RRB(E|M)$")>; +def : InstRW<[FXU, Lat30], (instregex "PFMF$")>; +def : InstRW<[FXU, Lat30], (instregex "TB$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "PGIN$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "PGOUT$")>; + +//===----------------------------------------------------------------------===// +// System: Dynamic-Address-Translation Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, LSU, Lat30], (instregex "IPTE(Opt)?(Opt)?$")>; +def : InstRW<[FXU, Lat30], (instregex "IDTE(Opt)?$")>; +def : InstRW<[FXU, Lat30], (instregex "CRDTE(Opt)?$")>; +def : InstRW<[FXU, Lat30], (instregex "PTLB$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "CSP(G)?$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "LPTEA$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "LRA(Y|G)?$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "STRAG$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "LURA(G)?$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "STUR(A|G)$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "TPROT$")>; + +//===----------------------------------------------------------------------===// +// System: Memory-move Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVC(K|P|S)$")>; +def : InstRW<[LSU, Lat6, GroupAlone], (instregex "MVC(S|D)K$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "MVCOS$")>; +def : InstRW<[LSU, Lat30], (instregex "MVPG$")>; + +//===----------------------------------------------------------------------===// +// System: Address-Space Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, LSU, Lat30], (instregex "LASP$")>; +def : InstRW<[LSU], (instregex "PALB$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "PC$")>; +def : InstRW<[FXU, Lat30], (instregex "PR$")>; +def : InstRW<[FXU, Lat30], (instregex "PT(I)?$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "RP$")>; +def : InstRW<[FXU, Lat30], (instregex "BS(G|A)$")>; +def : InstRW<[FXU, Lat20], (instregex "TAR$")>; + +//===----------------------------------------------------------------------===// +// System: Linkage-Stack Instructions 
+//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, LSU, Lat30], (instregex "BAKR$")>; +def : InstRW<[FXU, Lat30], (instregex "EREG(G)?$")>; +def : InstRW<[FXU, Lat30], (instregex "(E|M)STA$")>; + +//===----------------------------------------------------------------------===// +// System: Time-Related Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, Lat30], (instregex "PTFF$")>; +def : InstRW<[FXU, LSU, Lat20], (instregex "SCK$")>; +def : InstRW<[FXU, Lat30], (instregex "SCKPF$")>; +def : InstRW<[FXU, LSU, Lat20], (instregex "SCKC$")>; +def : InstRW<[FXU, LSU, Lat20], (instregex "SPT$")>; +def : InstRW<[FXU, LSU, LSU, Lat9, GroupAlone], (instregex "STCK(F)?$")>; +def : InstRW<[LSU, LSU, LSU, LSU, FXU, FXU, Lat20, GroupAlone], + (instregex "STCKE$")>; +def : InstRW<[FXU, LSU, Lat9], (instregex "STCKC$")>; +def : InstRW<[FXU, LSU, Lat8], (instregex "STPT$")>; + +//===----------------------------------------------------------------------===// +// System: CPU-Related Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, LSU, Lat30], (instregex "STAP$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "STIDP$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "STSI$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "STFL(E)?$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "ECAG$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "ECTG$")>; +def : InstRW<[FXU, Lat30], (instregex "PTF$")>; +def : InstRW<[FXU, Lat30], (instregex "PCKMO$")>; + +//===----------------------------------------------------------------------===// +// System: Miscellaneous Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, Lat30], (instregex "SVC$")>; +def : InstRW<[FXU], (instregex "MC$")>; +def : InstRW<[FXU, Lat30], (instregex "DIAG$")>; +def : InstRW<[FXU], (instregex "TRAC(E|G)$")>; +def : InstRW<[FXU, Lat30], (instregex "TRAP(2|4)$")>; +def : InstRW<[FXU, Lat30], (instregex "SIGP$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "SIGA$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "SIE$")>; + +//===----------------------------------------------------------------------===// +// System: CPU-Measurement Facility Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU], (instregex "LPP$")>; +def : InstRW<[FXU, Lat30], (instregex "ECPGA$")>; +def : InstRW<[FXU, Lat30], (instregex "E(C|P)CTR$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "L(C|P|S)CTL$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "Q(S|CTR)I$")>; +def : InstRW<[FXU, Lat30], (instregex "S(C|P)CTR$")>; + +//===----------------------------------------------------------------------===// +// System: I/O Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, Lat30], (instregex "(C|H|R|X)SCH$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "(M|S|ST|T)SCH$")>; +def : InstRW<[FXU, Lat30], (instregex "RCHP$")>; +def : InstRW<[FXU, Lat30], (instregex "SCHM$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "STC(PS|RW)$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "TPI$")>; +def : InstRW<[FXU, Lat30], (instregex "SAL$")>; + } diff --git a/lib/Target/SystemZ/SystemZSubtarget.cpp b/lib/Target/SystemZ/SystemZSubtarget.cpp index 0ab0c2f25915..eb4a0962f7eb 100644 --- a/lib/Target/SystemZ/SystemZSubtarget.cpp +++ 
b/lib/Target/SystemZ/SystemZSubtarget.cpp @@ -37,12 +37,13 @@ SystemZSubtarget::SystemZSubtarget(const Triple &TT, const std::string &CPU, const TargetMachine &TM) : SystemZGenSubtargetInfo(TT, CPU, FS), HasDistinctOps(false), HasLoadStoreOnCond(false), HasHighWord(false), HasFPExtension(false), - HasPopulationCount(false), HasMessageSecurityAssist4(false), + HasPopulationCount(false), HasMessageSecurityAssist3(false), + HasMessageSecurityAssist4(false), HasResetReferenceBitsMultiple(false), HasFastSerialization(false), HasInterlockedAccess1(false), HasMiscellaneousExtensions(false), HasExecutionHint(false), HasLoadAndTrap(false), HasTransactionalExecution(false), HasProcessorAssist(false), - HasDFPZonedConversion(false), + HasDFPZonedConversion(false), HasEnhancedDAT2(false), HasVector(false), HasLoadStoreOnCond2(false), HasLoadAndZeroRightmostByte(false), HasMessageSecurityAssist5(false), HasDFPPackedConversion(false), diff --git a/lib/Target/SystemZ/SystemZSubtarget.h b/lib/Target/SystemZ/SystemZSubtarget.h index be480f03c572..b05a1bb6cafd 100644 --- a/lib/Target/SystemZ/SystemZSubtarget.h +++ b/lib/Target/SystemZ/SystemZSubtarget.h @@ -39,7 +39,9 @@ protected: bool HasHighWord; bool HasFPExtension; bool HasPopulationCount; + bool HasMessageSecurityAssist3; bool HasMessageSecurityAssist4; + bool HasResetReferenceBitsMultiple; bool HasFastSerialization; bool HasInterlockedAccess1; bool HasMiscellaneousExtensions; @@ -48,6 +50,7 @@ protected: bool HasTransactionalExecution; bool HasProcessorAssist; bool HasDFPZonedConversion; + bool HasEnhancedDAT2; bool HasVector; bool HasLoadStoreOnCond2; bool HasLoadAndZeroRightmostByte; @@ -109,9 +112,18 @@ public: bool hasPopulationCount() const { return HasPopulationCount; } // Return true if the target has the message-security-assist + // extension facility 3. + bool hasMessageSecurityAssist3() const { return HasMessageSecurityAssist3; } + + // Return true if the target has the message-security-assist // extension facility 4. bool hasMessageSecurityAssist4() const { return HasMessageSecurityAssist4; } + // Return true if the target has the reset-reference-bits-multiple facility. + bool hasResetReferenceBitsMultiple() const { + return HasResetReferenceBitsMultiple; + } + // Return true if the target has the fast-serialization facility. bool hasFastSerialization() const { return HasFastSerialization; } @@ -138,6 +150,9 @@ public: // Return true if the target has the DFP zoned-conversion facility. bool hasDFPZonedConversion() const { return HasDFPZonedConversion; } + // Return true if the target has the enhanced-DAT facility 2. + bool hasEnhancedDAT2() const { return HasEnhancedDAT2; } + // Return true if the target has the load-and-zero-rightmost-byte facility. bool hasLoadAndZeroRightmostByte() const { return HasLoadAndZeroRightmostByte; diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp index 422c16b8eb62..ce5c57e0f519 100644 --- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -238,7 +238,7 @@ SystemZTTIImpl::getPopcntSupport(unsigned TyWidth) { return TTI::PSK_Software; } -void SystemZTTIImpl::getUnrollingPreferences(Loop *L, +void SystemZTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP) { // Find out if L contains a call, what the machine instruction count // estimate is, and how many stores there are. 
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h index bdba7601eb78..6923fc6fc910 100644 --- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -45,7 +45,8 @@ public: TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth); - void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); + void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, + TTI::UnrollingPreferences &UP); /// @} diff --git a/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/lib/Target/WebAssembly/WebAssemblyInstrControl.td index 39cb1ca336f2..129794171464 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrControl.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrControl.td @@ -57,17 +57,19 @@ def BR_TABLE_I64 : I<(outs), (ins I64:$index, variable_ops), } } // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 -// Placemarkers to indicate the start or end of a block or loop scope. These -// use/clobber VALUE_STACK to prevent them from being moved into the middle of -// an expression tree. +// Placemarkers to indicate the start or end of a block, loop, or try scope. +// These use/clobber VALUE_STACK to prevent them from being moved into the +// middle of an expression tree. let Uses = [VALUE_STACK], Defs = [VALUE_STACK] in { def BLOCK : I<(outs), (ins Signature:$sig), [], "block \t$sig", 0x02>; def LOOP : I<(outs), (ins Signature:$sig), [], "loop \t$sig", 0x03>; +def TRY : I<(outs), (ins Signature:$sig), [], "try \t$sig", 0x06>; -// END_BLOCK, END_LOOP, and END_FUNCTION are represented with the same opcode -// in wasm. +// END_BLOCK, END_LOOP, END_TRY, and END_FUNCTION are represented with the same +// opcode in wasm. def END_BLOCK : I<(outs), (ins), [], "end_block", 0x0b>; def END_LOOP : I<(outs), (ins), [], "end_loop", 0x0b>; +def END_TRY : I<(outs), (ins), [], "end_try", 0x0b>; let isTerminator = 1, isBarrier = 1 in def END_FUNCTION : I<(outs), (ins), [], "end_function", 0x0b>; } // Uses = [VALUE_STACK], Defs = [VALUE_STACK] @@ -112,6 +114,20 @@ let isReturn = 1 in { def UNREACHABLE : I<(outs), (ins), [(trap)], "unreachable", 0x00>; +def THROW_I32 : I<(outs), (ins i32imm:$tag, I32:$obj), + [(int_wasm_throw imm:$tag, I32:$obj)], "throw \t$tag, $obj", + 0x08>; +def THROW_I64 : I<(outs), (ins i32imm:$tag, I64:$obj), + [(int_wasm_throw imm:$tag, I64:$obj)], "throw \t$tag, $obj", + 0x08>; +def RETHROW : I<(outs), (ins i32imm:$rel_depth), [], "rethrow \t$rel_depth", + 0x09>; + } // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 } // Defs = [ARGUMENTS] + +// rethrow takes a relative depth as an argument, for which currently only 0 is +// possible for C++. Once other languages need depths other than 0, depths will +// be computed in CFGStackify. 
+def : Pat<(int_wasm_rethrow), (RETHROW 0)>; diff --git a/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp index 947c0329bb6e..f0b6a3e35dba 100644 --- a/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp +++ b/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp @@ -897,7 +897,7 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runEHOnFunction(Function &F) { } } - // Look for orphan landingpads, can occur in blocks with no predecesors + // Look for orphan landingpads, can occur in blocks with no predecessors for (BasicBlock &BB : F) { Instruction *I = BB.getFirstNonPHI(); if (auto *LPI = dyn_cast<LandingPadInst>(I)) diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index d30cc724c203..825f23dc52d9 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -49,8 +49,11 @@ static const char OpPrecedence[] = { 4, // IC_MINUS 5, // IC_MULTIPLY 5, // IC_DIVIDE - 6, // IC_RPAREN - 7, // IC_LPAREN + 5, // IC_MOD + 6, // IC_NOT + 7, // IC_NEG + 8, // IC_RPAREN + 9, // IC_LPAREN 0, // IC_IMM 0 // IC_REGISTER }; @@ -92,6 +95,9 @@ private: IC_MINUS, IC_MULTIPLY, IC_DIVIDE, + IC_MOD, + IC_NOT, + IC_NEG, IC_RPAREN, IC_LPAREN, IC_IMM, @@ -111,6 +117,10 @@ private: SmallVector<InfixCalculatorTok, 4> InfixOperatorStack; SmallVector<ICToken, 4> PostfixStack; + bool isUnaryOperator(const InfixCalculatorTok Op) { + return Op == IC_NEG || Op == IC_NOT; + } + public: int64_t popOperand() { assert (!PostfixStack.empty() && "Poped an empty stack!"); @@ -192,6 +202,22 @@ private: ICToken Op = PostfixStack[i]; if (Op.first == IC_IMM || Op.first == IC_REGISTER) { OperandStack.push_back(Op); + } else if (isUnaryOperator(Op.first)) { + assert (OperandStack.size() > 0 && "Too few operands."); + ICToken Operand = OperandStack.pop_back_val(); + assert (Operand.first == IC_IMM && + "Unary operation with a register!"); + switch (Op.first) { + default: + report_fatal_error("Unexpected operator!"); + break; + case IC_NEG: + OperandStack.push_back(std::make_pair(IC_IMM, -Operand.second)); + break; + case IC_NOT: + OperandStack.push_back(std::make_pair(IC_IMM, ~Operand.second)); + break; + } } else { assert (OperandStack.size() > 1 && "Too few operands."); int64_t Val; @@ -222,6 +248,12 @@ private: Val = Op1.second / Op2.second; OperandStack.push_back(std::make_pair(IC_IMM, Val)); break; + case IC_MOD: + assert (Op1.first == IC_IMM && Op2.first == IC_IMM && + "Modulo operation with an immediate and a register!"); + Val = Op1.second % Op2.second; + OperandStack.push_back(std::make_pair(IC_IMM, Val)); + break; case IC_OR: assert (Op1.first == IC_IMM && Op2.first == IC_IMM && "Or operation with an immediate and a register!"); @@ -271,6 +303,7 @@ private: IES_NOT, IES_MULTIPLY, IES_DIVIDE, + IES_MOD, IES_LBRAC, IES_RBRAC, IES_LPAREN, @@ -421,10 +454,16 @@ private: default: State = IES_ERROR; break; + case IES_OR: + case IES_XOR: + case IES_AND: + case IES_LSHIFT: + case IES_RSHIFT: case IES_PLUS: case IES_NOT: case IES_MULTIPLY: case IES_DIVIDE: + case IES_MOD: case IES_LPAREN: case IES_RPAREN: case IES_LBRAC: @@ -432,11 +471,12 @@ private: case IES_INTEGER: case IES_REGISTER: State = IES_MINUS; - // Only push the minus operator if it is not a unary operator. 
- if (!(CurrState == IES_PLUS || CurrState == IES_MINUS || - CurrState == IES_MULTIPLY || CurrState == IES_DIVIDE || - CurrState == IES_LPAREN || CurrState == IES_LBRAC)) + // push minus operator if it is not a negate operator + if (CurrState == IES_REGISTER || CurrState == IES_RPAREN || + CurrState == IES_INTEGER || CurrState == IES_RBRAC) IC.pushOperator(IC_MINUS); + else + IC.pushOperator(IC_NEG); if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) { // If we already have a BaseReg, then assume this is the IndexReg with // a scale of 1. @@ -458,9 +498,21 @@ private: default: State = IES_ERROR; break; + case IES_OR: + case IES_XOR: + case IES_AND: + case IES_LSHIFT: + case IES_RSHIFT: case IES_PLUS: + case IES_MINUS: case IES_NOT: + case IES_MULTIPLY: + case IES_DIVIDE: + case IES_MOD: + case IES_LPAREN: + case IES_LBRAC: State = IES_NOT; + IC.pushOperator(IC_NOT); break; } PrevState = CurrState; @@ -525,6 +577,7 @@ private: case IES_LSHIFT: case IES_RSHIFT: case IES_DIVIDE: + case IES_MOD: case IES_MULTIPLY: case IES_LPAREN: State = IES_INTEGER; @@ -539,26 +592,6 @@ private: } // Get the scale and replace the 'Register * Scale' with '0'. IC.popOperator(); - } else if ((PrevState == IES_PLUS || PrevState == IES_MINUS || - PrevState == IES_OR || PrevState == IES_AND || - PrevState == IES_LSHIFT || PrevState == IES_RSHIFT || - PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE || - PrevState == IES_LPAREN || PrevState == IES_LBRAC || - PrevState == IES_NOT || PrevState == IES_XOR) && - CurrState == IES_MINUS) { - // Unary minus. No need to pop the minus operand because it was never - // pushed. - IC.pushOperand(IC_IMM, -TmpInt); // Push -Imm. - } else if ((PrevState == IES_PLUS || PrevState == IES_MINUS || - PrevState == IES_OR || PrevState == IES_AND || - PrevState == IES_LSHIFT || PrevState == IES_RSHIFT || - PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE || - PrevState == IES_LPAREN || PrevState == IES_LBRAC || - PrevState == IES_NOT || PrevState == IES_XOR) && - CurrState == IES_NOT) { - // Unary not. No need to pop the not operand because it was never - // pushed. - IC.pushOperand(IC_IMM, ~TmpInt); // Push ~Imm. } else { IC.pushOperand(IC_IMM, TmpInt); } @@ -594,6 +627,19 @@ private: break; } } + void onMod() { + PrevState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_INTEGER: + case IES_RPAREN: + State = IES_MOD; + IC.pushOperator(IC_MOD); + break; + } + } void onLBrac() { PrevState = State; switch (State) { @@ -647,18 +693,8 @@ private: case IES_RSHIFT: case IES_MULTIPLY: case IES_DIVIDE: + case IES_MOD: case IES_LPAREN: - // FIXME: We don't handle this type of unary minus or not, yet. 
- if ((PrevState == IES_PLUS || PrevState == IES_MINUS || - PrevState == IES_OR || PrevState == IES_AND || - PrevState == IES_LSHIFT || PrevState == IES_RSHIFT || - PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE || - PrevState == IES_LPAREN || PrevState == IES_LBRAC || - PrevState == IES_NOT || PrevState == IES_XOR) && - (CurrState == IES_MINUS || CurrState == IES_NOT)) { - State = IES_ERROR; - break; - } State = IES_LPAREN; IC.pushOperator(IC_LPAREN); break; @@ -1302,6 +1338,8 @@ bool X86AsmParser::ParseIntelNamedOperator(StringRef Name, IntelExprStateMachine SM.onXor(); else if (Name.equals_lower("and")) SM.onAnd(); + else if (Name.equals_lower("mod")) + SM.onMod(); else return false; return true; diff --git a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp index caf98bffb80d..8f2017e990c5 100644 --- a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp @@ -396,7 +396,7 @@ bool X86MachObjectWriter::recordScatteredRelocation(MachObjectWriter *Writer, if (!SB->getFragment()) { Asm.getContext().reportError( Fixup.getLoc(), - "symbol '" + B->getSymbol().getName() + + "symbol '" + SB->getName() + "' can not be undefined in a subtraction expression"); return false; } @@ -408,7 +408,7 @@ bool X86MachObjectWriter::recordScatteredRelocation(MachObjectWriter *Writer, // pedantic compatibility with 'as'. Type = A->isExternal() ? (unsigned)MachO::GENERIC_RELOC_SECTDIFF : (unsigned)MachO::GENERIC_RELOC_LOCAL_SECTDIFF; - Value2 = Writer->getSymbolAddress(B->getSymbol(), Layout); + Value2 = Writer->getSymbolAddress(*SB, Layout); FixedValue -= Writer->getSectionAddress(SB->getFragment()->getParent()); } @@ -468,8 +468,8 @@ void X86MachObjectWriter::recordTLVPRelocation(MachObjectWriter *Writer, const MCFixup &Fixup, MCValue Target, uint64_t &FixedValue) { - assert(Target.getSymA()->getKind() == MCSymbolRefExpr::VK_TLVP && - !is64Bit() && + const MCSymbolRefExpr *SymA = Target.getSymA(); + assert(SymA->getKind() == MCSymbolRefExpr::VK_TLVP && !is64Bit() && "Should only be called with a 32-bit TLVP relocation!"); unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind()); @@ -480,15 +480,14 @@ void X86MachObjectWriter::recordTLVPRelocation(MachObjectWriter *Writer, // subtraction from the picbase. For 32-bit pic the addend is the difference // between the picbase and the next address. For 32-bit static the addend is // zero. - if (Target.getSymB()) { + if (auto *SymB = Target.getSymB()) { // If this is a subtraction then we're pcrel. 
uint32_t FixupAddress = Writer->getFragmentAddress(Fragment, Layout) + Fixup.getOffset(); IsPCRel = 1; - FixedValue = - FixupAddress - - Writer->getSymbolAddress(Target.getSymB()->getSymbol(), Layout) + - Target.getConstant(); + FixedValue = FixupAddress - + Writer->getSymbolAddress(SymB->getSymbol(), Layout) + + Target.getConstant(); FixedValue += 1ULL << Log2Size; } else { FixedValue = 0; @@ -499,8 +498,7 @@ void X86MachObjectWriter::recordTLVPRelocation(MachObjectWriter *Writer, MRE.r_word0 = Value; MRE.r_word1 = (IsPCRel << 24) | (Log2Size << 25) | (MachO::GENERIC_RELOC_TLV << 28); - Writer->addRelocation(&Target.getSymA()->getSymbol(), Fragment->getParent(), - MRE); + Writer->addRelocation(&SymA->getSymbol(), Fragment->getParent(), MRE); } void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer, diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp index 5892f1de33ee..807f7a6ddb19 100644 --- a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp @@ -44,7 +44,7 @@ unsigned X86WinCOFFObjectWriter::getRelocType(MCContext &Ctx, const MCAsmBackend &MAB) const { unsigned FixupKind = Fixup.getKind(); if (IsCrossSection) { - if (FixupKind != FK_Data_4) { + if (FixupKind != FK_Data_4 && FixupKind != llvm::X86::reloc_signed_4byte) { Ctx.reportError(Fixup.getLoc(), "Cannot represent this expression"); return COFF::IMAGE_REL_AMD64_ADDR32; } diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index fe105298f5c1..7437ebacfac3 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -300,6 +300,8 @@ def ProcIntelAtom : SubtargetFeature<"atom", "X86ProcFamily", "IntelAtom", "Intel Atom processors">; def ProcIntelSLM : SubtargetFeature<"slm", "X86ProcFamily", "IntelSLM", "Intel Silvermont processors">; +def ProcIntelGLM : SubtargetFeature<"glm", "X86ProcFamily", "IntelGLM", + "Intel Goldmont processors">; class Proc<string Name, list<SubtargetFeature> Features> : ProcessorModel<Name, GenericModel, Features>; @@ -430,6 +432,34 @@ class SilvermontProc<string Name> : ProcessorModel<Name, SLMModel, [ def : SilvermontProc<"silvermont">; def : SilvermontProc<"slm">; // Legacy alias. +class GoldmontProc<string Name> : ProcessorModel<Name, SLMModel, [ + ProcIntelGLM, + FeatureX87, + FeatureMMX, + FeatureSSE42, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureMOVBE, + FeaturePOPCNT, + FeaturePCLMUL, + FeatureAES, + FeaturePRFCHW, + FeatureCallRegIndirect, + FeatureSlowLEA, + FeatureSlowIncDec, + FeatureSlowBTMem, + FeatureLAHFSAHF, + FeatureMPX, + FeatureSHA, + FeatureRDSEED, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureXSAVEC, + FeatureXSAVES, + FeatureCLFLUSHOPT +]>; +def : GoldmontProc<"goldmont">; + // "Arrandale" along with corei3 and corei5 class NehalemProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [ FeatureX87, diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index f777e5628988..b89914f8893e 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -5065,6 +5065,20 @@ static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256); } +// Return true if the instruction zeroes the unused upper part of the +// destination and accepts mask. 
+static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) { + switch (Opcode) { + default: + return false; + case X86ISD::PCMPEQM: + case X86ISD::PCMPGTM: + case X86ISD::CMPM: + case X86ISD::CMPMU: + return true; + } +} + /// Insert i1-subvector to i1-vector. static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { @@ -5097,6 +5111,22 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, // 3. Subvector should be inserted in the middle (for example v2i1 // to v16i1, index 2) + // If this node widens - by concatenating zeroes - the type of the result + // of a node with instruction that zeroes all upper (irrelevant) bits of the + // output register, mark this node as legal to enable replacing them with + // the v8i1 version of the previous instruction during instruction selection. + // For example, VPCMPEQDZ128rr instruction stores its v4i1 result in a k-reg, + // while zeroing all the upper remaining 60 bits of the register. if the + // result of such instruction is inserted into an allZeroVector, then we can + // safely remove insert_vector (in instruction selection) as the cmp instr + // already zeroed the rest of the register. + if (ISD::isBuildVectorAllZeros(Vec.getNode()) && IdxVal == 0 && + (isMaskedZeroUpperBitsvXi1(SubVec.getOpcode()) || + (SubVec.getOpcode() == ISD::AND && + (isMaskedZeroUpperBitsvXi1(SubVec.getOperand(0).getOpcode()) || + isMaskedZeroUpperBitsvXi1(SubVec.getOperand(1).getOpcode()))))) + return Op; + // extend to natively supported kshift MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1; MVT WideOpVT = OpVT; @@ -7919,6 +7949,60 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl); } +// Return true if all the operands of the given CONCAT_VECTORS node are zeros +// except for the first one. (CONCAT_VECTORS Op, 0, 0,...,0) +static bool isExpandWithZeros(const SDValue &Op) { + assert(Op.getOpcode() == ISD::CONCAT_VECTORS && + "Expand with zeros only possible in CONCAT_VECTORS nodes!"); + + for (unsigned i = 1; i < Op.getNumOperands(); i++) + if (!ISD::isBuildVectorAllZeros(Op.getOperand(i).getNode())) + return false; + + return true; +} + +// Returns true if the given node is a type promotion (by concatenating i1 +// zeros) of the result of a node that already zeros all upper bits of +// k-register. +static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) { + unsigned Opc = Op.getOpcode(); + + assert(Opc == ISD::CONCAT_VECTORS && + Op.getSimpleValueType().getVectorElementType() == MVT::i1 && + "Unexpected node to check for type promotion!"); + + // As long as we are concatenating zeros to the upper part of a previous node + // result, climb up the tree until a node with different opcode is + // encountered + while (Opc == ISD::INSERT_SUBVECTOR || Opc == ISD::CONCAT_VECTORS) { + if (Opc == ISD::INSERT_SUBVECTOR) { + if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) && + Op.getConstantOperandVal(2) == 0) + Op = Op.getOperand(1); + else + return SDValue(); + } else { // Opc == ISD::CONCAT_VECTORS + if (isExpandWithZeros(Op)) + Op = Op.getOperand(0); + else + return SDValue(); + } + Opc = Op.getOpcode(); + } + + // Check if the first inserted node zeroes the upper bits, or an 'and' result + // of a node that zeros the upper bits (its masked version). 
+ if (isMaskedZeroUpperBitsvXi1(Op.getOpcode()) || + (Op.getOpcode() == ISD::AND && + (isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) || + isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode())))) { + return Op; + } + + return SDValue(); +} + static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG & DAG) { @@ -7929,6 +8013,17 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, assert(isPowerOf2_32(NumOfOperands) && "Unexpected number of operands in CONCAT_VECTORS"); + // If this node promotes - by concatenating zeroes - the type of the result + // of a node with instruction that zeroes all upper (irrelevant) bits of the + // output register, mark it as legal and catch the pattern in instruction + // selection to avoid emitting extra insturctions (for zeroing upper bits). + if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op)) { + SDValue ZeroC = DAG.getConstant(0, dl, MVT::i64); + SDValue AllZeros = DAG.getSplatBuildVector(ResVT, dl, ZeroC); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, AllZeros, Promoted, + ZeroC); + } + SDValue Undef = DAG.getUNDEF(ResVT); if (NumOfOperands > 2) { // Specialize the cases when all, or all but one, of the operands are undef. @@ -27012,6 +27107,9 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) { unsigned NumMaskElts = Mask.size(); + unsigned InputSizeInBits = MaskVT.getSizeInBits(); + unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts; + MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits); bool ContainsZeros = false; APInt Zeroable(NumMaskElts, false); @@ -27027,7 +27125,7 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) { int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle, - MaskVT.getScalarSizeInBits(), Mask, + MaskScalarSizeInBits, Mask, 0, Zeroable, Subtarget); if (0 < ShiftAmt) { PermuteImm = (unsigned)ShiftAmt; @@ -27043,10 +27141,6 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, return SM_SentinelUndef <= M && M < (int)NumMaskElts; }) && "Expected unary shuffle"); - unsigned InputSizeInBits = MaskVT.getSizeInBits(); - unsigned MaskScalarSizeInBits = InputSizeInBits / Mask.size(); - MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits); - // Handle PSHUFLW/PSHUFHW repeated patterns. if (MaskScalarSizeInBits == 16) { SmallVector<int, 4> RepeatedMask; @@ -35072,7 +35166,7 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG, /// that is commonly recognized as an idiom (has no register dependency), so /// that's better/smaller than loading a splat 1 constant. 
static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) { - assert(N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB && + assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) && "Unexpected opcode for increment/decrement transform"); // Pseudo-legality check: getOnesVector() expects one of these types, so bail diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 01a70323224c..cc5c09cbf0e5 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -185,6 +185,20 @@ def avx512vl_f32_info : AVX512VLVectorVTInfo<v16f32_info, v8f32x_info, def avx512vl_f64_info : AVX512VLVectorVTInfo<v8f64_info, v4f64x_info, v2f64x_info>; +class X86KVectorVTInfo<RegisterClass _krc, RegisterClass _krcwm, + ValueType _vt> { + RegisterClass KRC = _krc; + RegisterClass KRCWM = _krcwm; + ValueType KVT = _vt; +} + +def v2i1_info : X86KVectorVTInfo<VK2, VK2WM, v2i1>; +def v4i1_info : X86KVectorVTInfo<VK4, VK4WM, v4i1>; +def v8i1_info : X86KVectorVTInfo<VK8, VK8WM, v8i1>; +def v16i1_info : X86KVectorVTInfo<VK16, VK16WM, v16i1>; +def v32i1_info : X86KVectorVTInfo<VK32, VK32WM, v32i1>; +def v64i1_info : X86KVectorVTInfo<VK64, VK64WM, v64i1>; + // This multiclass generates the masking variants from the non-masking // variant. It only provides the assembly pieces for the masking variants. // It assumes custom ISel patterns for masking which can be provided as @@ -1735,17 +1749,217 @@ defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm, avx512vl_i64_info, HasAVX512>, T8PD, VEX_W, EVEX_CD8<64, CD8VF>; -let Predicates = [HasAVX512, NoVLX] in { -def : Pat<(v8i1 (X86pcmpgtm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), - (COPY_TO_REGCLASS (VPCMPGTDZrr - (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), - (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), VK8)>; -def : Pat<(v8i1 (X86pcmpeqm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), - (COPY_TO_REGCLASS (VPCMPEQDZrr - (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), - (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), VK8)>; -} +multiclass avx512_icmp_packed_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf, + SDNode OpNode, string InstrStr, + list<Predicate> Preds> { +let Predicates = Preds in { + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))), + (i64 0)), + (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rr) _.RC:$src1, _.RC:$src2), + NewInf.KRC)>; + + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2))))), + (i64 0)), + (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rm) _.RC:$src1, addr:$src2), + NewInf.KRC)>; + + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (and _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)))), + (i64 0)), + (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rrk) _.KRCWM:$mask, + _.RC:$src1, _.RC:$src2), + NewInf.KRC)>; + + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (and (_.KVT _.KRCWM:$mask), + (_.KVT (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert + (_.LdFrag addr:$src2))))))), + (i64 0)), + (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmk) _.KRCWM:$mask, + _.RC:$src1, addr:$src2), + NewInf.KRC)>; +} +} + +multiclass avx512_icmp_packed_rmb_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf, + SDNode OpNode, string InstrStr, + list<Predicate> Preds> + : avx512_icmp_packed_lowering<_, 
NewInf, OpNode, InstrStr, Preds> { +let Predicates = Preds in { + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (OpNode (_.VT _.RC:$src1), + (X86VBroadcast (_.ScalarLdFrag addr:$src2)))), + (i64 0)), + (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmb) _.RC:$src1, addr:$src2), + NewInf.KRC)>; + + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (and (_.KVT _.KRCWM:$mask), + (_.KVT (OpNode (_.VT _.RC:$src1), + (X86VBroadcast + (_.ScalarLdFrag addr:$src2)))))), + (i64 0)), + (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmbk) _.KRCWM:$mask, + _.RC:$src1, addr:$src2), + NewInf.KRC)>; +} +} + +// VPCMPEQB - i8 +defm : avx512_icmp_packed_lowering<v16i8x_info, v32i1_info, X86pcmpeqm, + "VPCMPEQBZ128", [HasBWI, HasVLX]>; +defm : avx512_icmp_packed_lowering<v16i8x_info, v64i1_info, X86pcmpeqm, + "VPCMPEQBZ128", [HasBWI, HasVLX]>; + +defm : avx512_icmp_packed_lowering<v32i8x_info, v64i1_info, X86pcmpeqm, + "VPCMPEQBZ256", [HasBWI, HasVLX]>; + +// VPCMPEQW - i16 +defm : avx512_icmp_packed_lowering<v8i16x_info, v16i1_info, X86pcmpeqm, + "VPCMPEQWZ128", [HasBWI, HasVLX]>; +defm : avx512_icmp_packed_lowering<v8i16x_info, v32i1_info, X86pcmpeqm, + "VPCMPEQWZ128", [HasBWI, HasVLX]>; +defm : avx512_icmp_packed_lowering<v8i16x_info, v64i1_info, X86pcmpeqm, + "VPCMPEQWZ128", [HasBWI, HasVLX]>; + +defm : avx512_icmp_packed_lowering<v16i16x_info, v32i1_info, X86pcmpeqm, + "VPCMPEQWZ256", [HasBWI, HasVLX]>; +defm : avx512_icmp_packed_lowering<v16i16x_info, v64i1_info, X86pcmpeqm, + "VPCMPEQWZ256", [HasBWI, HasVLX]>; + +defm : avx512_icmp_packed_lowering<v32i16_info, v64i1_info, X86pcmpeqm, + "VPCMPEQWZ", [HasBWI]>; + +// VPCMPEQD - i32 +defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v8i1_info, X86pcmpeqm, + "VPCMPEQDZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v16i1_info, X86pcmpeqm, + "VPCMPEQDZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v32i1_info, X86pcmpeqm, + "VPCMPEQDZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v64i1_info, X86pcmpeqm, + "VPCMPEQDZ128", [HasAVX512, HasVLX]>; + +defm : avx512_icmp_packed_rmb_lowering<v8i32x_info, v16i1_info, X86pcmpeqm, + "VPCMPEQDZ256", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v8i32x_info, v32i1_info, X86pcmpeqm, + "VPCMPEQDZ256", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v8i32x_info, v64i1_info, X86pcmpeqm, + "VPCMPEQDZ256", [HasAVX512, HasVLX]>; + +defm : avx512_icmp_packed_rmb_lowering<v16i32_info, v32i1_info, X86pcmpeqm, + "VPCMPEQDZ", [HasAVX512]>; +defm : avx512_icmp_packed_rmb_lowering<v16i32_info, v64i1_info, X86pcmpeqm, + "VPCMPEQDZ", [HasAVX512]>; + +// VPCMPEQQ - i64 +defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v4i1_info, X86pcmpeqm, + "VPCMPEQQZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v8i1_info, X86pcmpeqm, + "VPCMPEQQZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v16i1_info, X86pcmpeqm, + "VPCMPEQQZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v32i1_info, X86pcmpeqm, + "VPCMPEQQZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v64i1_info, X86pcmpeqm, + "VPCMPEQQZ128", [HasAVX512, HasVLX]>; + +defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v8i1_info, X86pcmpeqm, + "VPCMPEQQZ256", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v16i1_info, X86pcmpeqm, + "VPCMPEQQZ256", [HasAVX512, HasVLX]>; 
+defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v32i1_info, X86pcmpeqm, + "VPCMPEQQZ256", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v64i1_info, X86pcmpeqm, + "VPCMPEQQZ256", [HasAVX512, HasVLX]>; + +defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v16i1_info, X86pcmpeqm, + "VPCMPEQQZ", [HasAVX512]>; +defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v32i1_info, X86pcmpeqm, + "VPCMPEQQZ", [HasAVX512]>; +defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v64i1_info, X86pcmpeqm, + "VPCMPEQQZ", [HasAVX512]>; + +// VPCMPGTB - i8 +defm : avx512_icmp_packed_lowering<v16i8x_info, v32i1_info, X86pcmpgtm, + "VPCMPGTBZ128", [HasBWI, HasVLX]>; +defm : avx512_icmp_packed_lowering<v16i8x_info, v64i1_info, X86pcmpgtm, + "VPCMPGTBZ128", [HasBWI, HasVLX]>; + +defm : avx512_icmp_packed_lowering<v32i8x_info, v64i1_info, X86pcmpgtm, + "VPCMPGTBZ256", [HasBWI, HasVLX]>; + +// VPCMPGTW - i16 +defm : avx512_icmp_packed_lowering<v8i16x_info, v16i1_info, X86pcmpgtm, + "VPCMPGTWZ128", [HasBWI, HasVLX]>; +defm : avx512_icmp_packed_lowering<v8i16x_info, v32i1_info, X86pcmpgtm, + "VPCMPGTWZ128", [HasBWI, HasVLX]>; +defm : avx512_icmp_packed_lowering<v8i16x_info, v64i1_info, X86pcmpgtm, + "VPCMPGTWZ128", [HasBWI, HasVLX]>; + +defm : avx512_icmp_packed_lowering<v16i16x_info, v32i1_info, X86pcmpgtm, + "VPCMPGTWZ256", [HasBWI, HasVLX]>; +defm : avx512_icmp_packed_lowering<v16i16x_info, v64i1_info, X86pcmpgtm, + "VPCMPGTWZ256", [HasBWI, HasVLX]>; + +defm : avx512_icmp_packed_lowering<v32i16_info, v64i1_info, X86pcmpgtm, + "VPCMPGTWZ", [HasBWI]>; + +// VPCMPGTD - i32 +defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v8i1_info, X86pcmpgtm, + "VPCMPGTDZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v16i1_info, X86pcmpgtm, + "VPCMPGTDZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v32i1_info, X86pcmpgtm, + "VPCMPGTDZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v64i1_info, X86pcmpgtm, + "VPCMPGTDZ128", [HasAVX512, HasVLX]>; + +defm : avx512_icmp_packed_rmb_lowering<v8i32x_info, v16i1_info, X86pcmpgtm, + "VPCMPGTDZ256", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v8i32x_info, v32i1_info, X86pcmpgtm, + "VPCMPGTDZ256", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v8i32x_info, v64i1_info, X86pcmpgtm, + "VPCMPGTDZ256", [HasAVX512, HasVLX]>; + +defm : avx512_icmp_packed_rmb_lowering<v16i32_info, v32i1_info, X86pcmpgtm, + "VPCMPGTDZ", [HasAVX512]>; +defm : avx512_icmp_packed_rmb_lowering<v16i32_info, v64i1_info, X86pcmpgtm, + "VPCMPGTDZ", [HasAVX512]>; + +// VPCMPGTQ - i64 +defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v4i1_info, X86pcmpgtm, + "VPCMPGTQZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v8i1_info, X86pcmpgtm, + "VPCMPGTQZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v16i1_info, X86pcmpgtm, + "VPCMPGTQZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v32i1_info, X86pcmpgtm, + "VPCMPGTQZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v64i1_info, X86pcmpgtm, + "VPCMPGTQZ128", [HasAVX512, HasVLX]>; + +defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v8i1_info, X86pcmpgtm, + "VPCMPGTQZ256", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v16i1_info, X86pcmpgtm, + "VPCMPGTQZ256", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v32i1_info, X86pcmpgtm, + 
"VPCMPGTQZ256", [HasAVX512, HasVLX]>; +defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v64i1_info, X86pcmpgtm, + "VPCMPGTQZ256", [HasAVX512, HasVLX]>; + +defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v16i1_info, X86pcmpgtm, + "VPCMPGTQZ", [HasAVX512]>; +defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v32i1_info, X86pcmpgtm, + "VPCMPGTQZ", [HasAVX512]>; +defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v64i1_info, X86pcmpgtm, + "VPCMPGTQZ", [HasAVX512]>; multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode, X86VectorVTInfo _> { @@ -1908,6 +2122,237 @@ defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86cmpm, avx512vl_i64_info, defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86cmpmu, avx512vl_i64_info, HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>; +multiclass avx512_icmp_cc_packed_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf, + SDNode OpNode, string InstrStr, + list<Predicate> Preds> { +let Predicates = Preds in { + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + imm:$cc)), + (i64 0)), + (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rri) _.RC:$src1, + _.RC:$src2, + imm:$cc), + NewInf.KRC)>; + + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2))), + imm:$cc)), + (i64 0)), + (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmi) _.RC:$src1, + addr:$src2, + imm:$cc), + NewInf.KRC)>; + + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (and _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + imm:$cc))), + (i64 0)), + (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rrik) _.KRCWM:$mask, + _.RC:$src1, + _.RC:$src2, + imm:$cc), + NewInf.KRC)>; + + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (and (_.KVT _.KRCWM:$mask), + (_.KVT (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert + (_.LdFrag addr:$src2))), + imm:$cc)))), + (i64 0)), + (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmik) _.KRCWM:$mask, + _.RC:$src1, + addr:$src2, + imm:$cc), + NewInf.KRC)>; +} +} + +multiclass avx512_icmp_cc_packed_rmb_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf, + SDNode OpNode, string InstrStr, + list<Predicate> Preds> + : avx512_icmp_cc_packed_lowering<_, NewInf, OpNode, InstrStr, Preds> { +let Predicates = Preds in { + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (OpNode (_.VT _.RC:$src1), + (X86VBroadcast (_.ScalarLdFrag addr:$src2)), + imm:$cc)), + (i64 0)), + (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmib) _.RC:$src1, + addr:$src2, + imm:$cc), + NewInf.KRC)>; + + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (and (_.KVT _.KRCWM:$mask), + (_.KVT (OpNode (_.VT _.RC:$src1), + (X86VBroadcast + (_.ScalarLdFrag addr:$src2)), + imm:$cc)))), + (i64 0)), + (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmibk) _.KRCWM:$mask, + _.RC:$src1, + addr:$src2, + imm:$cc), + NewInf.KRC)>; +} +} + +// VPCMPB - i8 +defm : avx512_icmp_cc_packed_lowering<v16i8x_info, v32i1_info, X86cmpm, + "VPCMPBZ128", [HasBWI, HasVLX]>; +defm : avx512_icmp_cc_packed_lowering<v16i8x_info, v64i1_info, X86cmpm, + "VPCMPBZ128", [HasBWI, HasVLX]>; + +defm : avx512_icmp_cc_packed_lowering<v32i8x_info, v64i1_info, X86cmpm, + "VPCMPBZ256", [HasBWI, HasVLX]>; + +// VPCMPW - i16 +defm : avx512_icmp_cc_packed_lowering<v8i16x_info, v16i1_info, X86cmpm, + "VPCMPWZ128", [HasBWI, HasVLX]>; +defm : avx512_icmp_cc_packed_lowering<v8i16x_info, v32i1_info, X86cmpm, + "VPCMPWZ128", [HasBWI, 
HasVLX]>; +defm : avx512_icmp_cc_packed_lowering<v8i16x_info, v64i1_info, X86cmpm, + "VPCMPWZ128", [HasBWI, HasVLX]>; + +defm : avx512_icmp_cc_packed_lowering<v16i16x_info, v32i1_info, X86cmpm, + "VPCMPWZ256", [HasBWI, HasVLX]>; +defm : avx512_icmp_cc_packed_lowering<v16i16x_info, v64i1_info, X86cmpm, + "VPCMPWZ256", [HasBWI, HasVLX]>; + +defm : avx512_icmp_cc_packed_lowering<v32i16_info, v64i1_info, X86cmpm, + "VPCMPWZ", [HasBWI]>; + +// VPCMPD - i32 +defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v8i1_info, X86cmpm, + "VPCMPDZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v16i1_info, X86cmpm, + "VPCMPDZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v32i1_info, X86cmpm, + "VPCMPDZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v64i1_info, X86cmpm, + "VPCMPDZ128", [HasAVX512, HasVLX]>; + +defm : avx512_icmp_cc_packed_rmb_lowering<v8i32x_info, v16i1_info, X86cmpm, + "VPCMPDZ256", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v8i32x_info, v32i1_info, X86cmpm, + "VPCMPDZ256", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v8i32x_info, v64i1_info, X86cmpm, + "VPCMPDZ256", [HasAVX512, HasVLX]>; + +defm : avx512_icmp_cc_packed_rmb_lowering<v16i32_info, v32i1_info, X86cmpm, + "VPCMPDZ", [HasAVX512]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v16i32_info, v64i1_info, X86cmpm, + "VPCMPDZ", [HasAVX512]>; + +// VPCMPQ - i64 +defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v4i1_info, X86cmpm, + "VPCMPQZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v8i1_info, X86cmpm, + "VPCMPQZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v16i1_info, X86cmpm, + "VPCMPQZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v32i1_info, X86cmpm, + "VPCMPQZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v64i1_info, X86cmpm, + "VPCMPQZ128", [HasAVX512, HasVLX]>; + +defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v8i1_info, X86cmpm, + "VPCMPQZ256", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v16i1_info, X86cmpm, + "VPCMPQZ256", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v32i1_info, X86cmpm, + "VPCMPQZ256", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v64i1_info, X86cmpm, + "VPCMPQZ256", [HasAVX512, HasVLX]>; + +defm : avx512_icmp_cc_packed_rmb_lowering<v8i64_info, v16i1_info, X86cmpm, + "VPCMPQZ", [HasAVX512]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v8i64_info, v32i1_info, X86cmpm, + "VPCMPQZ", [HasAVX512]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v8i64_info, v64i1_info, X86cmpm, + "VPCMPQZ", [HasAVX512]>; + +// VPCMPUB - i8 +defm : avx512_icmp_cc_packed_lowering<v16i8x_info, v32i1_info, X86cmpmu, + "VPCMPUBZ128", [HasBWI, HasVLX]>; +defm : avx512_icmp_cc_packed_lowering<v16i8x_info, v64i1_info, X86cmpmu, + "VPCMPUBZ128", [HasBWI, HasVLX]>; + +defm : avx512_icmp_cc_packed_lowering<v32i8x_info, v64i1_info, X86cmpmu, + "VPCMPUBZ256", [HasBWI, HasVLX]>; + +// VPCMPUW - i16 +defm : avx512_icmp_cc_packed_lowering<v8i16x_info, v16i1_info, X86cmpmu, + "VPCMPUWZ128", [HasBWI, HasVLX]>; +defm : avx512_icmp_cc_packed_lowering<v8i16x_info, v32i1_info, X86cmpmu, + "VPCMPUWZ128", [HasBWI, HasVLX]>; +defm : avx512_icmp_cc_packed_lowering<v8i16x_info, v64i1_info, X86cmpmu, + "VPCMPUWZ128", [HasBWI, HasVLX]>; + +defm : 
avx512_icmp_cc_packed_lowering<v16i16x_info, v32i1_info, X86cmpmu, + "VPCMPUWZ256", [HasBWI, HasVLX]>; +defm : avx512_icmp_cc_packed_lowering<v16i16x_info, v64i1_info, X86cmpmu, + "VPCMPUWZ256", [HasBWI, HasVLX]>; + +defm : avx512_icmp_cc_packed_lowering<v32i16_info, v64i1_info, X86cmpmu, + "VPCMPUWZ", [HasBWI]>; + +// VPCMPUD - i32 +defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v8i1_info, X86cmpmu, + "VPCMPUDZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v16i1_info, X86cmpmu, + "VPCMPUDZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v32i1_info, X86cmpmu, + "VPCMPUDZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v64i1_info, X86cmpmu, + "VPCMPUDZ128", [HasAVX512, HasVLX]>; + +defm : avx512_icmp_cc_packed_rmb_lowering<v8i32x_info, v16i1_info, X86cmpmu, + "VPCMPUDZ256", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v8i32x_info, v32i1_info, X86cmpmu, + "VPCMPUDZ256", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v8i32x_info, v64i1_info, X86cmpmu, + "VPCMPUDZ256", [HasAVX512, HasVLX]>; + +defm : avx512_icmp_cc_packed_rmb_lowering<v16i32_info, v32i1_info, X86cmpmu, + "VPCMPUDZ", [HasAVX512]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v16i32_info, v64i1_info, X86cmpmu, + "VPCMPUDZ", [HasAVX512]>; + +// VPCMPUQ - i64 +defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v4i1_info, X86cmpmu, + "VPCMPUQZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v8i1_info, X86cmpmu, + "VPCMPUQZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v16i1_info, X86cmpmu, + "VPCMPUQZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v32i1_info, X86cmpmu, + "VPCMPUQZ128", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v64i1_info, X86cmpmu, + "VPCMPUQZ128", [HasAVX512, HasVLX]>; + +defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v8i1_info, X86cmpmu, + "VPCMPUQZ256", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v16i1_info, X86cmpmu, + "VPCMPUQZ256", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v32i1_info, X86cmpmu, + "VPCMPUQZ256", [HasAVX512, HasVLX]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v64i1_info, X86cmpmu, + "VPCMPUQZ256", [HasAVX512, HasVLX]>; + +defm : avx512_icmp_cc_packed_rmb_lowering<v8i64_info, v16i1_info, X86cmpmu, + "VPCMPUQZ", [HasAVX512]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v8i64_info, v32i1_info, X86cmpmu, + "VPCMPUQZ", [HasAVX512]>; +defm : avx512_icmp_cc_packed_rmb_lowering<v8i64_info, v64i1_info, X86cmpmu, + "VPCMPUQZ", [HasAVX512]>; + multiclass avx512_vcmp_common<X86VectorVTInfo _> { defm rri : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, @@ -1998,21 +2443,108 @@ defm VCMPPD : avx512_vcmp<avx512vl_f64_info>, defm VCMPPS : avx512_vcmp<avx512vl_f32_info>, AVX512PSIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>; -def : Pat<(v8i1 (X86cmpm (v8f32 VR256X:$src1), (v8f32 VR256X:$src2), imm:$cc)), - (COPY_TO_REGCLASS (VCMPPSZrri - (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), - (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)), - imm:$cc), VK8)>; -def : Pat<(v8i1 (X86cmpm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)), - (COPY_TO_REGCLASS (VPCMPDZrri - (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), - (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)), - imm:$cc), VK8)>; 
-def : Pat<(v8i1 (X86cmpmu (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)), - (COPY_TO_REGCLASS (VPCMPUDZrri - (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), - (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)), - imm:$cc), VK8)>; +multiclass avx512_fcmp_cc_packed_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf, + string InstrStr, list<Predicate> Preds> { +let Predicates = Preds in { + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (X86cmpm (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + imm:$cc)), + (i64 0)), + (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rri) _.RC:$src1, + _.RC:$src2, + imm:$cc), + NewInf.KRC)>; + + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (X86cmpm (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2))), + imm:$cc)), + (i64 0)), + (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmi) _.RC:$src1, + addr:$src2, + imm:$cc), + NewInf.KRC)>; + + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (X86cmpm (_.VT _.RC:$src1), + (X86VBroadcast (_.ScalarLdFrag addr:$src2)), + imm:$cc)), + (i64 0)), + (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmbi) _.RC:$src1, + addr:$src2, + imm:$cc), + NewInf.KRC)>; +} +} + +multiclass avx512_fcmp_cc_packed_sae_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf, + string InstrStr, list<Predicate> Preds> + : avx512_fcmp_cc_packed_lowering<_, NewInf, InstrStr, Preds> { + +let Predicates = Preds in + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), + (_.KVT (X86cmpmRnd (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + imm:$cc, + (i32 FROUND_NO_EXC))), + (i64 0)), + (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rrib) _.RC:$src1, + _.RC:$src2, + imm:$cc), + NewInf.KRC)>; +} + + +// VCMPPS - f32 +defm : avx512_fcmp_cc_packed_lowering<v4f32x_info, v8i1_info, "VCMPPSZ128", + [HasAVX512, HasVLX]>; +defm : avx512_fcmp_cc_packed_lowering<v4f32x_info, v16i1_info, "VCMPPSZ128", + [HasAVX512, HasVLX]>; +defm : avx512_fcmp_cc_packed_lowering<v4f32x_info, v32i1_info, "VCMPPSZ128", + [HasAVX512, HasVLX]>; +defm : avx512_fcmp_cc_packed_lowering<v4f32x_info, v64i1_info, "VCMPPSZ128", + [HasAVX512, HasVLX]>; + +defm : avx512_fcmp_cc_packed_lowering<v8f32x_info, v16i1_info, "VCMPPSZ256", + [HasAVX512, HasVLX]>; +defm : avx512_fcmp_cc_packed_lowering<v8f32x_info, v32i1_info, "VCMPPSZ256", + [HasAVX512, HasVLX]>; +defm : avx512_fcmp_cc_packed_lowering<v8f32x_info, v64i1_info, "VCMPPSZ256", + [HasAVX512, HasVLX]>; + +defm : avx512_fcmp_cc_packed_sae_lowering<v16f32_info, v32i1_info, "VCMPPSZ", + [HasAVX512]>; +defm : avx512_fcmp_cc_packed_sae_lowering<v16f32_info, v64i1_info, "VCMPPSZ", + [HasAVX512]>; + +// VCMPPD - f64 +defm : avx512_fcmp_cc_packed_lowering<v2f64x_info, v4i1_info, "VCMPPDZ128", + [HasAVX512, HasVLX]>; +defm : avx512_fcmp_cc_packed_lowering<v2f64x_info, v8i1_info, "VCMPPDZ128", + [HasAVX512, HasVLX]>; +defm : avx512_fcmp_cc_packed_lowering<v2f64x_info, v16i1_info, "VCMPPDZ128", + [HasAVX512, HasVLX]>; +defm : avx512_fcmp_cc_packed_lowering<v2f64x_info, v32i1_info, "VCMPPDZ128", + [HasAVX512, HasVLX]>; +defm : avx512_fcmp_cc_packed_lowering<v2f64x_info, v64i1_info, "VCMPPDZ128", + [HasAVX512, HasVLX]>; + +defm : avx512_fcmp_cc_packed_lowering<v4f64x_info, v8i1_info, "VCMPPDZ256", + [HasAVX512, HasVLX]>; +defm : avx512_fcmp_cc_packed_lowering<v4f64x_info, v16i1_info, "VCMPPDZ256", + [HasAVX512, HasVLX]>; +defm : avx512_fcmp_cc_packed_lowering<v4f64x_info, v32i1_info, "VCMPPDZ256", + [HasAVX512, HasVLX]>; +defm : 
avx512_fcmp_cc_packed_lowering<v4f64x_info, v64i1_info, "VCMPPDZ256", + [HasAVX512, HasVLX]>; + +defm : avx512_fcmp_cc_packed_sae_lowering<v8f64_info, v16i1_info, "VCMPPDZ", + [HasAVX512]>; +defm : avx512_fcmp_cc_packed_sae_lowering<v8f64_info, v32i1_info, "VCMPPDZ", + [HasAVX512]>; +defm : avx512_fcmp_cc_packed_sae_lowering<v8f64_info, v64i1_info, "VCMPPDZ", + [HasAVX512]>; // ---------------------------------------------------------------- // FPClass @@ -2498,6 +3030,69 @@ multiclass avx512_mask_shiftop_w<bits<8> opc1, bits<8> opc2, string OpcodeStr, defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86kshiftl>; defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86kshiftr>; +multiclass axv512_icmp_packed_no_vlx_lowering<SDNode OpNode, string InstStr> { +def : Pat<(v8i1 (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), + (COPY_TO_REGCLASS (!cast<Instruction>(InstStr##Zrr) + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), VK8)>; + +def : Pat<(insert_subvector (v16i1 immAllZerosV), + (v8i1 (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), + (i64 0)), + (KSHIFTRWri (KSHIFTLWri (!cast<Instruction>(InstStr##Zrr) + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), + (i8 8)), (i8 8))>; + +def : Pat<(insert_subvector (v16i1 immAllZerosV), + (v8i1 (and VK8:$mask, + (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2)))), + (i64 0)), + (KSHIFTRWri (KSHIFTLWri (!cast<Instruction>(InstStr##Zrrk) + (COPY_TO_REGCLASS VK8:$mask, VK16), + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), + (i8 8)), (i8 8))>; +} + +multiclass axv512_icmp_packed_cc_no_vlx_lowering<SDNode OpNode, string InstStr, + AVX512VLVectorVTInfo _> { +def : Pat<(v8i1 (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2), imm:$cc)), + (COPY_TO_REGCLASS (!cast<Instruction>(InstStr##Zrri) + (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)), + imm:$cc), VK8)>; + +def : Pat<(insert_subvector (v16i1 immAllZerosV), + (v8i1 (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2), imm:$cc)), + (i64 0)), + (KSHIFTRWri (KSHIFTLWri (!cast<Instruction>(InstStr##Zrri) + (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)), + imm:$cc), + (i8 8)), (i8 8))>; + +def : Pat<(insert_subvector (v16i1 immAllZerosV), + (v8i1 (and VK8:$mask, + (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2), imm:$cc))), + (i64 0)), + (KSHIFTRWri (KSHIFTLWri (!cast<Instruction>(InstStr##Zrrik) + (COPY_TO_REGCLASS VK8:$mask, VK16), + (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)), + imm:$cc), + (i8 8)), (i8 8))>; +} + +let Predicates = [HasAVX512, NoVLX] in { + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTD">; + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm, "VPCMPEQD">; + + defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpm, "VCMPPS", avx512vl_f32_info>; + defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpm, "VPCMPD", avx512vl_i32_info>; + defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpmu, "VPCMPUD", avx512vl_i32_info>; +} + // Mask setting all 0s or 1s multiclass 
avx512_mask_setop<RegisterClass KRC, ValueType VT, PatFrag Val> { let Predicates = [HasAVX512] in diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp index f98c2a7e802d..e34a90e975b8 100644 --- a/lib/Target/X86/X86InstructionSelector.cpp +++ b/lib/Target/X86/X86InstructionSelector.cpp @@ -75,6 +75,8 @@ private: bool selectUadde(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const; bool selectCopy(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI, + MachineFunction &MF) const; bool selectInsert(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const; bool selectExtract(MachineInstr &I, MachineRegisterInfo &MRI, @@ -270,6 +272,8 @@ bool X86InstructionSelector::select(MachineInstr &I) const { return true; if (selectUadde(I, MRI, MF)) return true; + if (selectMergeValues(I, MRI, MF)) + return true; if (selectExtract(I, MRI, MF)) return true; if (selectInsert(I, MRI, MF)) @@ -914,6 +918,55 @@ bool X86InstructionSelector::selectInsert(MachineInstr &I, return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } +bool X86InstructionSelector::selectMergeValues(MachineInstr &I, + MachineRegisterInfo &MRI, + MachineFunction &MF) const { + if (I.getOpcode() != TargetOpcode::G_MERGE_VALUES) + return false; + + // Split to inserts. + unsigned DstReg = I.getOperand(0).getReg(); + unsigned SrcReg0 = I.getOperand(1).getReg(); + + const LLT DstTy = MRI.getType(DstReg); + const LLT SrcTy = MRI.getType(SrcReg0); + unsigned SrcSize = SrcTy.getSizeInBits(); + + const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI); + + // For the first src use insertSubReg. + unsigned DefReg = MRI.createGenericVirtualRegister(DstTy); + MRI.setRegBank(DefReg, RegBank); + if (!emitInsertSubreg(DefReg, I.getOperand(1).getReg(), I, MRI, MF)) + return false; + + for (unsigned Idx = 2; Idx < I.getNumOperands(); ++Idx) { + + unsigned Tmp = MRI.createGenericVirtualRegister(DstTy); + MRI.setRegBank(Tmp, RegBank); + + MachineInstr &InsertInst = *BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII.get(TargetOpcode::G_INSERT), Tmp) + .addReg(DefReg) + .addReg(I.getOperand(Idx).getReg()) + .addImm((Idx - 1) * SrcSize); + + DefReg = Tmp; + + if (!select(InsertInst)) + return false; + } + + MachineInstr &CopyInst = *BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII.get(TargetOpcode::COPY), DstReg) + .addReg(DefReg); + + if (!select(CopyInst)) + return false; + + I.eraseFromParent(); + return true; +} InstructionSelector * llvm::createX86InstructionSelector(const X86TargetMachine &TM, X86Subtarget &Subtarget, diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp index a584eabcc1b2..a5fa3340c3f1 100644 --- a/lib/Target/X86/X86LegalizerInfo.cpp +++ b/lib/Target/X86/X86LegalizerInfo.cpp @@ -56,7 +56,7 @@ void X86LegalizerInfo::setLegalizerInfo32bit() { const LLT s32 = LLT::scalar(32); const LLT s64 = LLT::scalar(64); - for (unsigned BinOp : {G_ADD, G_SUB, G_MUL}) + for (unsigned BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR}) for (auto Ty : {s8, s16, s32}) setAction({BinOp, Ty}, Legal); @@ -117,7 +117,7 @@ void X86LegalizerInfo::setLegalizerInfo64bit() { const LLT s32 = LLT::scalar(32); const LLT s64 = LLT::scalar(64); - for (unsigned BinOp : {G_ADD, G_SUB, G_MUL}) + for (unsigned BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR}) for (auto Ty : {s8, s16, s32, s64}) setAction({BinOp, Ty}, Legal); @@ -228,10 +228,14 @@ void 
X86LegalizerInfo::setLegalizerInfoAVX() { for (auto Ty : {v8s32, v4s64}) setAction({MemOp, Ty}, Legal); - for (auto Ty : {v32s8, v16s16, v8s32, v4s64}) + for (auto Ty : {v32s8, v16s16, v8s32, v4s64}) { setAction({G_INSERT, Ty}, Legal); - for (auto Ty : {v16s8, v8s16, v4s32, v2s64}) + setAction({G_EXTRACT, 1, Ty}, Legal); + } + for (auto Ty : {v16s8, v8s16, v4s32, v2s64}) { setAction({G_INSERT, 1, Ty}, Legal); + setAction({G_EXTRACT, Ty}, Legal); + } } void X86LegalizerInfo::setLegalizerInfoAVX2() { @@ -280,10 +284,14 @@ void X86LegalizerInfo::setLegalizerInfoAVX512() { for (auto Ty : {v16s32, v8s64}) setAction({MemOp, Ty}, Legal); - for (auto Ty : {v64s8, v32s16, v16s32, v8s64}) + for (auto Ty : {v64s8, v32s16, v16s32, v8s64}) { setAction({G_INSERT, Ty}, Legal); - for (auto Ty : {v32s8, v16s16, v8s32, v4s64, v16s8, v8s16, v4s32, v2s64}) + setAction({G_EXTRACT, 1, Ty}, Legal); + } + for (auto Ty : {v32s8, v16s16, v8s32, v4s64, v16s8, v8s16, v4s32, v2s64}) { setAction({G_INSERT, 1, Ty}, Legal); + setAction({G_EXTRACT, Ty}, Legal); + } /************ VLX *******************/ if (!Subtarget.hasVLX()) diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index e36a47506ba0..24845beac22d 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -11,10 +11,23 @@ // //===----------------------------------------------------------------------===// +#include "X86.h" + +#ifdef LLVM_BUILD_GLOBAL_ISEL +#include "X86CallLowering.h" +#include "X86LegalizerInfo.h" +#include "X86RegisterBankInfo.h" +#endif #include "X86Subtarget.h" #include "MCTargetDesc/X86BaseInfo.h" #include "X86TargetMachine.h" #include "llvm/ADT/Triple.h" +#ifdef LLVM_BUILD_GLOBAL_ISEL +#include "llvm/CodeGen/GlobalISel/CallLowering.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelect.h" +#include "llvm/CodeGen/GlobalISel/Legalizer.h" +#include "llvm/CodeGen/GlobalISel/RegBankSelect.h" +#endif #include "llvm/IR/Attributes.h" #include "llvm/IR/ConstantRange.h" #include "llvm/IR/Function.h" @@ -336,6 +349,35 @@ X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU, return *this; } +#ifdef LLVM_BUILD_GLOBAL_ISEL +namespace { + +struct X86GISelActualAccessor : public GISelAccessor { + std::unique_ptr<CallLowering> CallLoweringInfo; + std::unique_ptr<LegalizerInfo> Legalizer; + std::unique_ptr<RegisterBankInfo> RegBankInfo; + std::unique_ptr<InstructionSelector> InstSelector; + + const CallLowering *getCallLowering() const override { + return CallLoweringInfo.get(); + } + + const InstructionSelector *getInstructionSelector() const override { + return InstSelector.get(); + } + + const LegalizerInfo *getLegalizerInfo() const override { + return Legalizer.get(); + } + + const RegisterBankInfo *getRegBankInfo() const override { + return RegBankInfo.get(); + } +}; + +} // end anonymous namespace +#endif + X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS, const X86TargetMachine &TM, unsigned StackAlignOverride) @@ -360,6 +402,19 @@ X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS, setPICStyle(PICStyles::StubPIC); else if (isTargetELF()) setPICStyle(PICStyles::GOT); +#ifndef LLVM_BUILD_GLOBAL_ISEL + GISelAccessor *GISel = new GISelAccessor(); +#else + X86GISelActualAccessor *GISel = new X86GISelActualAccessor(); + + GISel->CallLoweringInfo.reset(new X86CallLowering(*getTargetLowering())); + GISel->Legalizer.reset(new X86LegalizerInfo(*this, TM)); + + auto *RBI = new X86RegisterBankInfo(*getRegisterInfo()); + 
GISel->RegBankInfo.reset(RBI); + GISel->InstSelector.reset(createX86InstructionSelector(TM, *this, *RBI)); +#endif + setGISelAccessor(*GISel); } const CallLowering *X86Subtarget::getCallLowering() const { diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 550e95c39ab5..fa0afe29586b 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -58,7 +58,7 @@ protected: }; enum X86ProcFamilyEnum { - Others, IntelAtom, IntelSLM + Others, IntelAtom, IntelSLM, IntelGLM }; /// X86 processor family: Intel Atom, and others diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index a9f42cacf788..8d891c983fab 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -15,9 +15,6 @@ #include "X86.h" #include "X86CallLowering.h" #include "X86LegalizerInfo.h" -#ifdef LLVM_BUILD_GLOBAL_ISEL -#include "X86RegisterBankInfo.h" -#endif #include "X86MacroFusion.h" #include "X86Subtarget.h" #include "X86TargetMachine.h" @@ -31,7 +28,6 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/ExecutionDepsFix.h" #include "llvm/CodeGen/GlobalISel/CallLowering.h" -#include "llvm/CodeGen/GlobalISel/GISelAccessor.h" #include "llvm/CodeGen/GlobalISel/IRTranslator.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" #include "llvm/CodeGen/GlobalISel/Legalizer.h" @@ -212,35 +208,6 @@ X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT, X86TargetMachine::~X86TargetMachine() = default; -#ifdef LLVM_BUILD_GLOBAL_ISEL -namespace { - -struct X86GISelActualAccessor : public GISelAccessor { - std::unique_ptr<CallLowering> CallLoweringInfo; - std::unique_ptr<LegalizerInfo> Legalizer; - std::unique_ptr<RegisterBankInfo> RegBankInfo; - std::unique_ptr<InstructionSelector> InstSelector; - - const CallLowering *getCallLowering() const override { - return CallLoweringInfo.get(); - } - - const InstructionSelector *getInstructionSelector() const override { - return InstSelector.get(); - } - - const LegalizerInfo *getLegalizerInfo() const override { - return Legalizer.get(); - } - - const RegisterBankInfo *getRegBankInfo() const override { - return RegBankInfo.get(); - } -}; - -} // end anonymous namespace -#endif - const X86Subtarget * X86TargetMachine::getSubtargetImpl(const Function &F) const { Attribute CPUAttr = F.getFnAttribute("target-cpu"); @@ -280,20 +247,6 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const { resetTargetOptions(F); I = llvm::make_unique<X86Subtarget>(TargetTriple, CPU, FS, *this, Options.StackAlignmentOverride); -#ifndef LLVM_BUILD_GLOBAL_ISEL - GISelAccessor *GISel = new GISelAccessor(); -#else - X86GISelActualAccessor *GISel = new X86GISelActualAccessor(); - - GISel->CallLoweringInfo.reset(new X86CallLowering(*I->getTargetLowering())); - GISel->Legalizer.reset(new X86LegalizerInfo(*I, *this)); - - auto *RBI = new X86RegisterBankInfo(*I->getRegisterInfo()); - GISel->RegBankInfo.reset(RBI); - GISel->InstSelector.reset(createX86InstructionSelector( - *this, *I, *RBI)); -#endif - I->setGISelAccessor(*GISel); } return I.get(); } diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h index 1bf267d34ec2..aaa6d58bd134 100644 --- a/lib/Target/X86/X86TargetMachine.h +++ b/lib/Target/X86/X86TargetMachine.h @@ -40,6 +40,7 @@ public: ~X86TargetMachine() override; const X86Subtarget *getSubtargetImpl(const Function &F) const override; + const X86Subtarget *getSubtargetImpl() const = delete; TargetIRAnalysis 
getTargetIRAnalysis() override; diff --git a/lib/Transforms/Coroutines/CoroInstr.h b/lib/Transforms/Coroutines/CoroInstr.h index 5c666bdfea1f..9a8cc5a2591c 100644 --- a/lib/Transforms/Coroutines/CoroInstr.h +++ b/lib/Transforms/Coroutines/CoroInstr.h @@ -58,10 +58,10 @@ public: } // Methods to support type inquiry through isa, cast, and dyn_cast: - static inline bool classof(const IntrinsicInst *I) { + static bool classof(const IntrinsicInst *I) { return I->getIntrinsicID() == Intrinsic::coro_subfn_addr; } - static inline bool classof(const Value *V) { + static bool classof(const Value *V) { return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); } }; @@ -70,10 +70,10 @@ public: class LLVM_LIBRARY_VISIBILITY CoroAllocInst : public IntrinsicInst { public: // Methods to support type inquiry through isa, cast, and dyn_cast: - static inline bool classof(const IntrinsicInst *I) { + static bool classof(const IntrinsicInst *I) { return I->getIntrinsicID() == Intrinsic::coro_alloc; } - static inline bool classof(const Value *V) { + static bool classof(const Value *V) { return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); } }; @@ -175,10 +175,10 @@ public: } // Methods to support type inquiry through isa, cast, and dyn_cast: - static inline bool classof(const IntrinsicInst *I) { + static bool classof(const IntrinsicInst *I) { return I->getIntrinsicID() == Intrinsic::coro_id; } - static inline bool classof(const Value *V) { + static bool classof(const Value *V) { return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); } }; @@ -187,10 +187,10 @@ public: class LLVM_LIBRARY_VISIBILITY CoroFrameInst : public IntrinsicInst { public: // Methods to support type inquiry through isa, cast, and dyn_cast: - static inline bool classof(const IntrinsicInst *I) { + static bool classof(const IntrinsicInst *I) { return I->getIntrinsicID() == Intrinsic::coro_frame; } - static inline bool classof(const Value *V) { + static bool classof(const Value *V) { return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); } }; @@ -203,10 +203,10 @@ public: Value *getFrame() const { return getArgOperand(FrameArg); } // Methods to support type inquiry through isa, cast, and dyn_cast: - static inline bool classof(const IntrinsicInst *I) { + static bool classof(const IntrinsicInst *I) { return I->getIntrinsicID() == Intrinsic::coro_free; } - static inline bool classof(const Value *V) { + static bool classof(const Value *V) { return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); } }; @@ -221,10 +221,10 @@ public: Value *getMem() const { return getArgOperand(MemArg); } // Methods for support type inquiry through isa, cast, and dyn_cast: - static inline bool classof(const IntrinsicInst *I) { + static bool classof(const IntrinsicInst *I) { return I->getIntrinsicID() == Intrinsic::coro_begin; } - static inline bool classof(const Value *V) { + static bool classof(const Value *V) { return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); } }; @@ -233,10 +233,10 @@ public: class LLVM_LIBRARY_VISIBILITY CoroSaveInst : public IntrinsicInst { public: // Methods to support type inquiry through isa, cast, and dyn_cast: - static inline bool classof(const IntrinsicInst *I) { + static bool classof(const IntrinsicInst *I) { return I->getIntrinsicID() == Intrinsic::coro_save; } - static inline bool classof(const Value *V) { + static bool classof(const Value *V) { return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); } }; @@ -254,10 +254,10 @@ public: } // Methods to support type 
inquiry through isa, cast, and dyn_cast: - static inline bool classof(const IntrinsicInst *I) { + static bool classof(const IntrinsicInst *I) { return I->getIntrinsicID() == Intrinsic::coro_promise; } - static inline bool classof(const Value *V) { + static bool classof(const Value *V) { return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); } }; @@ -279,10 +279,10 @@ public: } // Methods to support type inquiry through isa, cast, and dyn_cast: - static inline bool classof(const IntrinsicInst *I) { + static bool classof(const IntrinsicInst *I) { return I->getIntrinsicID() == Intrinsic::coro_suspend; } - static inline bool classof(const Value *V) { + static bool classof(const Value *V) { return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); } }; @@ -291,10 +291,10 @@ public: class LLVM_LIBRARY_VISIBILITY CoroSizeInst : public IntrinsicInst { public: // Methods to support type inquiry through isa, cast, and dyn_cast: - static inline bool classof(const IntrinsicInst *I) { + static bool classof(const IntrinsicInst *I) { return I->getIntrinsicID() == Intrinsic::coro_size; } - static inline bool classof(const Value *V) { + static bool classof(const Value *V) { return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); } }; @@ -310,10 +310,10 @@ public: } // Methods to support type inquiry through isa, cast, and dyn_cast: - static inline bool classof(const IntrinsicInst *I) { + static bool classof(const IntrinsicInst *I) { return I->getIntrinsicID() == Intrinsic::coro_end; } - static inline bool classof(const Value *V) { + static bool classof(const Value *V) { return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); } }; diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index 087a8aa2c624..5b1b58b89c32 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -56,10 +56,6 @@ RunSLPVectorization("vectorize-slp", cl::Hidden, cl::desc("Run the SLP vectorization passes")); static cl::opt<bool> -RunBBVectorization("vectorize-slp-aggressive", cl::Hidden, - cl::desc("Run the BB vectorization passes")); - -static cl::opt<bool> UseGVNAfterVectorization("use-gvn-after-vectorization", cl::init(false), cl::Hidden, cl::desc("Run GVN instead of Early CSE after vectorization passes")); @@ -138,8 +134,8 @@ static cl::opt<int> PreInlineThreshold( "(default = 75)")); static cl::opt<bool> EnableEarlyCSEMemSSA( - "enable-earlycse-memssa", cl::init(false), cl::Hidden, - cl::desc("Enable the EarlyCSE w/ MemorySSA pass (default = off)")); + "enable-earlycse-memssa", cl::init(true), cl::Hidden, + cl::desc("Enable the EarlyCSE w/ MemorySSA pass (default = on)")); static cl::opt<bool> EnableGVNHoist( "enable-gvn-hoist", cl::init(false), cl::Hidden, @@ -166,7 +162,6 @@ PassManagerBuilder::PassManagerBuilder() { Inliner = nullptr; DisableUnitAtATime = false; DisableUnrollLoops = false; - BBVectorize = RunBBVectorization; SLPVectorize = RunSLPVectorization; LoopVectorize = RunLoopVectorization; RerollLoops = RunLoopRerolling; @@ -263,11 +258,12 @@ void PassManagerBuilder::populateFunctionPassManager( // Do PGO instrumentation generation or use pass as the option specified. void PassManagerBuilder::addPGOInstrPasses(legacy::PassManagerBase &MPM) { - if (!EnablePGOInstrGen && PGOInstrUse.empty()) + if (!EnablePGOInstrGen && PGOInstrUse.empty() && PGOSampleUse.empty()) return; // Perform the preinline and cleanup passes for O1 and above. // And avoid doing them if optimizing for size. 
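  // A condensed restatement of the gate below, as a sketch for clarity (the
  // sample-PGO rationale is an assumption, not stated here): pre-inlining runs
  // only for instrumentation PGO, at -O1 and above, and not when optimizing
  // for size; with sample PGO it is skipped, presumably so the code keeps the
  // shape the sample profile was collected against.
  //
  //   bool RunPreInline = OptLevel > 0 && SizeLevel == 0 &&
  //                       !DisablePreInliner && PGOSampleUse.empty();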
- if (OptLevel > 0 && SizeLevel == 0 && !DisablePreInliner) { + if (OptLevel > 0 && SizeLevel == 0 && !DisablePreInliner && + PGOSampleUse.empty()) { // Create preinline pass. We construct an InlineParams object and specify // the threshold here to avoid the command line options of the regular // inliner to influence pre-inlining. The only fields of InlineParams we @@ -383,26 +379,8 @@ void PassManagerBuilder::addFunctionSimplificationPasses( if (RerollLoops) MPM.add(createLoopRerollPass()); - if (!RunSLPAfterLoopVectorization) { - if (SLPVectorize) - MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains. - - if (BBVectorize) { - MPM.add(createBBVectorizePass()); - addInstructionCombiningPass(MPM); - addExtensionsToPM(EP_Peephole, MPM); - if (OptLevel > 1 && UseGVNAfterVectorization) - MPM.add(NewGVN - ? createNewGVNPass() - : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies - else - MPM.add(createEarlyCSEPass()); // Catch trivial redundancies - - // BBVectorize may have significantly shortened a loop body; unroll again. - if (!DisableUnrollLoops) - MPM.add(createLoopUnrollPass(OptLevel)); - } - } + if (!RunSLPAfterLoopVectorization && SLPVectorize) + MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains. MPM.add(createAggressiveDCEPass()); // Delete dead instructions MPM.add(createCFGSimplificationPass()); // Merge & remove BBs @@ -634,28 +612,10 @@ void PassManagerBuilder::populateModulePassManager( addInstructionCombiningPass(MPM); } - if (RunSLPAfterLoopVectorization) { - if (SLPVectorize) { - MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains. - if (OptLevel > 1 && ExtraVectorizerPasses) { - MPM.add(createEarlyCSEPass()); - } - } - - if (BBVectorize) { - MPM.add(createBBVectorizePass()); - addInstructionCombiningPass(MPM); - addExtensionsToPM(EP_Peephole, MPM); - if (OptLevel > 1 && UseGVNAfterVectorization) - MPM.add(NewGVN - ? createNewGVNPass() - : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies - else - MPM.add(createEarlyCSEPass()); // Catch trivial redundancies - - // BBVectorize may have significantly shortened a loop body; unroll again. - if (!DisableUnrollLoops) - MPM.add(createLoopUnrollPass(OptLevel)); + if (RunSLPAfterLoopVectorization && SLPVectorize) { + MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains. + if (OptLevel > 1 && ExtraVectorizerPasses) { + MPM.add(createEarlyCSEPass()); } } diff --git a/lib/Transforms/IPO/SampleProfile.cpp b/lib/Transforms/IPO/SampleProfile.cpp index 656421ee58df..ac4765f96075 100644 --- a/lib/Transforms/IPO/SampleProfile.cpp +++ b/lib/Transforms/IPO/SampleProfile.cpp @@ -1484,7 +1484,8 @@ bool SampleProfileLoader::runOnFunction(Function &F) { PreservedAnalyses SampleProfileLoaderPass::run(Module &M, ModuleAnalysisManager &AM) { - SampleProfileLoader SampleLoader(SampleProfileFile); + SampleProfileLoader SampleLoader( + ProfileFileName.empty() ? 
SampleProfileFile : ProfileFileName); SampleLoader.doInitialization(M); diff --git a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp index 802f470ffe1f..8d494fe9cde2 100644 --- a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp +++ b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp @@ -371,6 +371,7 @@ void splitAndWriteThinLTOBitcode( /*GenerateHash=*/true, &ModHash); W.writeModule(MergedM.get(), /*ShouldPreserveUseListOrder=*/false, &MergedMIndex); + W.writeSymtab(); W.writeStrtab(); OS << Buffer; @@ -385,6 +386,7 @@ void splitAndWriteThinLTOBitcode( /*GenerateHash=*/false, &ModHash); W2.writeModule(MergedM.get(), /*ShouldPreserveUseListOrder=*/false, &MergedMIndex); + W2.writeSymtab(); W2.writeStrtab(); *ThinLinkOS << Buffer; } diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index d3d8cefe9735..db98be2c98f5 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -2301,10 +2301,10 @@ static Instruction *foldXorToXor(BinaryOperator &I) { // (~B | A) ^ (~A | B) -> A ^ B // (~A | B) ^ (A | ~B) -> A ^ B // (B | ~A) ^ (A | ~B) -> A ^ B - if ((match(Op0, m_c_Or(m_Value(A), m_Not(m_Value(B)))) && - match(Op1, m_Or(m_Not(m_Specific(A)), m_Specific(B)))) || - (match(Op0, m_c_Or(m_Not(m_Value(A)), m_Value(B))) && - match(Op1, m_Or(m_Specific(A), m_Not(m_Specific(B)))))) { + if ((match(Op0, m_Or(m_Value(A), m_Not(m_Value(B)))) && + match(Op1, m_c_Or(m_Not(m_Specific(A)), m_Specific(B)))) || + (match(Op0, m_Or(m_Not(m_Value(A)), m_Value(B))) && + match(Op1, m_c_Or(m_Specific(A), m_Not(m_Specific(B)))))) { I.setOperand(0, A); I.setOperand(1, B); return &I; @@ -2314,10 +2314,10 @@ static Instruction *foldXorToXor(BinaryOperator &I) { // (~B & A) ^ (~A & B) -> A ^ B // (~A & B) ^ (A & ~B) -> A ^ B // (B & ~A) ^ (A & ~B) -> A ^ B - if ((match(Op0, m_c_And(m_Value(A), m_Not(m_Value(B)))) && - match(Op1, m_And(m_Not(m_Specific(A)), m_Specific(B)))) || - (match(Op0, m_c_And(m_Not(m_Value(A)), m_Value(B))) && - match(Op1, m_And(m_Specific(A), m_Not(m_Specific(B)))))) { + if ((match(Op0, m_And(m_Value(A), m_Not(m_Value(B)))) && + match(Op1, m_c_And(m_Not(m_Specific(A)), m_Specific(B)))) || + (match(Op0, m_And(m_Not(m_Value(A)), m_Value(B))) && + match(Op1, m_c_And(m_Specific(A), m_Not(m_Specific(B)))))) { I.setOperand(0, A); I.setOperand(1, B); return &I; @@ -2456,10 +2456,9 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { } } - // xor (cmp A, B), true = not (cmp A, B) = !cmp A, B + // not (cmp A, B) = !cmp A, B ICmpInst::Predicate Pred; - if (match(Op0, m_OneUse(m_Cmp(Pred, m_Value(), m_Value()))) && - match(Op1, m_AllOnes())) { + if (match(&I, m_Not(m_OneUse(m_Cmp(Pred, m_Value(), m_Value()))))) { cast<CmpInst>(Op0)->setPredicate(CmpInst::getInversePredicate(Pred)); return replaceInstUsesWith(I, Op0); } diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index dbed7ad4eae8..3770021de100 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1985,7 +1985,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { Value *X = nullptr; // bitreverse(bitreverse(x)) -> x - if (match(IIOperand, m_Intrinsic<Intrinsic::bitreverse>(m_Value(X)))) + if (match(IIOperand, m_BitReverse(m_Value(X)))) return replaceInstUsesWith(CI, X); break; } diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp 
b/lib/Transforms/InstCombine/InstCombineCompares.cpp index 6ad32490a328..58b8b2f52629 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -112,10 +112,10 @@ static bool subWithOverflow(Constant *&Result, Constant *In1, /// Given an icmp instruction, return true if any use of this comparison is a /// branch on sign bit comparison. -static bool isBranchOnSignBitCheck(ICmpInst &I, bool isSignBit) { +static bool hasBranchUse(ICmpInst &I) { for (auto *U : I.users()) if (isa<BranchInst>(U)) - return isSignBit; + return true; return false; } @@ -1448,12 +1448,13 @@ Instruction *InstCombiner::foldICmpWithConstant(ICmpInst &Cmp) { // of a test and branch. So we avoid canonicalizing in such situations // because test and branch instruction has better branch displacement // than compare and branch instruction. - if (!isBranchOnSignBitCheck(Cmp, IsSignBit) && !Cmp.isEquality()) { - if (auto *AI = Intersection.getSingleElement()) - return new ICmpInst(ICmpInst::ICMP_EQ, X, Builder->getInt(*AI)); - if (auto *AD = Difference.getSingleElement()) - return new ICmpInst(ICmpInst::ICMP_NE, X, Builder->getInt(*AD)); - } + if (Cmp.isEquality() || (IsSignBit && hasBranchUse(Cmp))) + return nullptr; + + if (auto *AI = Intersection.getSingleElement()) + return new ICmpInst(ICmpInst::ICMP_EQ, X, Builder->getInt(*AI)); + if (auto *AD = Difference.getSingleElement()) + return new ICmpInst(ICmpInst::ICMP_NE, X, Builder->getInt(*AD)); } return nullptr; @@ -3301,12 +3302,12 @@ Instruction *InstCombiner::foldICmpEquality(ICmpInst &I) { return nullptr; Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + const CmpInst::Predicate Pred = I.getPredicate(); Value *A, *B, *C, *D; if (match(Op0, m_Xor(m_Value(A), m_Value(B)))) { if (A == Op1 || B == Op1) { // (A^B) == A -> B == 0 Value *OtherVal = A == Op1 ? B : A; - return new ICmpInst(I.getPredicate(), OtherVal, - Constant::getNullValue(A->getType())); + return new ICmpInst(Pred, OtherVal, Constant::getNullValue(A->getType())); } if (match(Op1, m_Xor(m_Value(C), m_Value(D)))) { @@ -3316,26 +3317,25 @@ Instruction *InstCombiner::foldICmpEquality(ICmpInst &I) { Op1->hasOneUse()) { Constant *NC = Builder->getInt(C1->getValue() ^ C2->getValue()); Value *Xor = Builder->CreateXor(C, NC); - return new ICmpInst(I.getPredicate(), A, Xor); + return new ICmpInst(Pred, A, Xor); } // A^B == A^D -> B == D if (A == C) - return new ICmpInst(I.getPredicate(), B, D); + return new ICmpInst(Pred, B, D); if (A == D) - return new ICmpInst(I.getPredicate(), B, C); + return new ICmpInst(Pred, B, C); if (B == C) - return new ICmpInst(I.getPredicate(), A, D); + return new ICmpInst(Pred, A, D); if (B == D) - return new ICmpInst(I.getPredicate(), A, C); + return new ICmpInst(Pred, A, C); } } if (match(Op1, m_Xor(m_Value(A), m_Value(B))) && (A == Op0 || B == Op0)) { // A == (A^B) -> B == 0 Value *OtherVal = A == Op0 ? 
B : A; - return new ICmpInst(I.getPredicate(), OtherVal, - Constant::getNullValue(A->getType())); + return new ICmpInst(Pred, OtherVal, Constant::getNullValue(A->getType())); } // (X&Z) == (Y&Z) -> (X^Y) & Z == 0 @@ -3380,8 +3380,7 @@ Instruction *InstCombiner::foldICmpEquality(ICmpInst &I) { APInt Pow2 = Cst1->getValue() + 1; if (Pow2.isPowerOf2() && isa<IntegerType>(A->getType()) && Pow2.logBase2() == cast<IntegerType>(A->getType())->getBitWidth()) - return new ICmpInst(I.getPredicate(), A, - Builder->CreateTrunc(B, A->getType())); + return new ICmpInst(Pred, A, Builder->CreateTrunc(B, A->getType())); } // (A >> C) == (B >> C) --> (A^B) u< (1 << C) @@ -3393,12 +3392,11 @@ Instruction *InstCombiner::foldICmpEquality(ICmpInst &I) { unsigned TypeBits = Cst1->getBitWidth(); unsigned ShAmt = (unsigned)Cst1->getLimitedValue(TypeBits); if (ShAmt < TypeBits && ShAmt != 0) { - ICmpInst::Predicate Pred = I.getPredicate() == ICmpInst::ICMP_NE - ? ICmpInst::ICMP_UGE - : ICmpInst::ICMP_ULT; + ICmpInst::Predicate NewPred = + Pred == ICmpInst::ICMP_NE ? ICmpInst::ICMP_UGE : ICmpInst::ICMP_ULT; Value *Xor = Builder->CreateXor(A, B, I.getName() + ".unshifted"); APInt CmpVal = APInt::getOneBitSet(TypeBits, ShAmt); - return new ICmpInst(Pred, Xor, Builder->getInt(CmpVal)); + return new ICmpInst(NewPred, Xor, Builder->getInt(CmpVal)); } } @@ -3412,8 +3410,7 @@ Instruction *InstCombiner::foldICmpEquality(ICmpInst &I) { APInt AndVal = APInt::getLowBitsSet(TypeBits, TypeBits - ShAmt); Value *And = Builder->CreateAnd(Xor, Builder->getInt(AndVal), I.getName() + ".mask"); - return new ICmpInst(I.getPredicate(), And, - Constant::getNullValue(Cst1->getType())); + return new ICmpInst(Pred, And, Constant::getNullValue(Cst1->getType())); } } @@ -3437,7 +3434,7 @@ Instruction *InstCombiner::foldICmpEquality(ICmpInst &I) { CmpV <<= ShAmt; Value *Mask = Builder->CreateAnd(A, Builder->getInt(MaskV)); - return new ICmpInst(I.getPredicate(), Mask, Builder->getInt(CmpV)); + return new ICmpInst(Pred, Mask, Builder->getInt(CmpV)); } } diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h index 1b0fe84dd4dd..87f11467b95e 100644 --- a/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/lib/Transforms/InstCombine/InstCombineInternal.h @@ -131,11 +131,10 @@ static inline bool IsFreeToInvert(Value *V, bool WillInvertAllUses) { return true; // A vector of constant integers can be inverted easily. 
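  // For example, with an illustrative constant not taken from this change,
  //   <4 x i32> <i32 1, i32 2, i32 3, i32 4>
  // inverts to
  //   <4 x i32> <i32 -2, i32 -3, i32 -4, i32 -5>
  // because ~x == -x - 1 in two's complement; the loop below merely walks the
  // elements to confirm the inversion is indeed free.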
- Constant *CV; - if (V->getType()->isVectorTy() && match(V, PatternMatch::m_Constant(CV))) { + if (V->getType()->isVectorTy() && isa<Constant>(V)) { unsigned NumElts = V->getType()->getVectorNumElements(); for (unsigned i = 0; i != NumElts; ++i) { - Constant *Elt = CV->getAggregateElement(i); + Constant *Elt = cast<Constant>(V)->getAggregateElement(i); if (!Elt) return false; diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index ca370c73fca4..26bee204e5a4 100644 --- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -661,6 +661,9 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) { if (NumElements == 1) { LoadInst *NewLoad = combineLoadToNewType(IC, LI, ST->getTypeAtIndex(0U), ".unpack"); + AAMDNodes AAMD; + LI.getAAMetadata(AAMD); + NewLoad->setAAMetadata(AAMD); return IC.replaceInstUsesWith(LI, IC.Builder->CreateInsertValue( UndefValue::get(T), NewLoad, 0, Name)); } @@ -690,6 +693,10 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) { Name + ".elt"); auto EltAlign = MinAlign(Align, SL->getElementOffset(i)); auto *L = IC.Builder->CreateAlignedLoad(Ptr, EltAlign, Name + ".unpack"); + // Propagate AA metadata. It'll still be valid on the narrowed load. + AAMDNodes AAMD; + LI.getAAMetadata(AAMD); + L->setAAMetadata(AAMD); V = IC.Builder->CreateInsertValue(V, L, i); } @@ -702,6 +709,9 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) { auto NumElements = AT->getNumElements(); if (NumElements == 1) { LoadInst *NewLoad = combineLoadToNewType(IC, LI, ET, ".unpack"); + AAMDNodes AAMD; + LI.getAAMetadata(AAMD); + NewLoad->setAAMetadata(AAMD); return IC.replaceInstUsesWith(LI, IC.Builder->CreateInsertValue( UndefValue::get(T), NewLoad, 0, Name)); } @@ -734,6 +744,9 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) { Name + ".elt"); auto *L = IC.Builder->CreateAlignedLoad(Ptr, MinAlign(Align, Offset), Name + ".unpack"); + AAMDNodes AAMD; + LI.getAAMetadata(AAMD); + L->setAAMetadata(AAMD); V = IC.Builder->CreateInsertValue(V, L, i); Offset += EltSize; } @@ -1192,7 +1205,11 @@ static bool unpackStoreToAggregate(InstCombiner &IC, StoreInst &SI) { AddrName); auto *Val = IC.Builder->CreateExtractValue(V, i, EltName); auto EltAlign = MinAlign(Align, SL->getElementOffset(i)); - IC.Builder->CreateAlignedStore(Val, Ptr, EltAlign); + llvm::Instruction *NS = + IC.Builder->CreateAlignedStore(Val, Ptr, EltAlign); + AAMDNodes AAMD; + SI.getAAMetadata(AAMD); + NS->setAAMetadata(AAMD); } return true; @@ -1239,7 +1256,10 @@ static bool unpackStoreToAggregate(InstCombiner &IC, StoreInst &SI) { AddrName); auto *Val = IC.Builder->CreateExtractValue(V, i, EltName); auto EltAlign = MinAlign(Align, Offset); - IC.Builder->CreateAlignedStore(Val, Ptr, EltAlign); + Instruction *NS = IC.Builder->CreateAlignedStore(Val, Ptr, EltAlign); + AAMDNodes AAMD; + SI.getAAMetadata(AAMD); + NS->setAAMetadata(AAMD); Offset += EltSize; } diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index 365c4ba75154..579639a6194e 100644 --- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -227,8 +227,8 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { if (I.hasNoUnsignedWrap()) Shl->setHasNoUnsignedWrap(); if (I.hasNoSignedWrap()) { - uint64_t 
V; - if (match(NewCst, m_ConstantInt(V)) && V != Width - 1) + const APInt *V; + if (match(NewCst, m_APInt(V)) && *V != Width - 1) Shl->setHasNoSignedWrap(); } diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp index 33951e66497a..80c6595904e1 100644 --- a/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -1167,6 +1167,23 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { if (Instruction *I = canonicalizeSelectToShuffle(SI)) return I; + // Canonicalize a one-use integer compare with a non-canonical predicate by + // inverting the predicate and swapping the select operands. This matches a + // compare canonicalization for conditional branches. + // TODO: Should we do the same for FP compares? + CmpInst::Predicate Pred; + if (match(CondVal, m_OneUse(m_ICmp(Pred, m_Value(), m_Value()))) && + !isCanonicalPredicate(Pred)) { + // Swap true/false values and condition. + CmpInst *Cond = cast<CmpInst>(CondVal); + Cond->setPredicate(CmpInst::getInversePredicate(Pred)); + SI.setOperand(1, FalseVal); + SI.setOperand(2, TrueVal); + SI.swapProfMetadata(); + Worklist.Add(Cond); + return &SI; + } + if (SelType->getScalarType()->isIntegerTy(1) && TrueVal->getType() == CondVal->getType()) { if (match(TrueVal, m_One())) { diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index 02fac4fb37a4..723414635d6f 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2425,9 +2425,15 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { Builder->SetInsertPoint(L); Value *GEP = Builder->CreateInBoundsGEP(L->getType(), L->getPointerOperand(), Indices); + Instruction *NL = Builder->CreateLoad(GEP); + // Whatever aliasing information we had for the orignal load must also + // hold for the smaller load, so propagate the annotations. + AAMDNodes Nodes; + L->getAAMetadata(Nodes); + NL->setAAMetadata(Nodes); // Returning the load directly will cause the main loop to insert it in // the wrong spot, so use replaceInstUsesWith(). - return replaceInstUsesWith(EV, Builder->CreateLoad(GEP)); + return replaceInstUsesWith(EV, NL); } // We could simplify extracts from other values. Note that nested extracts may // already be simplified implicitly by the above: extract (extract (insert) ) diff --git a/lib/Transforms/Scalar/ConstantHoisting.cpp b/lib/Transforms/Scalar/ConstantHoisting.cpp index c3810366bf22..a49c9b68c97d 100644 --- a/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -38,6 +38,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" @@ -340,6 +341,49 @@ void ConstantHoistingPass::collectConstantCandidates( } } + +/// \brief Check the operand for instruction Inst at index Idx. +void ConstantHoistingPass::collectConstantCandidates( + ConstCandMapType &ConstCandMap, Instruction *Inst, unsigned Idx) { + Value *Opnd = Inst->getOperand(Idx); + + // Visit constant integers. + if (auto ConstInt = dyn_cast<ConstantInt>(Opnd)) { + collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt); + return; + } + + // Visit cast instructions that have constant integers. 
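The new canonicalization in visitSelectInst shown above flips a non-canonical integer-compare predicate and swaps the select's true/false operands (updating profile metadata via swapProfMetadata). The rewrite is an identity: inverting the compare and exchanging the arms yields the same value. A tiny standalone check, with the sgt/sle pair chosen arbitrarily as an example predicate and its inverse (illustrative only):

#include <cassert>

int main() {
  // (x > y) ? a : b  must equal  (x <= y) ? b : a  for every input.
  for (int x = -3; x <= 3; ++x)
    for (int y = -3; y <= 3; ++y) {
      const int a = 10, b = 20;
      assert(((x > y) ? a : b) == ((x <= y) ? b : a));
    }
  return 0;
}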
+ if (auto CastInst = dyn_cast<Instruction>(Opnd)) { + // Only visit cast instructions, which have been skipped. All other + // instructions should have already been visited. + if (!CastInst->isCast()) + return; + + if (auto *ConstInt = dyn_cast<ConstantInt>(CastInst->getOperand(0))) { + // Pretend the constant is directly used by the instruction and ignore + // the cast instruction. + collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt); + return; + } + } + + // Visit constant expressions that have constant integers. + if (auto ConstExpr = dyn_cast<ConstantExpr>(Opnd)) { + // Only visit constant cast expressions. + if (!ConstExpr->isCast()) + return; + + if (auto ConstInt = dyn_cast<ConstantInt>(ConstExpr->getOperand(0))) { + // Pretend the constant is directly used by the instruction and ignore + // the constant expression. + collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt); + return; + } + } +} + + /// \brief Scan the instruction for expensive integer constants and record them /// in the constant candidate vector. void ConstantHoistingPass::collectConstantCandidates( @@ -365,44 +409,25 @@ void ConstantHoistingPass::collectConstantCandidates( if (AI && AI->isStaticAlloca()) return; - // Scan all operands. - for (unsigned Idx = 0, E = Inst->getNumOperands(); Idx != E; ++Idx) { - Value *Opnd = Inst->getOperand(Idx); - - // Visit constant integers. - if (auto ConstInt = dyn_cast<ConstantInt>(Opnd)) { - collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt); - continue; - } - - // Visit cast instructions that have constant integers. - if (auto CastInst = dyn_cast<Instruction>(Opnd)) { - // Only visit cast instructions, which have been skipped. All other - // instructions should have already been visited. - if (!CastInst->isCast()) - continue; - - if (auto *ConstInt = dyn_cast<ConstantInt>(CastInst->getOperand(0))) { - // Pretend the constant is directly used by the instruction and ignore - // the cast instruction. - collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt); - continue; + // Constants in GEPs that index into a struct type should not be hoisted. + if (isa<GetElementPtrInst>(Inst)) { + gep_type_iterator GTI = gep_type_begin(Inst); + + // Collect constant for first operand. + collectConstantCandidates(ConstCandMap, Inst, 0); + // Scan rest operands. + for (unsigned Idx = 1, E = Inst->getNumOperands(); Idx != E; ++Idx, ++GTI) { + // Only collect constants that index into a non struct type. + if (!GTI.isStruct()) { + collectConstantCandidates(ConstCandMap, Inst, Idx); } } + return; + } - // Visit constant expressions that have constant integers. - if (auto ConstExpr = dyn_cast<ConstantExpr>(Opnd)) { - // Only visit constant cast expressions. - if (!ConstExpr->isCast()) - continue; - - if (auto ConstInt = dyn_cast<ConstantInt>(ConstExpr->getOperand(0))) { - // Pretend the constant is directly used by the instruction and ignore - // the constant expression. - collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt); - continue; - } - } + // Scan all operands. 
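The getelementptr special case above deliberately leaves struct field indices out of the hoisting candidates: a field index selects a fixed offset in the type's layout and must remain an immediate constant, whereas indices into array-like members are ordinary address arithmetic and can live in a register. A rough source-level analogy (the struct and function below are invented for illustration):

#include <cstddef>
#include <cstdio>

struct S { int a; double b; int c[16]; };

int load(const S *p, long i) {
  // The ".b" access uses a struct index: its byte offset, offsetof(S, b), is a
  // compile-time constant and cannot be supplied from a hoisted variable.
  // The "c[i]" access uses an array index: "i" is plain arithmetic and may be
  // any runtime value.
  return static_cast<int>(p->b) + p->c[i];
}

int main() {
  S s{1, 2.5, {0}};
  s.c[3] = 7;
  std::printf("offsetof(S, b) = %zu, load = %d\n", offsetof(S, b), load(&s, 3));
  return 0;
}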
+ for (unsigned Idx = 0, E = Inst->getNumOperands(); Idx != E; ++Idx) { + collectConstantCandidates(ConstCandMap, Inst, Idx); } // end of for all operands } diff --git a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp index 2f96c3064b86..a40c22c3fce9 100644 --- a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp +++ b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp @@ -917,7 +917,6 @@ LoopConstrainer::calculateSubRanges() const { // I think we can be more aggressive here and make this nuw / nsw if the // addition that feeds into the icmp for the latch's terminating branch is nuw // / nsw. In any case, a wrapping 2's complement addition is safe. - ConstantInt *One = ConstantInt::get(Ty, 1); const SCEV *Start = SE.getSCEV(MainLoopStructure.IndVarStart); const SCEV *End = SE.getSCEV(MainLoopStructure.LoopExitAt); @@ -948,8 +947,9 @@ LoopConstrainer::calculateSubRanges() const { // will be an empty range. Returning an empty range is always safe. // - Smallest = SE.getAddExpr(End, SE.getSCEV(One)); - Greatest = SE.getAddExpr(Start, SE.getSCEV(One)); + const SCEV *One = SE.getOne(Ty); + Smallest = SE.getAddExpr(End, One); + Greatest = SE.getAddExpr(Start, One); } auto Clamp = [this, Smallest, Greatest](const SCEV *S) { diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp index 62aa6ee48069..530a68424d5c 100644 --- a/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -131,7 +131,7 @@ static const unsigned NoThreshold = UINT_MAX; /// Gather the various unrolling parameters based on the defaults, compiler /// flags, TTI overrides and user specified parameters. static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences( - Loop *L, const TargetTransformInfo &TTI, int OptLevel, + Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, int OptLevel, Optional<unsigned> UserThreshold, Optional<unsigned> UserCount, Optional<bool> UserAllowPartial, Optional<bool> UserRuntime, Optional<bool> UserUpperBound) { @@ -158,7 +158,7 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences( UP.AllowPeeling = true; // Override with any target specific settings - TTI.getUnrollingPreferences(L, UP); + TTI.getUnrollingPreferences(L, SE, UP); // Apply size attributes if (L->getHeader()->getParent()->optForSize()) { @@ -699,7 +699,7 @@ static uint64_t getUnrolledLoopSize( // Calculates unroll count and writes it to UP.Count. static bool computeUnrollCount( Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, - ScalarEvolution *SE, OptimizationRemarkEmitter *ORE, unsigned &TripCount, + ScalarEvolution &SE, OptimizationRemarkEmitter *ORE, unsigned &TripCount, unsigned MaxTripCount, unsigned &TripMultiple, unsigned LoopSize, TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound) { // Check for explicit Count. @@ -770,7 +770,7 @@ static bool computeUnrollCount( // helps to remove a significant number of instructions. // To check that, run additional analysis on the loop. 
if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost( - L, FullUnrollTripCount, DT, *SE, TTI, + L, FullUnrollTripCount, DT, SE, TTI, UP.Threshold * UP.MaxPercentThresholdBoost / 100)) { unsigned Boost = getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost); @@ -836,6 +836,8 @@ static bool computeUnrollCount( } else { UP.Count = TripCount; } + if (UP.Count > UP.MaxCount) + UP.Count = UP.MaxCount; if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount && UP.Count != TripCount) ORE->emit( @@ -926,7 +928,7 @@ static bool computeUnrollCount( } static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, - ScalarEvolution *SE, const TargetTransformInfo &TTI, + ScalarEvolution &SE, const TargetTransformInfo &TTI, AssumptionCache &AC, OptimizationRemarkEmitter &ORE, bool PreserveLCSSA, int OptLevel, Optional<unsigned> ProvidedCount, @@ -948,8 +950,8 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, bool NotDuplicatable; bool Convergent; TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences( - L, TTI, OptLevel, ProvidedThreshold, ProvidedCount, ProvidedAllowPartial, - ProvidedRuntime, ProvidedUpperBound); + L, SE, TTI, OptLevel, ProvidedThreshold, ProvidedCount, + ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound); // Exit early if unrolling is disabled. if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0)) return false; @@ -977,8 +979,8 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, if (!ExitingBlock || !L->isLoopExiting(ExitingBlock)) ExitingBlock = L->getExitingBlock(); if (ExitingBlock) { - TripCount = SE->getSmallConstantTripCount(L, ExitingBlock); - TripMultiple = SE->getSmallConstantTripMultiple(L, ExitingBlock); + TripCount = SE.getSmallConstantTripCount(L, ExitingBlock); + TripMultiple = SE.getSmallConstantTripMultiple(L, ExitingBlock); } // If the loop contains a convergent operation, the prelude we'd add @@ -1000,8 +1002,8 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, // count. bool MaxOrZero = false; if (!TripCount) { - MaxTripCount = SE->getSmallConstantMaxTripCount(L); - MaxOrZero = SE->isBackedgeTakenCountMaxOrZero(L); + MaxTripCount = SE.getSmallConstantMaxTripCount(L); + MaxOrZero = SE.isBackedgeTakenCountMaxOrZero(L); // We can unroll by the upper bound amount if it's generally allowed or if // we know that the loop is executed either the upper bound or zero times. // (MaxOrZero unrolling keeps only the first loop test, so the number of @@ -1030,7 +1032,7 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, // Unroll the loop. 
if (!UnrollLoop(L, UP.Count, TripCount, UP.Force, UP.Runtime, UP.AllowExpensiveTripCount, UseUpperBound, MaxOrZero, - TripMultiple, UP.PeelCount, LI, SE, &DT, &AC, &ORE, + TripMultiple, UP.PeelCount, LI, &SE, &DT, &AC, &ORE, PreserveLCSSA)) return false; @@ -1073,7 +1075,7 @@ public: auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); @@ -1157,7 +1159,7 @@ PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM, if (!AllowPartialUnrolling) AllowPartialParam = RuntimeParam = UpperBoundParam = false; bool Changed = tryToUnrollLoop( - &L, AR.DT, &AR.LI, &AR.SE, AR.TTI, AR.AC, *ORE, + &L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, *ORE, /*PreserveLCSSA*/ true, OptLevel, /*Count*/ None, /*Threshold*/ None, AllowPartialParam, RuntimeParam, UpperBoundParam); if (!Changed) diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp index 7a7624f77542..9cf01c6582b5 100644 --- a/lib/Transforms/Scalar/NewGVN.cpp +++ b/lib/Transforms/Scalar/NewGVN.cpp @@ -2423,8 +2423,7 @@ void NewGVN::addPhiOfOps(PHINode *Op, BasicBlock *BB, AllTempInstructions.insert(Op); PHIOfOpsPHIs[BB].push_back(Op); TempToBlock[Op] = BB; - if (ExistingValue) - RealToTemp[ExistingValue] = Op; + RealToTemp[ExistingValue] = Op; } static bool okayForPHIOfOps(const Instruction *I) { diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp index 6da551bd7efd..cdba0062953f 100644 --- a/lib/Transforms/Scalar/Reassociate.cpp +++ b/lib/Transforms/Scalar/Reassociate.cpp @@ -1894,6 +1894,8 @@ void ReassociatePass::EraseInst(Instruction *I) { Op = Op->user_back(); RedoInsts.insert(Op); } + + MadeChange = true; } // Canonicalize expressions of the following form: diff --git a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index a52739bb76f7..a73e9aec0617 100644 --- a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -1954,7 +1954,7 @@ static void rematerializeLiveValues(CallSite CS, // to identify the newly generated AlternateRootPhi (.base version of phi) // and RootOfChain (the original phi node itself) are the same, so that we // can rematerialize the gep and casts. This is a workaround for the - // deficieny in the findBasePointer algorithm. + // deficiency in the findBasePointer algorithm. if (!AreEquivalentPhiNodes(*OrigRootPhi, *AlternateRootPhi)) continue; // Now that the phi nodes are proved to be the same, assert that diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index 80fbbeb6829b..4729f4ef5956 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -2402,9 +2402,20 @@ private: if (LI.isVolatile()) NewLI->setAtomic(LI.getOrdering(), LI.getSynchScope()); + // Any !nonnull metadata or !range metadata on the old load is also valid + // on the new load. This is even true in some cases even when the loads + // are different types, for example by mapping !nonnull metadata to + // !range metadata by modeling the null pointer constant converted to the + // integer type. + // FIXME: Add support for range metadata here. 
Currently the utilities + // for this don't propagate range metadata in trivial cases from one + // integer load to another, don't handle non-addrspace-0 null pointers + // correctly, and don't have any support for mapping ranges as the + // integer type becomes winder or narrower. + if (MDNode *N = LI.getMetadata(LLVMContext::MD_nonnull)) + copyNonnullMetadata(LI, N, *NewLI); + // Try to preserve nonnull metadata - if (TargetTy->isPointerTy()) - NewLI->copyMetadata(LI, LLVMContext::MD_nonnull); V = NewLI; // If this is an integer load past the end of the slice (which means the @@ -3580,10 +3591,11 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { int Idx = 0, Size = Offsets.Splits.size(); for (;;) { auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8); - auto *PartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace()); + auto AS = LI->getPointerAddressSpace(); + auto *PartPtrTy = PartTy->getPointerTo(AS); LoadInst *PLoad = IRB.CreateAlignedLoad( getAdjustedPtr(IRB, DL, BasePtr, - APInt(DL.getPointerSizeInBits(), PartOffset), + APInt(DL.getPointerSizeInBits(AS), PartOffset), PartPtrTy, BasePtr->getName() + "."), getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false, LI->getName()); diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp index 5d57ed9718fb..30d8856cfbef 100644 --- a/lib/Transforms/Utils/CodeExtractor.cpp +++ b/lib/Transforms/Utils/CodeExtractor.cpp @@ -59,6 +59,33 @@ bool CodeExtractor::isBlockValidForExtraction(const BasicBlock &BB) { // Landing pads must be in the function where they were inserted for cleanup. if (BB.isEHPad()) return false; + // taking the address of a basic block moved to another function is illegal + if (BB.hasAddressTaken()) + return false; + + // don't hoist code that uses another basicblock address, as it's likely to + // lead to unexpected behavior, like cross-function jumps + SmallPtrSet<User const *, 16> Visited; + SmallVector<User const *, 16> ToVisit; + + for (Instruction const &Inst : BB) + ToVisit.push_back(&Inst); + + while (!ToVisit.empty()) { + User const *Curr = ToVisit.pop_back_val(); + if (!Visited.insert(Curr).second) + continue; + if (isa<BlockAddress const>(Curr)) + return false; // even a reference to self is likely to be not compatible + + if (isa<Instruction>(Curr) && cast<Instruction>(Curr)->getParent() != &BB) + continue; + + for (auto const &U : Curr->operands()) { + if (auto *UU = dyn_cast<User>(U)) + ToVisit.push_back(UU); + } + } // Don't hoist code containing allocas, invokes, or vastarts. for (BasicBlock::const_iterator I = BB.begin(), E = BB.end(); I != E; ++I) { diff --git a/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/lib/Transforms/Utils/LoopUnrollRuntime.cpp index 5f85e17927fa..9ad2b707e6b2 100644 --- a/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -36,6 +36,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/UnrollLoop.h" #include <algorithm> @@ -45,6 +46,10 @@ using namespace llvm; STATISTIC(NumRuntimeUnrolled, "Number of loops unrolled with run-time trip counts"); +static cl::opt<bool> UnrollRuntimeMultiExit( + "unroll-runtime-multi-exit", cl::init(false), cl::Hidden, + cl::desc("Allow runtime unrolling for loops with multiple exits, when " + "epilog is generated")); /// Connect the unrolling prolog code to the original loop. 
/// The unrolling prolog code contains code to execute the @@ -285,15 +290,13 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit, /// The cloned blocks should be inserted between InsertTop and InsertBot. /// If loop structure is cloned InsertTop should be new preheader, InsertBot /// new loop exit. -/// -static void CloneLoopBlocks(Loop *L, Value *NewIter, - const bool CreateRemainderLoop, - const bool UseEpilogRemainder, - BasicBlock *InsertTop, BasicBlock *InsertBot, - BasicBlock *Preheader, - std::vector<BasicBlock *> &NewBlocks, - LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap, - DominatorTree *DT, LoopInfo *LI) { +/// Return the new cloned loop that is created when CreateRemainderLoop is true. +static Loop * +CloneLoopBlocks(Loop *L, Value *NewIter, const bool CreateRemainderLoop, + const bool UseEpilogRemainder, BasicBlock *InsertTop, + BasicBlock *InsertBot, BasicBlock *Preheader, + std::vector<BasicBlock *> &NewBlocks, LoopBlocksDFS &LoopBlocks, + ValueToValueMapTy &VMap, DominatorTree *DT, LoopInfo *LI) { StringRef suffix = UseEpilogRemainder ? "epil" : "prol"; BasicBlock *Header = L->getHeader(); BasicBlock *Latch = L->getLoopLatch(); @@ -418,7 +421,10 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, // Set operand 0 to refer to the loop id itself. NewLoopID->replaceOperandWith(0, NewLoopID); NewLoop->setLoopID(NewLoopID); + return NewLoop; } + else + return nullptr; } /// Insert code in the prolog/epilog code when unrolling a loop with a @@ -465,29 +471,52 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, bool PreserveLCSSA) { // for now, only unroll loops that contain a single exit - if (!L->getExitingBlock()) + if (!UnrollRuntimeMultiExit && !L->getExitingBlock()) return false; - // Make sure the loop is in canonical form, and there is a single - // exit block only. + // Make sure the loop is in canonical form. if (!L->isLoopSimplifyForm()) return false; // Guaranteed by LoopSimplifyForm. BasicBlock *Latch = L->getLoopLatch(); + BasicBlock *Header = L->getHeader(); BasicBlock *LatchExit = L->getUniqueExitBlock(); // successor out of loop - if (!LatchExit) + if (!LatchExit && !UnrollRuntimeMultiExit) return false; + // These are exit blocks other than the target of the latch exiting block. + SmallVector<BasicBlock *, 4> OtherExits; + BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator()); + unsigned int ExitIndex = LatchBR->getSuccessor(0) == Header ? 1 : 0; // Cloning the loop basic blocks (`CloneLoopBlocks`) requires that one of the - // targets of the Latch be the single exit block out of the loop. This needs + // targets of the Latch be an exit block out of the loop. This needs // to be guaranteed by the callers of UnrollRuntimeLoopRemainder. - BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator()); - assert((LatchBR->getSuccessor(0) == LatchExit || - LatchBR->getSuccessor(1) == LatchExit) && - "one of the loop latch successors should be " - "the exit block!"); - (void)LatchBR; + assert(!L->contains(LatchBR->getSuccessor(ExitIndex)) && + "one of the loop latch successors should be the exit block!"); + // Support runtime unrolling for multiple exit blocks and multiple exiting + // blocks. 
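For readers following the remainder-loop machinery here, the shape that epilog-style runtime unrolling produces can be sketched at the source level as a main loop unrolled by the chosen count plus a small loop that executes the leftover iterations. The function below only illustrates that shape for an unroll factor of 4 (the names and the saxpy kernel are made up; this is not the IR the utility emits):

#include <cstddef>

void saxpy_unrolled(float *x, const float *y, float a, size_t n) {
  size_t i = 0;
  const size_t main_iters = n - (n % 4);   // trip count of the unrolled main loop
  // Unrolled main loop: runs while at least 4 iterations remain.
  for (; i < main_iters; i += 4) {
    x[i + 0] += a * y[i + 0];
    x[i + 1] += a * y[i + 1];
    x[i + 2] += a * y[i + 2];
    x[i + 3] += a * y[i + 3];
  }
  // Epilog remainder loop: executes the remaining n % 4 iterations.
  for (; i < n; ++i)
    x[i] += a * y[i];
}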
+ if (!LatchExit) { + assert(UseEpilogRemainder && "Multi exit unrolling is currently supported " + "unrolling with epilog remainder only!"); + LatchExit = LatchBR->getSuccessor(ExitIndex); + // We rely on LCSSA form being preserved when the exit blocks are + // transformed. + if (!PreserveLCSSA) + return false; + // TODO: Support multiple exiting blocks jumping to the `LatchExit`. This + // will need updating the logic in connectEpilog. + if (!LatchExit->getSinglePredecessor()) + return false; + SmallVector<BasicBlock *, 4> Exits; + L->getUniqueExitBlocks(Exits); + for (auto *BB : Exits) + if (BB != LatchExit) + OtherExits.push_back(BB); + } + + assert(LatchExit && "Latch Exit should exist!"); + // Use Scalar Evolution to compute the trip count. This allows more loops to // be unrolled than relying on induction var simplification. if (!SE) @@ -495,7 +524,11 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, // Only unroll loops with a computable trip count, and the trip count needs // to be an int value (allowing a pointer type is a TODO item). - const SCEV *BECountSC = SE->getBackedgeTakenCount(L); + // We calculate the backedge count by using getExitCount on the Latch block, + // which is proven to be the only exiting block in this loop. This is same as + // calculating getBackedgeTakenCount on the loop (which computes SCEV for all + // exiting blocks). + const SCEV *BECountSC = SE->getExitCount(L, Latch); if (isa<SCEVCouldNotCompute>(BECountSC) || !BECountSC->getType()->isIntegerTy()) return false; @@ -508,7 +541,6 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, if (isa<SCEVCouldNotCompute>(TripCountSC)) return false; - BasicBlock *Header = L->getHeader(); BasicBlock *PreHeader = L->getLoopPreheader(); BranchInst *PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator()); const DataLayout &DL = Header->getModule()->getDataLayout(); @@ -650,8 +682,9 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, // iterations. This function adds the appropriate CFG connections. BasicBlock *InsertBot = UseEpilogRemainder ? LatchExit : PrologExit; BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader; - CloneLoopBlocks(L, ModVal, CreateRemainderLoop, UseEpilogRemainder, InsertTop, - InsertBot, NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, LI); + Loop *remainderLoop = CloneLoopBlocks( + L, ModVal, CreateRemainderLoop, UseEpilogRemainder, InsertTop, InsertBot, + NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, LI); // Insert the cloned blocks into the function. F->getBasicBlockList().splice(InsertBot->getIterator(), @@ -659,6 +692,42 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, NewBlocks[0]->getIterator(), F->end()); + // Now the loop blocks are cloned and the other exiting blocks from the + // remainder are connected to the original Loop's exit blocks. The remaining + // work is to update the phi nodes in the original loop, and take in the + // values from the cloned region. Also update the dominator info for + // OtherExits, since we have new edges into OtherExits. + for (auto *BB : OtherExits) { + for (auto &II : *BB) { + + // Given we preserve LCSSA form, we know that the values used outside the + // loop will be used through these phi nodes at the exit blocks that are + // transformed below. + if (!isa<PHINode>(II)) + break; + PHINode *Phi = cast<PHINode>(&II); + unsigned oldNumOperands = Phi->getNumIncomingValues(); + // Add the incoming values from the remainder code to the end of the phi + // node. 
+ for (unsigned i =0; i < oldNumOperands; i++){ + Value *newVal = VMap[Phi->getIncomingValue(i)]; + if (!newVal) { + assert(isa<Constant>(Phi->getIncomingValue(i)) && + "VMap should exist for all values except constants!"); + newVal = Phi->getIncomingValue(i); + } + Phi->addIncoming(newVal, + cast<BasicBlock>(VMap[Phi->getIncomingBlock(i)])); + } + } + // Update the dominator info because the immediate dominator is no longer the + // header of the original Loop. BB has edges both from L and remainder code. + // Since the preheader determines which loop is run (L or directly jump to + // the remainder code), we set the immediate dominator as the preheader. + if (DT) + DT->changeImmediateDominator(BB, PreHeader); + } + // Loop structure should be the following: // Epilog Prolog // @@ -721,6 +790,19 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, if (Loop *ParentLoop = L->getParentLoop()) SE->forgetLoop(ParentLoop); + // Canonicalize to LoopSimplifyForm both original and remainder loops. We + // cannot rely on the LoopUnrollPass to do this because it only does + // canonicalization for parent/subloops and not the sibling loops. + if (OtherExits.size() > 0) { + // Generate dedicated exit blocks for the original loop, to preserve + // LoopSimplifyForm. + formDedicatedExitBlocks(L, DT, LI, PreserveLCSSA); + // Generate dedicated exit blocks for the remainder loop if one exists, to + // preserve LoopSimplifyForm. + if (remainderLoop) + formDedicatedExitBlocks(remainderLoop, DT, LI, PreserveLCSSA); + } + NumRuntimeUnrolled++; return true; } diff --git a/lib/Transforms/Utils/LoopUtils.cpp b/lib/Transforms/Utils/LoopUtils.cpp index 0ed33945ef40..58b70be95d99 100644 --- a/lib/Transforms/Utils/LoopUtils.cpp +++ b/lib/Transforms/Utils/LoopUtils.cpp @@ -528,8 +528,9 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop, return false; } -bool RecurrenceDescriptor::isFirstOrderRecurrence(PHINode *Phi, Loop *TheLoop, - DominatorTree *DT) { +bool RecurrenceDescriptor::isFirstOrderRecurrence( + PHINode *Phi, Loop *TheLoop, + DenseMap<Instruction *, Instruction *> &SinkAfter, DominatorTree *DT) { // Ensure the phi node is in the loop header and has two incoming values. if (Phi->getParent() != TheLoop->getHeader() || @@ -551,12 +552,24 @@ bool RecurrenceDescriptor::isFirstOrderRecurrence(PHINode *Phi, Loop *TheLoop, // Get the previous value. The previous value comes from the latch edge while // the initial value comes form the preheader edge. auto *Previous = dyn_cast<Instruction>(Phi->getIncomingValueForBlock(Latch)); - if (!Previous || !TheLoop->contains(Previous) || isa<PHINode>(Previous)) + if (!Previous || !TheLoop->contains(Previous) || isa<PHINode>(Previous) || + SinkAfter.count(Previous)) // Cannot rely on dominance due to motion. return false; // Ensure every user of the phi node is dominated by the previous value. // The dominance requirement ensures the loop vectorizer will not need to // vectorize the initial value prior to the first iteration of the loop. + // TODO: Consider extending this sinking to handle other kinds of instructions + // and expressions, beyond sinking a single cast past Previous. 
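The SinkAfter map introduced above lets the vectorizer accept a first-order recurrence whose header phi has a single cast as its only user, by recording that the cast should be moved to sit after the instruction that produces the next value of the recurrence. At the source level, a first-order recurrence simply carries the previous iteration's value; a made-up example with such a widening cast (illustrative only, not code from the patch):

#include <cstddef>
#include <cstdint>

void delta_encode(int16_t *out, const int16_t *in, size_t n) {
  int16_t prev = 0;                      // loop-carried value of the recurrence
  for (size_t i = 0; i < n; ++i) {
    int32_t prev_wide = prev;            // the single widening-cast user of "prev"
    out[i] = static_cast<int16_t>(in[i] - prev_wide);
    prev = in[i];                        // next value of the recurrence ("Previous")
  }
}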
+ if (Phi->hasOneUse()) { + auto *I = Phi->user_back(); + if (I->isCast() && (I->getParent() == Phi->getParent()) && I->hasOneUse() && + DT->dominates(Previous, I->user_back())) { + SinkAfter[I] = Previous; + return true; + } + } + for (User *U : Phi->users()) if (auto *I = dyn_cast<Instruction>(U)) { if (!DT->dominates(Previous, I)) diff --git a/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/lib/Transforms/Utils/LowerMemIntrinsics.cpp index 0a51f9a0e4a2..1c2a60a6b8b2 100644 --- a/lib/Transforms/Utils/LowerMemIntrinsics.cpp +++ b/lib/Transforms/Utils/LowerMemIntrinsics.cpp @@ -27,7 +27,6 @@ void llvm::createMemCpyLoop(Instruction *InsertBefore, BasicBlock *LoopBB = BasicBlock::Create(F->getContext(), "loadstoreloop", F, NewBB); - OrigBB->getTerminator()->setSuccessor(0, LoopBB); IRBuilder<> Builder(OrigBB->getTerminator()); // SrcAddr and DstAddr are expected to be pointer types, @@ -39,6 +38,11 @@ void llvm::createMemCpyLoop(Instruction *InsertBefore, SrcAddr = Builder.CreateBitCast(SrcAddr, Builder.getInt8PtrTy(SrcAS)); DstAddr = Builder.CreateBitCast(DstAddr, Builder.getInt8PtrTy(DstAS)); + Builder.CreateCondBr( + Builder.CreateICmpEQ(ConstantInt::get(TypeOfCopyLen, 0), CopyLen), NewBB, + LoopBB); + OrigBB->getTerminator()->eraseFromParent(); + IRBuilder<> LoopBuilder(LoopBB); PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 0); LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), OrigBB); @@ -167,6 +171,7 @@ static void createMemMoveLoop(Instruction *InsertBefore, static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr, Value *CopyLen, Value *SetValue, unsigned Align, bool IsVolatile) { + Type *TypeOfCopyLen = CopyLen->getType(); BasicBlock *OrigBB = InsertBefore->getParent(); Function *F = OrigBB->getParent(); BasicBlock *NewBB = @@ -174,7 +179,6 @@ static void createMemSetLoop(Instruction *InsertBefore, BasicBlock *LoopBB = BasicBlock::Create(F->getContext(), "loadstoreloop", F, NewBB); - OrigBB->getTerminator()->setSuccessor(0, LoopBB); IRBuilder<> Builder(OrigBB->getTerminator()); // Cast pointer to the type of value getting stored @@ -182,9 +186,14 @@ static void createMemSetLoop(Instruction *InsertBefore, DstAddr = Builder.CreateBitCast(DstAddr, PointerType::get(SetValue->getType(), dstAS)); + Builder.CreateCondBr( + Builder.CreateICmpEQ(ConstantInt::get(TypeOfCopyLen, 0), CopyLen), NewBB, + LoopBB); + OrigBB->getTerminator()->eraseFromParent(); + IRBuilder<> LoopBuilder(LoopBB); - PHINode *LoopIndex = LoopBuilder.CreatePHI(CopyLen->getType(), 0); - LoopIndex->addIncoming(ConstantInt::get(CopyLen->getType(), 0), OrigBB); + PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 0); + LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), OrigBB); LoopBuilder.CreateStore( SetValue, @@ -192,7 +201,7 @@ static void createMemSetLoop(Instruction *InsertBefore, IsVolatile); Value *NewIndex = - LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(CopyLen->getType(), 1)); + LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1)); LoopIndex->addIncoming(NewIndex, LoopBB); LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, CopyLen), LoopBB, diff --git a/lib/Transforms/Utils/OrderedInstructions.cpp b/lib/Transforms/Utils/OrderedInstructions.cpp index 2e67e0def5b9..dc780542ce68 100644 --- a/lib/Transforms/Utils/OrderedInstructions.cpp +++ b/lib/Transforms/Utils/OrderedInstructions.cpp @@ -27,7 +27,6 @@ bool OrderedInstructions::dominates(const Instruction *InstA, if (OBB == OBBMap.end()) OBB = OBBMap.insert({IBB, 
make_unique<OrderedBasicBlock>(IBB)}).first; return OBB->second->dominates(InstA, InstB); - } else { - return DT->dominates(InstA->getParent(), InstB->getParent()); } + return DT->dominates(InstA->getParent(), InstB->getParent()); } diff --git a/lib/Transforms/Utils/PredicateInfo.cpp b/lib/Transforms/Utils/PredicateInfo.cpp index 1260e35e934d..d4cdaede6b86 100644 --- a/lib/Transforms/Utils/PredicateInfo.cpp +++ b/lib/Transforms/Utils/PredicateInfo.cpp @@ -19,7 +19,6 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CFG.h" -#include "llvm/Analysis/OrderedBasicBlock.h" #include "llvm/IR/AssemblyAnnotationWriter.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" @@ -34,6 +33,7 @@ #include "llvm/Support/DebugCounter.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/OrderedInstructions.h" #include <algorithm> #define DEBUG_TYPE "predicateinfo" using namespace llvm; @@ -106,14 +106,27 @@ struct ValueDFS { bool EdgeOnly = false; }; +// Perform a strict weak ordering on instructions and arguments. +static bool valueComesBefore(OrderedInstructions &OI, const Value *A, + const Value *B) { + auto *ArgA = dyn_cast_or_null<Argument>(A); + auto *ArgB = dyn_cast_or_null<Argument>(B); + if (ArgA && !ArgB) + return true; + if (ArgB && !ArgA) + return false; + if (ArgA && ArgB) + return ArgA->getArgNo() < ArgB->getArgNo(); + return OI.dominates(cast<Instruction>(A), cast<Instruction>(B)); +} + // This compares ValueDFS structures, creating OrderedBasicBlocks where // necessary to compare uses/defs in the same block. Doing so allows us to walk // the minimum number of instructions necessary to compute our def/use ordering. struct ValueDFS_Compare { - DenseMap<const BasicBlock *, std::unique_ptr<OrderedBasicBlock>> &OBBMap; - ValueDFS_Compare( - DenseMap<const BasicBlock *, std::unique_ptr<OrderedBasicBlock>> &OBBMap) - : OBBMap(OBBMap) {} + OrderedInstructions &OI; + ValueDFS_Compare(OrderedInstructions &OI) : OI(OI) {} + bool operator()(const ValueDFS &A, const ValueDFS &B) const { if (&A == &B) return false; @@ -196,23 +209,12 @@ struct ValueDFS_Compare { auto *ArgA = dyn_cast_or_null<Argument>(ADef); auto *ArgB = dyn_cast_or_null<Argument>(BDef); - if (ArgA && !ArgB) - return true; - if (ArgB && !ArgA) - return false; - if (ArgA && ArgB) - return ArgA->getArgNo() < ArgB->getArgNo(); + if (ArgA || ArgB) + return valueComesBefore(OI, ArgA, ArgB); auto *AInst = getDefOrUser(ADef, A.U); auto *BInst = getDefOrUser(BDef, B.U); - - auto *BB = AInst->getParent(); - auto LookupResult = OBBMap.find(BB); - if (LookupResult != OBBMap.end()) - return LookupResult->second->dominates(AInst, BInst); - - auto Result = OBBMap.insert({BB, make_unique<OrderedBasicBlock>(BB)}); - return Result.first->second->dominates(AInst, BInst); + return valueComesBefore(OI, AInst, BInst); } }; @@ -547,38 +549,11 @@ Value *PredicateInfo::materializeStack(unsigned int &Counter, void PredicateInfo::renameUses(SmallPtrSetImpl<Value *> &OpSet) { // Sort OpsToRename since we are going to iterate it. SmallVector<Value *, 8> OpsToRename(OpSet.begin(), OpSet.end()); - std::sort(OpsToRename.begin(), OpsToRename.end(), [&](const Value *A, - const Value *B) { - auto *ArgA = dyn_cast_or_null<Argument>(A); - auto *ArgB = dyn_cast_or_null<Argument>(B); - - // If A and B are args, order them based on their arg no. 
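valueComesBefore above defines a strict weak order over arguments and instructions: every argument sorts before every instruction, arguments sort by argument number, and instructions fall back to OrderedInstructions::dominates. A self-contained model of that ordering, using an invented Item record in place of real Values (illustrative only):

#include <algorithm>
#include <cassert>
#include <vector>

struct Item {
  bool IsArg;        // function argument (true) or instruction (false)
  unsigned ArgNo;    // meaningful only when IsArg is true
  unsigned InstPos;  // stand-in for a dominance-compatible instruction numbering
};

static bool comesBefore(const Item &A, const Item &B) {
  if (A.IsArg && !B.IsArg)
    return true;                          // arguments sort before instructions
  if (B.IsArg && !A.IsArg)
    return false;
  if (A.IsArg && B.IsArg)
    return A.ArgNo < B.ArgNo;             // arguments ordered by argument number
  return A.InstPos < B.InstPos;           // stand-in for OI.dominates(A, B)
}

int main() {
  std::vector<Item> Ops = {{false, 0, 7}, {true, 1, 0}, {false, 0, 2}, {true, 0, 0}};
  std::sort(Ops.begin(), Ops.end(), comesBefore);
  assert(Ops[0].IsArg && Ops[0].ArgNo == 0);
  assert(Ops[1].IsArg && Ops[1].ArgNo == 1);
  assert(!Ops[2].IsArg && Ops[2].InstPos == 2);
  assert(!Ops[3].IsArg && Ops[3].InstPos == 7);
  return 0;
}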
- if (ArgA && !ArgB) - return true; - if (ArgB && !ArgA) - return false; - if (ArgA && ArgB) - return ArgA->getArgNo() < ArgB->getArgNo(); - - // Else, A are B are instructions. - // If they belong to different BBs, order them by the dominance of BBs. - auto *AInst = cast<Instruction>(A); - auto *BInst = cast<Instruction>(B); - if (AInst->getParent() != BInst->getParent()) - return DT.dominates(AInst->getParent(), BInst->getParent()); - - // Else, A and B belong to the same BB. - // Order A and B by their dominance. - auto *BB = AInst->getParent(); - auto LookupResult = OBBMap.find(BB); - if (LookupResult != OBBMap.end()) - return LookupResult->second->dominates(AInst, BInst); - - auto Result = OBBMap.insert({BB, make_unique<OrderedBasicBlock>(BB)}); - return Result.first->second->dominates(AInst, BInst); - }); - - ValueDFS_Compare Compare(OBBMap); + auto Comparator = [&](const Value *A, const Value *B) { + return valueComesBefore(OI, A, B); + }; + std::sort(OpsToRename.begin(), OpsToRename.end(), Comparator); + ValueDFS_Compare Compare(OI); // Compute liveness, and rename in O(uses) per Op. for (auto *Op : OpsToRename) { unsigned Counter = 0; @@ -715,7 +690,7 @@ PredicateInfo::getValueInfo(Value *Operand) const { PredicateInfo::PredicateInfo(Function &F, DominatorTree &DT, AssumptionCache &AC) - : F(F), DT(DT), AC(AC) { + : F(F), DT(DT), AC(AC), OI(&DT) { // Push an empty operand info so that we can detect 0 as not finding one ValueInfos.resize(1); buildPredicateInfo(); diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index 0970c436e665..e724b0a28c32 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -4781,7 +4781,7 @@ public: SwitchLookupTable( Module &M, uint64_t TableSize, ConstantInt *Offset, const SmallVectorImpl<std::pair<ConstantInt *, Constant *>> &Values, - Constant *DefaultValue, const DataLayout &DL); + Constant *DefaultValue, const DataLayout &DL, const StringRef &FuncName); /// Build instructions with Builder to retrieve the value at /// the position given by Index in the lookup table. @@ -4835,7 +4835,7 @@ private: SwitchLookupTable::SwitchLookupTable( Module &M, uint64_t TableSize, ConstantInt *Offset, const SmallVectorImpl<std::pair<ConstantInt *, Constant *>> &Values, - Constant *DefaultValue, const DataLayout &DL) + Constant *DefaultValue, const DataLayout &DL, const StringRef &FuncName) : SingleValue(nullptr), BitMap(nullptr), BitMapElementTy(nullptr), LinearOffset(nullptr), LinearMultiplier(nullptr), Array(nullptr) { assert(Values.size() && "Can't build lookup table without values!"); @@ -4943,7 +4943,7 @@ SwitchLookupTable::SwitchLookupTable( Array = new GlobalVariable(M, ArrayTy, /*constant=*/true, GlobalVariable::PrivateLinkage, Initializer, - "switch.table"); + "switch.table." + FuncName); Array->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); Kind = ArrayKind; } @@ -5333,7 +5333,9 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, // If using a bitmask, use any value to fill the lookup table holes. Constant *DV = NeedMask ? 
ResultLists[PHI][0].second : DefaultResults[PHI]; - SwitchLookupTable Table(Mod, TableSize, MinCaseVal, ResultList, DV, DL); + StringRef FuncName = SI->getParent()->getParent()->getName(); + SwitchLookupTable Table(Mod, TableSize, MinCaseVal, ResultList, DV, DL, + FuncName); Value *Result = Table.BuildLookup(TableIndex, Builder); diff --git a/lib/Transforms/Utils/SimplifyIndVar.cpp b/lib/Transforms/Utils/SimplifyIndVar.cpp index faa14046b1e3..ec8b0d426265 100644 --- a/lib/Transforms/Utils/SimplifyIndVar.cpp +++ b/lib/Transforms/Utils/SimplifyIndVar.cpp @@ -354,7 +354,7 @@ bool SimplifyIndvar::eliminateOverflowIntrinsic(CallInst *CI) { typedef const SCEV *(ScalarEvolution::*OperationFunctionTy)( const SCEV *, const SCEV *, SCEV::NoWrapFlags, unsigned); typedef const SCEV *(ScalarEvolution::*ExtensionFunctionTy)( - const SCEV *, Type *); + const SCEV *, Type *, unsigned); OperationFunctionTy Operation; ExtensionFunctionTy Extension; @@ -406,11 +406,11 @@ bool SimplifyIndvar::eliminateOverflowIntrinsic(CallInst *CI) { IntegerType::get(NarrowTy->getContext(), NarrowTy->getBitWidth() * 2); const SCEV *A = - (SE->*Extension)((SE->*Operation)(LHS, RHS, SCEV::FlagAnyWrap, 0u), - WideTy); + (SE->*Extension)((SE->*Operation)(LHS, RHS, SCEV::FlagAnyWrap, 0), + WideTy, 0); const SCEV *B = - (SE->*Operation)((SE->*Extension)(LHS, WideTy), - (SE->*Extension)(RHS, WideTy), SCEV::FlagAnyWrap, 0u); + (SE->*Operation)((SE->*Extension)(LHS, WideTy, 0), + (SE->*Extension)(RHS, WideTy, 0), SCEV::FlagAnyWrap, 0); if (A != B) return false; diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp deleted file mode 100644 index 78453aaa16ce..000000000000 --- a/lib/Transforms/Vectorize/BBVectorize.cpp +++ /dev/null @@ -1,3282 +0,0 @@ -//===- BBVectorize.cpp - A Basic-Block Vectorizer -------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements a basic-block vectorization pass. The algorithm was -// inspired by that used by the Vienna MAP Vectorizor by Franchetti and Kral, -// et al. It works by looking for chains of pairable operations and then -// pairing them. 
-// -//===----------------------------------------------------------------------===// - -#define BBV_NAME "bb-vectorize" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/AliasSetTracker.h" -#include "llvm/Analysis/GlobalsModRef.h" -#include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" -#include "llvm/Analysis/ScalarEvolutionExpressions.h" -#include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Metadata.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/Type.h" -#include "llvm/IR/ValueHandle.h" -#include "llvm/Pass.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Vectorize.h" -#include <algorithm> -using namespace llvm; - -#define DEBUG_TYPE BBV_NAME - -static cl::opt<bool> -IgnoreTargetInfo("bb-vectorize-ignore-target-info", cl::init(false), - cl::Hidden, cl::desc("Ignore target information")); - -static cl::opt<unsigned> -ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden, - cl::desc("The required chain depth for vectorization")); - -static cl::opt<bool> -UseChainDepthWithTI("bb-vectorize-use-chain-depth", cl::init(false), - cl::Hidden, cl::desc("Use the chain depth requirement with" - " target information")); - -static cl::opt<unsigned> -SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden, - cl::desc("The maximum search distance for instruction pairs")); - -static cl::opt<bool> -SplatBreaksChain("bb-vectorize-splat-breaks-chain", cl::init(false), cl::Hidden, - cl::desc("Replicating one element to a pair breaks the chain")); - -static cl::opt<unsigned> -VectorBits("bb-vectorize-vector-bits", cl::init(128), cl::Hidden, - cl::desc("The size of the native vector registers")); - -static cl::opt<unsigned> -MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden, - cl::desc("The maximum number of pairing iterations")); - -static cl::opt<bool> -Pow2LenOnly("bb-vectorize-pow2-len-only", cl::init(false), cl::Hidden, - cl::desc("Don't try to form non-2^n-length vectors")); - -static cl::opt<unsigned> -MaxInsts("bb-vectorize-max-instr-per-group", cl::init(500), cl::Hidden, - cl::desc("The maximum number of pairable instructions per group")); - -static cl::opt<unsigned> -MaxPairs("bb-vectorize-max-pairs-per-group", cl::init(3000), cl::Hidden, - cl::desc("The maximum number of candidate instruction pairs per group")); - -static cl::opt<unsigned> -MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200), - cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use" - " a full cycle check")); - -static cl::opt<bool> -NoBools("bb-vectorize-no-bools", cl::init(false), cl::Hidden, - cl::desc("Don't try to vectorize boolean (i1) values")); - -static cl::opt<bool> -NoInts("bb-vectorize-no-ints", cl::init(false), 
cl::Hidden, - cl::desc("Don't try to vectorize integer values")); - -static cl::opt<bool> -NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden, - cl::desc("Don't try to vectorize floating-point values")); - -// FIXME: This should default to false once pointer vector support works. -static cl::opt<bool> -NoPointers("bb-vectorize-no-pointers", cl::init(/*false*/ true), cl::Hidden, - cl::desc("Don't try to vectorize pointer values")); - -static cl::opt<bool> -NoCasts("bb-vectorize-no-casts", cl::init(false), cl::Hidden, - cl::desc("Don't try to vectorize casting (conversion) operations")); - -static cl::opt<bool> -NoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden, - cl::desc("Don't try to vectorize floating-point math intrinsics")); - -static cl::opt<bool> - NoBitManipulation("bb-vectorize-no-bitmanip", cl::init(false), cl::Hidden, - cl::desc("Don't try to vectorize BitManipulation intrinsics")); - -static cl::opt<bool> -NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden, - cl::desc("Don't try to vectorize the fused-multiply-add intrinsic")); - -static cl::opt<bool> -NoSelect("bb-vectorize-no-select", cl::init(false), cl::Hidden, - cl::desc("Don't try to vectorize select instructions")); - -static cl::opt<bool> -NoCmp("bb-vectorize-no-cmp", cl::init(false), cl::Hidden, - cl::desc("Don't try to vectorize comparison instructions")); - -static cl::opt<bool> -NoGEP("bb-vectorize-no-gep", cl::init(false), cl::Hidden, - cl::desc("Don't try to vectorize getelementptr instructions")); - -static cl::opt<bool> -NoMemOps("bb-vectorize-no-mem-ops", cl::init(false), cl::Hidden, - cl::desc("Don't try to vectorize loads and stores")); - -static cl::opt<bool> -AlignedOnly("bb-vectorize-aligned-only", cl::init(false), cl::Hidden, - cl::desc("Only generate aligned loads and stores")); - -static cl::opt<bool> -NoMemOpBoost("bb-vectorize-no-mem-op-boost", - cl::init(false), cl::Hidden, - cl::desc("Don't boost the chain-depth contribution of loads and stores")); - -static cl::opt<bool> -FastDep("bb-vectorize-fast-dep", cl::init(false), cl::Hidden, - cl::desc("Use a fast instruction dependency analysis")); - -#ifndef NDEBUG -static cl::opt<bool> -DebugInstructionExamination("bb-vectorize-debug-instruction-examination", - cl::init(false), cl::Hidden, - cl::desc("When debugging is enabled, output information on the" - " instruction-examination process")); -static cl::opt<bool> -DebugCandidateSelection("bb-vectorize-debug-candidate-selection", - cl::init(false), cl::Hidden, - cl::desc("When debugging is enabled, output information on the" - " candidate-selection process")); -static cl::opt<bool> -DebugPairSelection("bb-vectorize-debug-pair-selection", - cl::init(false), cl::Hidden, - cl::desc("When debugging is enabled, output information on the" - " pair-selection process")); -static cl::opt<bool> -DebugCycleCheck("bb-vectorize-debug-cycle-check", - cl::init(false), cl::Hidden, - cl::desc("When debugging is enabled, output information on the" - " cycle-checking process")); - -static cl::opt<bool> -PrintAfterEveryPair("bb-vectorize-debug-print-after-every-pair", - cl::init(false), cl::Hidden, - cl::desc("When debugging is enabled, dump the basic block after" - " every pair is fused")); -#endif - -STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize"); - -namespace { - struct BBVectorize : public BasicBlockPass { - static char ID; // Pass identification, replacement for typeid - - const VectorizeConfig Config; - - BBVectorize(const VectorizeConfig &C = VectorizeConfig()) - : 
BasicBlockPass(ID), Config(C) { - initializeBBVectorizePass(*PassRegistry::getPassRegistry()); - } - - BBVectorize(Pass *P, Function &F, const VectorizeConfig &C) - : BasicBlockPass(ID), Config(C) { - AA = &P->getAnalysis<AAResultsWrapperPass>().getAAResults(); - DT = &P->getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - SE = &P->getAnalysis<ScalarEvolutionWrapperPass>().getSE(); - TLI = &P->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); - TTI = IgnoreTargetInfo - ? nullptr - : &P->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); - } - - typedef std::pair<Value *, Value *> ValuePair; - typedef std::pair<ValuePair, int> ValuePairWithCost; - typedef std::pair<ValuePair, size_t> ValuePairWithDepth; - typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair - typedef std::pair<VPPair, unsigned> VPPairWithType; - - AliasAnalysis *AA; - DominatorTree *DT; - ScalarEvolution *SE; - const TargetLibraryInfo *TLI; - const TargetTransformInfo *TTI; - - // FIXME: const correct? - - bool vectorizePairs(BasicBlock &BB, bool NonPow2Len = false); - - bool getCandidatePairs(BasicBlock &BB, - BasicBlock::iterator &Start, - DenseMap<Value *, std::vector<Value *> > &CandidatePairs, - DenseSet<ValuePair> &FixedOrderPairs, - DenseMap<ValuePair, int> &CandidatePairCostSavings, - std::vector<Value *> &PairableInsts, bool NonPow2Len); - - // FIXME: The current implementation does not account for pairs that - // are connected in multiple ways. For example: - // C1 = A1 / A2; C2 = A2 / A1 (which may be both direct and a swap) - enum PairConnectionType { - PairConnectionDirect, - PairConnectionSwap, - PairConnectionSplat - }; - - void computeConnectedPairs( - DenseMap<Value *, std::vector<Value *> > &CandidatePairs, - DenseSet<ValuePair> &CandidatePairsSet, - std::vector<Value *> &PairableInsts, - DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes); - - void buildDepMap(BasicBlock &BB, - DenseMap<Value *, std::vector<Value *> > &CandidatePairs, - std::vector<Value *> &PairableInsts, - DenseSet<ValuePair> &PairableInstUsers); - - void choosePairs(DenseMap<Value *, std::vector<Value *> > &CandidatePairs, - DenseSet<ValuePair> &CandidatePairsSet, - DenseMap<ValuePair, int> &CandidatePairCostSavings, - std::vector<Value *> &PairableInsts, - DenseSet<ValuePair> &FixedOrderPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes, - DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, - DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps, - DenseSet<ValuePair> &PairableInstUsers, - DenseMap<Value *, Value *>& ChosenPairs); - - void fuseChosenPairs(BasicBlock &BB, - std::vector<Value *> &PairableInsts, - DenseMap<Value *, Value *>& ChosenPairs, - DenseSet<ValuePair> &FixedOrderPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes, - DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, - DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps); - - - bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore); - - bool areInstsCompatible(Instruction *I, Instruction *J, - bool IsSimpleLoadStore, bool NonPow2Len, - int &CostSavings, int &FixedOrder); - - bool trackUsesOfI(DenseSet<Value *> &Users, - AliasSetTracker &WriteSet, Instruction *I, - Instruction *J, bool UpdateUsers = true, - DenseSet<ValuePair> *LoadMoveSetPairs = nullptr); - - void computePairsConnectedTo( - DenseMap<Value *, std::vector<Value *> > &CandidatePairs, - DenseSet<ValuePair> &CandidatePairsSet, - std::vector<Value *> 
&PairableInsts, - DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes, - ValuePair P); - - bool pairsConflict(ValuePair P, ValuePair Q, - DenseSet<ValuePair> &PairableInstUsers, - DenseMap<ValuePair, std::vector<ValuePair> > - *PairableInstUserMap = nullptr, - DenseSet<VPPair> *PairableInstUserPairSet = nullptr); - - bool pairWillFormCycle(ValuePair P, - DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUsers, - DenseSet<ValuePair> &CurrentPairs); - - void pruneDAGFor( - DenseMap<Value *, std::vector<Value *> > &CandidatePairs, - std::vector<Value *> &PairableInsts, - DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, - DenseSet<ValuePair> &PairableInstUsers, - DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, - DenseSet<VPPair> &PairableInstUserPairSet, - DenseMap<Value *, Value *> &ChosenPairs, - DenseMap<ValuePair, size_t> &DAG, - DenseSet<ValuePair> &PrunedDAG, ValuePair J, - bool UseCycleCheck); - - void buildInitialDAGFor( - DenseMap<Value *, std::vector<Value *> > &CandidatePairs, - DenseSet<ValuePair> &CandidatePairsSet, - std::vector<Value *> &PairableInsts, - DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, - DenseSet<ValuePair> &PairableInstUsers, - DenseMap<Value *, Value *> &ChosenPairs, - DenseMap<ValuePair, size_t> &DAG, ValuePair J); - - void findBestDAGFor( - DenseMap<Value *, std::vector<Value *> > &CandidatePairs, - DenseSet<ValuePair> &CandidatePairsSet, - DenseMap<ValuePair, int> &CandidatePairCostSavings, - std::vector<Value *> &PairableInsts, - DenseSet<ValuePair> &FixedOrderPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes, - DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, - DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps, - DenseSet<ValuePair> &PairableInstUsers, - DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, - DenseSet<VPPair> &PairableInstUserPairSet, - DenseMap<Value *, Value *> &ChosenPairs, - DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth, - int &BestEffSize, Value *II, std::vector<Value *>&JJ, - bool UseCycleCheck); - - Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I, - Instruction *J, unsigned o); - - void fillNewShuffleMask(LLVMContext& Context, Instruction *J, - unsigned MaskOffset, unsigned NumInElem, - unsigned NumInElem1, unsigned IdxOffset, - std::vector<Constant*> &Mask); - - Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I, - Instruction *J); - - bool expandIEChain(LLVMContext& Context, Instruction *I, Instruction *J, - unsigned o, Value *&LOp, unsigned numElemL, - Type *ArgTypeL, Type *ArgTypeR, bool IBeforeJ, - unsigned IdxOff = 0); - - Value *getReplacementInput(LLVMContext& Context, Instruction *I, - Instruction *J, unsigned o, bool IBeforeJ); - - void getReplacementInputsForPair(LLVMContext& Context, Instruction *I, - Instruction *J, SmallVectorImpl<Value *> &ReplacedOperands, - bool IBeforeJ); - - void replaceOutputsOfPair(LLVMContext& Context, Instruction *I, - Instruction *J, Instruction *K, - Instruction *&InsertionPt, Instruction *&K1, - Instruction *&K2); - - void collectPairLoadMoveSet(BasicBlock &BB, - DenseMap<Value *, Value *> &ChosenPairs, - DenseMap<Value *, std::vector<Value *> > &LoadMoveSet, - DenseSet<ValuePair> &LoadMoveSetPairs, - Instruction *I); - - void collectLoadMoveSet(BasicBlock &BB, - std::vector<Value *> &PairableInsts, - DenseMap<Value *, Value *> &ChosenPairs, - DenseMap<Value *, std::vector<Value *> > 
&LoadMoveSet, - DenseSet<ValuePair> &LoadMoveSetPairs); - - bool canMoveUsesOfIAfterJ(BasicBlock &BB, - DenseSet<ValuePair> &LoadMoveSetPairs, - Instruction *I, Instruction *J); - - void moveUsesOfIAfterJ(BasicBlock &BB, - DenseSet<ValuePair> &LoadMoveSetPairs, - Instruction *&InsertionPt, - Instruction *I, Instruction *J); - - bool vectorizeBB(BasicBlock &BB) { - if (skipBasicBlock(BB)) - return false; - if (!DT->isReachableFromEntry(&BB)) { - DEBUG(dbgs() << "BBV: skipping unreachable " << BB.getName() << - " in " << BB.getParent()->getName() << "\n"); - return false; - } - - DEBUG(if (TTI) dbgs() << "BBV: using target information\n"); - - bool changed = false; - // Iterate a sufficient number of times to merge types of size 1 bit, - // then 2 bits, then 4, etc. up to half of the target vector width of the - // target vector register. - unsigned n = 1; - for (unsigned v = 2; - (TTI || v <= Config.VectorBits) && - (!Config.MaxIter || n <= Config.MaxIter); - v *= 2, ++n) { - DEBUG(dbgs() << "BBV: fusing loop #" << n << - " for " << BB.getName() << " in " << - BB.getParent()->getName() << "...\n"); - if (vectorizePairs(BB)) - changed = true; - else - break; - } - - if (changed && !Pow2LenOnly) { - ++n; - for (; !Config.MaxIter || n <= Config.MaxIter; ++n) { - DEBUG(dbgs() << "BBV: fusing for non-2^n-length vectors loop #: " << - n << " for " << BB.getName() << " in " << - BB.getParent()->getName() << "...\n"); - if (!vectorizePairs(BB, true)) break; - } - } - - DEBUG(dbgs() << "BBV: done!\n"); - return changed; - } - - bool runOnBasicBlock(BasicBlock &BB) override { - // OptimizeNone check deferred to vectorizeBB(). - - AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); - DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); - TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); - TTI = IgnoreTargetInfo - ? nullptr - : &getAnalysis<TargetTransformInfoWrapperPass>().getTTI( - *BB.getParent()); - - return vectorizeBB(BB); - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - BasicBlockPass::getAnalysisUsage(AU); - AU.addRequired<AAResultsWrapperPass>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<ScalarEvolutionWrapperPass>(); - AU.addRequired<TargetLibraryInfoWrapperPass>(); - AU.addRequired<TargetTransformInfoWrapperPass>(); - AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<GlobalsAAWrapperPass>(); - AU.addPreserved<ScalarEvolutionWrapperPass>(); - AU.addPreserved<SCEVAAWrapperPass>(); - AU.setPreservesCFG(); - } - - static inline VectorType *getVecTypeForPair(Type *ElemTy, Type *Elem2Ty) { - assert(ElemTy->getScalarType() == Elem2Ty->getScalarType() && - "Cannot form vector from incompatible scalar types"); - Type *STy = ElemTy->getScalarType(); - - unsigned numElem; - if (VectorType *VTy = dyn_cast<VectorType>(ElemTy)) { - numElem = VTy->getNumElements(); - } else { - numElem = 1; - } - - if (VectorType *VTy = dyn_cast<VectorType>(Elem2Ty)) { - numElem += VTy->getNumElements(); - } else { - numElem += 1; - } - - return VectorType::get(STy, numElem); - } - - static inline void getInstructionTypes(Instruction *I, - Type *&T1, Type *&T2) { - if (StoreInst *SI = dyn_cast<StoreInst>(I)) { - // For stores, it is the value type, not the pointer type that matters - // because the value is what will come from a vector register. 
-
-        Value *IVal = SI->getValueOperand();
-        T1 = IVal->getType();
-      } else {
-        T1 = I->getType();
-      }
-
-      if (CastInst *CI = dyn_cast<CastInst>(I))
-        T2 = CI->getSrcTy();
-      else
-        T2 = T1;
-
-      if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
-        T2 = SI->getCondition()->getType();
-      } else if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(I)) {
-        T2 = SI->getOperand(0)->getType();
-      } else if (CmpInst *CI = dyn_cast<CmpInst>(I)) {
-        T2 = CI->getOperand(0)->getType();
-      }
-    }
-
-    // Returns the weight associated with the provided value. A chain of
-    // candidate pairs has a length given by the sum of the weights of its
-    // members (one weight per pair; the weight of each member of the pair
-    // is assumed to be the same). This length is then compared to the
-    // chain-length threshold to determine if a given chain is significant
-    // enough to be vectorized. The length is also used in comparing
-    // candidate chains where longer chains are considered to be better.
-    // Note: when this function returns 0, the resulting instructions are
-    // not actually fused.
-    inline size_t getDepthFactor(Value *V) {
-      // InsertElement and ExtractElement have a depth factor of zero. This is
-      // for two reasons: First, they cannot be usefully fused. Second, because
-      // the pass generates a lot of these, they can confuse the simple metric
-      // used to compare the dags in the next iteration. Thus, giving them a
-      // weight of zero allows the pass to essentially ignore them in
-      // subsequent iterations when looking for vectorization opportunities
-      // while still tracking dependency chains that flow through those
-      // instructions.
-      if (isa<InsertElementInst>(V) || isa<ExtractElementInst>(V))
-        return 0;
-
-      // Give a load or store half of the required depth so that load/store
-      // pairs will vectorize.
-      if (!Config.NoMemOpBoost && (isa<LoadInst>(V) || isa<StoreInst>(V)))
-        return Config.ReqChainDepth/2;
-
-      return 1;
-    }
-
-    // Returns the cost of the provided instruction using TTI.
-    // This does not handle loads and stores.
-    unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2,
-                          TargetTransformInfo::OperandValueKind Op1VK =
-                              TargetTransformInfo::OK_AnyValue,
-                          TargetTransformInfo::OperandValueKind Op2VK =
-                              TargetTransformInfo::OK_AnyValue,
-                          const Instruction *I = nullptr) {
-      switch (Opcode) {
-      default: break;
-      case Instruction::GetElementPtr:
-        // We mark this instruction as zero-cost because scalar GEPs are usually
-        // lowered to the instruction addressing mode. At the moment we don't
-        // generate vector GEPs.
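getDepthFactor above turns each pair into a weight so that chains of connected pairs can be compared against Config.ReqChainDepth. A standalone sketch of that accounting, with a hypothetical Op enum and ToyConfig standing in for the real IR and Config objects:

#include <cstdio>
#include <vector>

enum class Op { InsertElement, ExtractElement, Load, Store, Add, Mul };

struct ToyConfig {
  unsigned ReqChainDepth = 6;
  bool NoMemOpBoost = false;
};

size_t depthFactor(Op O, const ToyConfig &Config) {
  if (O == Op::InsertElement || O == Op::ExtractElement)
    return 0; // never counted toward chain length
  if (!Config.NoMemOpBoost && (O == Op::Load || O == Op::Store))
    return Config.ReqChainDepth / 2; // a load/store pair alone already helps
  return 1;
}

int main() {
  ToyConfig Config;
  // One weight per pair along the chain; the chain is worth vectorizing when
  // the summed weights reach the threshold.
  std::vector<Op> Chain = {Op::Load, Op::Add, Op::Mul, Op::Store};
  size_t Depth = 0;
  for (Op O : Chain)
    Depth += depthFactor(O, Config);
  std::printf("chain depth %zu, threshold %u -> %s\n", Depth,
              Config.ReqChainDepth,
              Depth >= Config.ReqChainDepth ? "vectorize" : "skip");
}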
- return 0; - case Instruction::Br: - return TTI->getCFInstrCost(Opcode); - case Instruction::PHI: - return 0; - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::FDiv: - case Instruction::URem: - case Instruction::SRem: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: - return TTI->getArithmeticInstrCost(Opcode, T1, Op1VK, Op2VK); - case Instruction::Select: - case Instruction::ICmp: - case Instruction::FCmp: - return TTI->getCmpSelInstrCost(Opcode, T1, T2, I); - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::BitCast: - case Instruction::ShuffleVector: - return TTI->getCastInstrCost(Opcode, T1, T2, I); - } - - return 1; - } - - // This determines the relative offset of two loads or stores, returning - // true if the offset could be determined to be some constant value. - // For example, if OffsetInElmts == 1, then J accesses the memory directly - // after I; if OffsetInElmts == -1 then I accesses the memory - // directly after J. - bool getPairPtrInfo(Instruction *I, Instruction *J, - Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment, - unsigned &IAddressSpace, unsigned &JAddressSpace, - int64_t &OffsetInElmts, bool ComputeOffset = true) { - OffsetInElmts = 0; - if (LoadInst *LI = dyn_cast<LoadInst>(I)) { - LoadInst *LJ = cast<LoadInst>(J); - IPtr = LI->getPointerOperand(); - JPtr = LJ->getPointerOperand(); - IAlignment = LI->getAlignment(); - JAlignment = LJ->getAlignment(); - IAddressSpace = LI->getPointerAddressSpace(); - JAddressSpace = LJ->getPointerAddressSpace(); - } else { - StoreInst *SI = cast<StoreInst>(I), *SJ = cast<StoreInst>(J); - IPtr = SI->getPointerOperand(); - JPtr = SJ->getPointerOperand(); - IAlignment = SI->getAlignment(); - JAlignment = SJ->getAlignment(); - IAddressSpace = SI->getPointerAddressSpace(); - JAddressSpace = SJ->getPointerAddressSpace(); - } - - if (!ComputeOffset) - return true; - - const SCEV *IPtrSCEV = SE->getSCEV(IPtr); - const SCEV *JPtrSCEV = SE->getSCEV(JPtr); - - // If this is a trivial offset, then we'll get something like - // 1*sizeof(type). With target data, which we need anyway, this will get - // constant folded into a number. 
- const SCEV *OffsetSCEV = SE->getMinusSCEV(JPtrSCEV, IPtrSCEV); - if (const SCEVConstant *ConstOffSCEV = - dyn_cast<SCEVConstant>(OffsetSCEV)) { - ConstantInt *IntOff = ConstOffSCEV->getValue(); - int64_t Offset = IntOff->getSExtValue(); - const DataLayout &DL = I->getModule()->getDataLayout(); - Type *VTy = IPtr->getType()->getPointerElementType(); - int64_t VTyTSS = (int64_t)DL.getTypeStoreSize(VTy); - - Type *VTy2 = JPtr->getType()->getPointerElementType(); - if (VTy != VTy2 && Offset < 0) { - int64_t VTy2TSS = (int64_t)DL.getTypeStoreSize(VTy2); - OffsetInElmts = Offset/VTy2TSS; - return (std::abs(Offset) % VTy2TSS) == 0; - } - - OffsetInElmts = Offset/VTyTSS; - return (std::abs(Offset) % VTyTSS) == 0; - } - - return false; - } - - // Returns true if the provided CallInst represents an intrinsic that can - // be vectorized. - bool isVectorizableIntrinsic(CallInst* I) { - Function *F = I->getCalledFunction(); - if (!F) return false; - - Intrinsic::ID IID = F->getIntrinsicID(); - if (!IID) return false; - - switch(IID) { - default: - return false; - case Intrinsic::sqrt: - case Intrinsic::powi: - case Intrinsic::sin: - case Intrinsic::cos: - case Intrinsic::log: - case Intrinsic::log2: - case Intrinsic::log10: - case Intrinsic::exp: - case Intrinsic::exp2: - case Intrinsic::pow: - case Intrinsic::round: - case Intrinsic::copysign: - case Intrinsic::ceil: - case Intrinsic::nearbyint: - case Intrinsic::rint: - case Intrinsic::trunc: - case Intrinsic::floor: - case Intrinsic::fabs: - case Intrinsic::minnum: - case Intrinsic::maxnum: - return Config.VectorizeMath; - case Intrinsic::bswap: - case Intrinsic::ctpop: - case Intrinsic::ctlz: - case Intrinsic::cttz: - return Config.VectorizeBitManipulations; - case Intrinsic::fma: - case Intrinsic::fmuladd: - return Config.VectorizeFMA; - } - } - - bool isPureIEChain(InsertElementInst *IE) { - InsertElementInst *IENext = IE; - do { - if (!isa<UndefValue>(IENext->getOperand(0)) && - !isa<InsertElementInst>(IENext->getOperand(0))) { - return false; - } - } while ((IENext = - dyn_cast<InsertElementInst>(IENext->getOperand(0)))); - - return true; - } - }; - - // This function implements one vectorization iteration on the provided - // basic block. It returns true if the block is changed. - bool BBVectorize::vectorizePairs(BasicBlock &BB, bool NonPow2Len) { - bool ShouldContinue; - BasicBlock::iterator Start = BB.getFirstInsertionPt(); - - std::vector<Value *> AllPairableInsts; - DenseMap<Value *, Value *> AllChosenPairs; - DenseSet<ValuePair> AllFixedOrderPairs; - DenseMap<VPPair, unsigned> AllPairConnectionTypes; - DenseMap<ValuePair, std::vector<ValuePair> > AllConnectedPairs, - AllConnectedPairDeps; - - do { - std::vector<Value *> PairableInsts; - DenseMap<Value *, std::vector<Value *> > CandidatePairs; - DenseSet<ValuePair> FixedOrderPairs; - DenseMap<ValuePair, int> CandidatePairCostSavings; - ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs, - FixedOrderPairs, - CandidatePairCostSavings, - PairableInsts, NonPow2Len); - if (PairableInsts.empty()) continue; - - // Build the candidate pair set for faster lookups. 
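getPairPtrInfo above uses ScalarEvolution to reduce the pointer difference between two loads or stores to a constant element offset, and areInstsCompatible later insists on |OffsetInElmts| == 1 before pairing them. A standalone sketch of that adjacency test using raw addresses instead of SCEV (pairPtrInfo here is a hypothetical simplification):

#include <cstdint>
#include <cstdio>

// Returns true and sets OffsetInElmts when the byte distance between the two
// accesses is a whole number of elements.
bool pairPtrInfo(uintptr_t IAddr, uintptr_t JAddr, uint64_t ElemStoreSize,
                 int64_t &OffsetInElmts) {
  int64_t ByteOff = static_cast<int64_t>(JAddr) - static_cast<int64_t>(IAddr);
  if (ByteOff % static_cast<int64_t>(ElemStoreSize) != 0)
    return false;
  OffsetInElmts = ByteOff / static_cast<int64_t>(ElemStoreSize);
  return true;
}

int main() {
  float A[8];
  auto Addr = [&](int Idx) { return reinterpret_cast<uintptr_t>(&A[Idx]); };
  int64_t Off = 0;
  // A[3] directly follows A[2]: OffsetInElmts == 1, so I comes before J and
  // the two accesses are candidates for a single vector load/store.
  if (pairPtrInfo(Addr(2), Addr(3), sizeof(float), Off))
    std::printf("offset in elements: %lld -> %s\n", (long long)Off,
                (Off == 1 || Off == -1) ? "adjacent, pairable" : "not adjacent");
}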
- DenseSet<ValuePair> CandidatePairsSet; - for (DenseMap<Value *, std::vector<Value *> >::iterator I = - CandidatePairs.begin(), E = CandidatePairs.end(); I != E; ++I) - for (std::vector<Value *>::iterator J = I->second.begin(), - JE = I->second.end(); J != JE; ++J) - CandidatePairsSet.insert(ValuePair(I->first, *J)); - - // Now we have a map of all of the pairable instructions and we need to - // select the best possible pairing. A good pairing is one such that the - // users of the pair are also paired. This defines a (directed) forest - // over the pairs such that two pairs are connected iff the second pair - // uses the first. - - // Note that it only matters that both members of the second pair use some - // element of the first pair (to allow for splatting). - - DenseMap<ValuePair, std::vector<ValuePair> > ConnectedPairs, - ConnectedPairDeps; - DenseMap<VPPair, unsigned> PairConnectionTypes; - computeConnectedPairs(CandidatePairs, CandidatePairsSet, - PairableInsts, ConnectedPairs, PairConnectionTypes); - if (ConnectedPairs.empty()) continue; - - for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator - I = ConnectedPairs.begin(), IE = ConnectedPairs.end(); - I != IE; ++I) - for (std::vector<ValuePair>::iterator J = I->second.begin(), - JE = I->second.end(); J != JE; ++J) - ConnectedPairDeps[*J].push_back(I->first); - - // Build the pairable-instruction dependency map - DenseSet<ValuePair> PairableInstUsers; - buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers); - - // There is now a graph of the connected pairs. For each variable, pick - // the pairing with the largest dag meeting the depth requirement on at - // least one branch. Then select all pairings that are part of that dag - // and remove them from the list of available pairings and pairable - // variables. - - DenseMap<Value *, Value *> ChosenPairs; - choosePairs(CandidatePairs, CandidatePairsSet, - CandidatePairCostSavings, - PairableInsts, FixedOrderPairs, PairConnectionTypes, - ConnectedPairs, ConnectedPairDeps, - PairableInstUsers, ChosenPairs); - - if (ChosenPairs.empty()) continue; - AllPairableInsts.insert(AllPairableInsts.end(), PairableInsts.begin(), - PairableInsts.end()); - AllChosenPairs.insert(ChosenPairs.begin(), ChosenPairs.end()); - - // Only for the chosen pairs, propagate information on fixed-order pairs, - // pair connections, and their types to the data structures used by the - // pair fusion procedures. 
- for (DenseMap<Value *, Value *>::iterator I = ChosenPairs.begin(), - IE = ChosenPairs.end(); I != IE; ++I) { - if (FixedOrderPairs.count(*I)) - AllFixedOrderPairs.insert(*I); - else if (FixedOrderPairs.count(ValuePair(I->second, I->first))) - AllFixedOrderPairs.insert(ValuePair(I->second, I->first)); - - for (DenseMap<Value *, Value *>::iterator J = ChosenPairs.begin(); - J != IE; ++J) { - DenseMap<VPPair, unsigned>::iterator K = - PairConnectionTypes.find(VPPair(*I, *J)); - if (K != PairConnectionTypes.end()) { - AllPairConnectionTypes.insert(*K); - } else { - K = PairConnectionTypes.find(VPPair(*J, *I)); - if (K != PairConnectionTypes.end()) - AllPairConnectionTypes.insert(*K); - } - } - } - - for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator - I = ConnectedPairs.begin(), IE = ConnectedPairs.end(); - I != IE; ++I) - for (std::vector<ValuePair>::iterator J = I->second.begin(), - JE = I->second.end(); J != JE; ++J) - if (AllPairConnectionTypes.count(VPPair(I->first, *J))) { - AllConnectedPairs[I->first].push_back(*J); - AllConnectedPairDeps[*J].push_back(I->first); - } - } while (ShouldContinue); - - if (AllChosenPairs.empty()) return false; - NumFusedOps += AllChosenPairs.size(); - - // A set of pairs has now been selected. It is now necessary to replace the - // paired instructions with vector instructions. For this procedure each - // operand must be replaced with a vector operand. This vector is formed - // by using build_vector on the old operands. The replaced values are then - // replaced with a vector_extract on the result. Subsequent optimization - // passes should coalesce the build/extract combinations. - - fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs, AllFixedOrderPairs, - AllPairConnectionTypes, - AllConnectedPairs, AllConnectedPairDeps); - - // It is important to cleanup here so that future iterations of this - // function have less work to do. - (void)SimplifyInstructionsInBlock(&BB, TLI); - return true; - } - - // This function returns true if the provided instruction is capable of being - // fused into a vector instruction. This determination is based only on the - // type and other attributes of the instruction. - bool BBVectorize::isInstVectorizable(Instruction *I, - bool &IsSimpleLoadStore) { - IsSimpleLoadStore = false; - - if (CallInst *C = dyn_cast<CallInst>(I)) { - if (!isVectorizableIntrinsic(C)) - return false; - } else if (LoadInst *L = dyn_cast<LoadInst>(I)) { - // Vectorize simple loads if possbile: - IsSimpleLoadStore = L->isSimple(); - if (!IsSimpleLoadStore || !Config.VectorizeMemOps) - return false; - } else if (StoreInst *S = dyn_cast<StoreInst>(I)) { - // Vectorize simple stores if possbile: - IsSimpleLoadStore = S->isSimple(); - if (!IsSimpleLoadStore || !Config.VectorizeMemOps) - return false; - } else if (CastInst *C = dyn_cast<CastInst>(I)) { - // We can vectorize casts, but not casts of pointer types, etc. - if (!Config.VectorizeCasts) - return false; - - Type *SrcTy = C->getSrcTy(); - if (!SrcTy->isSingleValueType()) - return false; - - Type *DestTy = C->getDestTy(); - if (!DestTy->isSingleValueType()) - return false; - } else if (SelectInst *SI = dyn_cast<SelectInst>(I)) { - if (!Config.VectorizeSelect) - return false; - // We can vectorize a select if either all operands are scalars, - // or all operands are vectors. Trying to "widen" a select between - // vectors that has a scalar condition results in a malformed select. 
- // FIXME: We could probably be smarter about this by rewriting the select - // with different types instead. - return (SI->getCondition()->getType()->isVectorTy() == - SI->getTrueValue()->getType()->isVectorTy()); - } else if (isa<CmpInst>(I)) { - if (!Config.VectorizeCmp) - return false; - } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(I)) { - if (!Config.VectorizeGEP) - return false; - - // Currently, vector GEPs exist only with one index. - if (G->getNumIndices() != 1) - return false; - } else if (!(I->isBinaryOp() || isa<ShuffleVectorInst>(I) || - isa<ExtractElementInst>(I) || isa<InsertElementInst>(I))) { - return false; - } - - Type *T1, *T2; - getInstructionTypes(I, T1, T2); - - // Not every type can be vectorized... - if (!(VectorType::isValidElementType(T1) || T1->isVectorTy()) || - !(VectorType::isValidElementType(T2) || T2->isVectorTy())) - return false; - - if (T1->getScalarSizeInBits() == 1) { - if (!Config.VectorizeBools) - return false; - } else { - if (!Config.VectorizeInts && T1->isIntOrIntVectorTy()) - return false; - } - - if (T2->getScalarSizeInBits() == 1) { - if (!Config.VectorizeBools) - return false; - } else { - if (!Config.VectorizeInts && T2->isIntOrIntVectorTy()) - return false; - } - - if (!Config.VectorizeFloats - && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy())) - return false; - - // Don't vectorize target-specific types. - if (T1->isX86_FP80Ty() || T1->isPPC_FP128Ty() || T1->isX86_MMXTy()) - return false; - if (T2->isX86_FP80Ty() || T2->isPPC_FP128Ty() || T2->isX86_MMXTy()) - return false; - - if (!Config.VectorizePointers && (T1->getScalarType()->isPointerTy() || - T2->getScalarType()->isPointerTy())) - return false; - - if (!TTI && (T1->getPrimitiveSizeInBits() >= Config.VectorBits || - T2->getPrimitiveSizeInBits() >= Config.VectorBits)) - return false; - - return true; - } - - // This function returns true if the two provided instructions are compatible - // (meaning that they can be fused into a vector instruction). This assumes - // that I has already been determined to be vectorizable and that J is not - // in the use dag of I. - bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J, - bool IsSimpleLoadStore, bool NonPow2Len, - int &CostSavings, int &FixedOrder) { - DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I << - " <-> " << *J << "\n"); - - CostSavings = 0; - FixedOrder = 0; - - // Loads and stores can be merged if they have different alignments, - // but are otherwise the same. - if (!J->isSameOperationAs(I, Instruction::CompareIgnoringAlignment | - (NonPow2Len ? Instruction::CompareUsingScalarTypes : 0))) - return false; - - Type *IT1, *IT2, *JT1, *JT2; - getInstructionTypes(I, IT1, IT2); - getInstructionTypes(J, JT1, JT2); - unsigned MaxTypeBits = std::max( - IT1->getPrimitiveSizeInBits() + JT1->getPrimitiveSizeInBits(), - IT2->getPrimitiveSizeInBits() + JT2->getPrimitiveSizeInBits()); - if (!TTI && MaxTypeBits > Config.VectorBits) - return false; - - // FIXME: handle addsub-type operations! - - if (IsSimpleLoadStore) { - Value *IPtr, *JPtr; - unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace; - int64_t OffsetInElmts = 0; - if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, - IAddressSpace, JAddressSpace, OffsetInElmts) && - std::abs(OffsetInElmts) == 1) { - FixedOrder = (int) OffsetInElmts; - unsigned BottomAlignment = IAlignment; - if (OffsetInElmts < 0) BottomAlignment = JAlignment; - - Type *aTypeI = isa<StoreInst>(I) ? 
- cast<StoreInst>(I)->getValueOperand()->getType() : I->getType(); - Type *aTypeJ = isa<StoreInst>(J) ? - cast<StoreInst>(J)->getValueOperand()->getType() : J->getType(); - Type *VType = getVecTypeForPair(aTypeI, aTypeJ); - - if (Config.AlignedOnly) { - // An aligned load or store is possible only if the instruction - // with the lower offset has an alignment suitable for the - // vector type. - const DataLayout &DL = I->getModule()->getDataLayout(); - unsigned VecAlignment = DL.getPrefTypeAlignment(VType); - if (BottomAlignment < VecAlignment) - return false; - } - - if (TTI) { - unsigned ICost = TTI->getMemoryOpCost(I->getOpcode(), aTypeI, - IAlignment, IAddressSpace); - unsigned JCost = TTI->getMemoryOpCost(J->getOpcode(), aTypeJ, - JAlignment, JAddressSpace); - unsigned VCost = TTI->getMemoryOpCost(I->getOpcode(), VType, - BottomAlignment, - IAddressSpace); - - ICost += TTI->getAddressComputationCost(aTypeI); - JCost += TTI->getAddressComputationCost(aTypeJ); - VCost += TTI->getAddressComputationCost(VType); - - if (VCost > ICost + JCost) - return false; - - // We don't want to fuse to a type that will be split, even - // if the two input types will also be split and there is no other - // associated cost. - unsigned VParts = TTI->getNumberOfParts(VType); - if (VParts > 1) - return false; - else if (!VParts && VCost == ICost + JCost) - return false; - - CostSavings = ICost + JCost - VCost; - } - } else { - return false; - } - } else if (TTI) { - TargetTransformInfo::OperandValueKind Op1VK = - TargetTransformInfo::OK_AnyValue; - TargetTransformInfo::OperandValueKind Op2VK = - TargetTransformInfo::OK_AnyValue; - unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2, Op1VK, Op2VK, I); - unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2, Op1VK, Op2VK, J); - Type *VT1 = getVecTypeForPair(IT1, JT1), - *VT2 = getVecTypeForPair(IT2, JT2); - - // On some targets (example X86) the cost of a vector shift may vary - // depending on whether the second operand is a Uniform or - // NonUniform Constant. - switch (I->getOpcode()) { - default : break; - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - - // If both I and J are scalar shifts by constant, then the - // merged vector shift count would be either a constant splat value - // or a non-uniform vector of constants. - if (ConstantInt *CII = dyn_cast<ConstantInt>(I->getOperand(1))) { - if (ConstantInt *CIJ = dyn_cast<ConstantInt>(J->getOperand(1))) - Op2VK = CII == CIJ ? TargetTransformInfo::OK_UniformConstantValue : - TargetTransformInfo::OK_NonUniformConstantValue; - } else { - // Check for a splat of a constant or for a non uniform vector - // of constants. - Value *IOp = I->getOperand(1); - Value *JOp = J->getOperand(1); - if ((isa<ConstantVector>(IOp) || isa<ConstantDataVector>(IOp)) && - (isa<ConstantVector>(JOp) || isa<ConstantDataVector>(JOp))) { - Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; - Constant *SplatValue = cast<Constant>(IOp)->getSplatValue(); - if (SplatValue != nullptr && - SplatValue == cast<Constant>(JOp)->getSplatValue()) - Op2VK = TargetTransformInfo::OK_UniformConstantValue; - } - } - } - - // Note that this procedure is incorrect for insert and extract element - // instructions (because combining these often results in a shuffle), - // but this cost is ignored (because insert and extract element - // instructions are assigned a zero depth factor and are not really - // fused in general). 
- unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2, Op1VK, Op2VK, I); - - if (VCost > ICost + JCost) - return false; - - // We don't want to fuse to a type that will be split, even - // if the two input types will also be split and there is no other - // associated cost. - unsigned VParts1 = TTI->getNumberOfParts(VT1), - VParts2 = TTI->getNumberOfParts(VT2); - if (VParts1 > 1 || VParts2 > 1) - return false; - else if ((!VParts1 || !VParts2) && VCost == ICost + JCost) - return false; - - CostSavings = ICost + JCost - VCost; - } - - // The powi,ctlz,cttz intrinsics are special because only the first - // argument is vectorized, the second arguments must be equal. - CallInst *CI = dyn_cast<CallInst>(I); - Function *FI; - if (CI && (FI = CI->getCalledFunction())) { - Intrinsic::ID IID = FI->getIntrinsicID(); - if (IID == Intrinsic::powi || IID == Intrinsic::ctlz || - IID == Intrinsic::cttz) { - Value *A1I = CI->getArgOperand(1), - *A1J = cast<CallInst>(J)->getArgOperand(1); - const SCEV *A1ISCEV = SE->getSCEV(A1I), - *A1JSCEV = SE->getSCEV(A1J); - return (A1ISCEV == A1JSCEV); - } - - if (IID && TTI) { - FastMathFlags FMFCI; - if (auto *FPMOCI = dyn_cast<FPMathOperator>(CI)) - FMFCI = FPMOCI->getFastMathFlags(); - SmallVector<Value *, 4> IArgs(CI->arg_operands()); - unsigned ICost = TTI->getIntrinsicInstrCost(IID, IT1, IArgs, FMFCI); - - CallInst *CJ = cast<CallInst>(J); - - FastMathFlags FMFCJ; - if (auto *FPMOCJ = dyn_cast<FPMathOperator>(CJ)) - FMFCJ = FPMOCJ->getFastMathFlags(); - - SmallVector<Value *, 4> JArgs(CJ->arg_operands()); - unsigned JCost = TTI->getIntrinsicInstrCost(IID, JT1, JArgs, FMFCJ); - - assert(CI->getNumArgOperands() == CJ->getNumArgOperands() && - "Intrinsic argument counts differ"); - SmallVector<Type*, 4> Tys; - SmallVector<Value *, 4> VecArgs; - for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { - if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz || - IID == Intrinsic::cttz) && i == 1) { - Tys.push_back(CI->getArgOperand(i)->getType()); - VecArgs.push_back(CI->getArgOperand(i)); - } - else { - Tys.push_back(getVecTypeForPair(CI->getArgOperand(i)->getType(), - CJ->getArgOperand(i)->getType())); - // Add both operands, and then count their scalarization overhead - // with VF 1. - VecArgs.push_back(CI->getArgOperand(i)); - VecArgs.push_back(CJ->getArgOperand(i)); - } - } - - // Compute the scalarization cost here with the original operands (to - // check for uniqueness etc), and then call getIntrinsicInstrCost() - // with the constructed vector types. - Type *RetTy = getVecTypeForPair(IT1, JT1); - unsigned ScalarizationCost = 0; - if (!RetTy->isVoidTy()) - ScalarizationCost += TTI->getScalarizationOverhead(RetTy, true, false); - ScalarizationCost += TTI->getOperandsScalarizationOverhead(VecArgs, 1); - - FastMathFlags FMFV = FMFCI; - FMFV &= FMFCJ; - unsigned VCost = TTI->getIntrinsicInstrCost(IID, RetTy, Tys, FMFV, - ScalarizationCost); - - if (VCost > ICost + JCost) - return false; - - // We don't want to fuse to a type that will be split, even - // if the two input types will also be split and there is no other - // associated cost. 
- unsigned RetParts = TTI->getNumberOfParts(RetTy); - if (RetParts > 1) - return false; - else if (!RetParts && VCost == ICost + JCost) - return false; - - for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { - if (!Tys[i]->isVectorTy()) - continue; - - unsigned NumParts = TTI->getNumberOfParts(Tys[i]); - if (NumParts > 1) - return false; - else if (!NumParts && VCost == ICost + JCost) - return false; - } - - CostSavings = ICost + JCost - VCost; - } - } - - return true; - } - - // Figure out whether or not J uses I and update the users and write-set - // structures associated with I. Specifically, Users represents the set of - // instructions that depend on I. WriteSet represents the set - // of memory locations that are dependent on I. If UpdateUsers is true, - // and J uses I, then Users is updated to contain J and WriteSet is updated - // to contain any memory locations to which J writes. The function returns - // true if J uses I. By default, alias analysis is used to determine - // whether J reads from memory that overlaps with a location in WriteSet. - // If LoadMoveSet is not null, then it is a previously-computed map - // where the key is the memory-based user instruction and the value is - // the instruction to be compared with I. So, if LoadMoveSet is provided, - // then the alias analysis is not used. This is necessary because this - // function is called during the process of moving instructions during - // vectorization and the results of the alias analysis are not stable during - // that process. - bool BBVectorize::trackUsesOfI(DenseSet<Value *> &Users, - AliasSetTracker &WriteSet, Instruction *I, - Instruction *J, bool UpdateUsers, - DenseSet<ValuePair> *LoadMoveSetPairs) { - bool UsesI = false; - - // This instruction may already be marked as a user due, for example, to - // being a member of a selected pair. - if (Users.count(J)) - UsesI = true; - - if (!UsesI) - for (User::op_iterator JU = J->op_begin(), JE = J->op_end(); - JU != JE; ++JU) { - Value *V = *JU; - if (I == V || Users.count(V)) { - UsesI = true; - break; - } - } - if (!UsesI && J->mayReadFromMemory()) { - if (LoadMoveSetPairs) { - UsesI = LoadMoveSetPairs->count(ValuePair(J, I)); - } else { - for (AliasSetTracker::iterator W = WriteSet.begin(), - WE = WriteSet.end(); W != WE; ++W) { - if (W->aliasesUnknownInst(J, *AA)) { - UsesI = true; - break; - } - } - } - } - - if (UsesI && UpdateUsers) { - if (J->mayWriteToMemory()) WriteSet.add(J); - Users.insert(J); - } - - return UsesI; - } - - // This function iterates over all instruction pairs in the provided - // basic block and collects all candidate pairs for vectorization. - bool BBVectorize::getCandidatePairs(BasicBlock &BB, - BasicBlock::iterator &Start, - DenseMap<Value *, std::vector<Value *> > &CandidatePairs, - DenseSet<ValuePair> &FixedOrderPairs, - DenseMap<ValuePair, int> &CandidatePairCostSavings, - std::vector<Value *> &PairableInsts, bool NonPow2Len) { - size_t TotalPairs = 0; - BasicBlock::iterator E = BB.end(); - if (Start == E) return false; - - bool ShouldContinue = false, IAfterStart = false; - for (BasicBlock::iterator I = Start++; I != E; ++I) { - if (I == Start) IAfterStart = true; - - bool IsSimpleLoadStore; - if (!isInstVectorizable(&*I, IsSimpleLoadStore)) - continue; - - // Look for an instruction with which to pair instruction *I... 
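trackUsesOfI above decides whether J depends on I, either through an operand chain or by reading memory that I (or one of I's users) may have written, which is what keeps candidate pairs legal to fuse. A standalone sketch with a toy instruction record in place of LLVM's Instruction and AliasSetTracker:

#include <cstdio>
#include <set>
#include <vector>

struct ToyInst {
  int Id;
  std::vector<int> Operands; // ids of the instructions this one reads
  int WritesLoc;             // memory location written, -1 if none
  int ReadsLoc;              // memory location read, -1 if none
};

bool trackUsesOfI(std::set<int> &Users, std::set<int> &WriteSet,
                  const ToyInst &I, const ToyInst &J) {
  bool UsesI = false;
  for (int Op : J.Operands)
    if (Op == I.Id || Users.count(Op)) { UsesI = true; break; }
  // A memory read counts as a use if it may read a location written by I or
  // by anything already known to depend on I.
  if (!UsesI && J.ReadsLoc != -1 && WriteSet.count(J.ReadsLoc))
    UsesI = true;
  if (UsesI) {
    Users.insert(J.Id);
    if (J.WritesLoc != -1) WriteSet.insert(J.WritesLoc);
  }
  return UsesI;
}

int main() {
  ToyInst I{0, {}, 7, -1};  // I stores to location 7
  ToyInst J1{1, {}, -1, 7}; // J1 loads location 7, so it depends on I
  ToyInst J2{2, {}, -1, 9}; // J2 touches unrelated memory
  std::set<int> Users;
  std::set<int> WriteSet{I.WritesLoc};
  std::printf("J1 uses I: %d\n", trackUsesOfI(Users, WriteSet, I, J1)); // 1
  std::printf("J2 uses I: %d\n", trackUsesOfI(Users, WriteSet, I, J2)); // 0
}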
- DenseSet<Value *> Users; - AliasSetTracker WriteSet(*AA); - if (I->mayWriteToMemory()) - WriteSet.add(&*I); - - bool JAfterStart = IAfterStart; - BasicBlock::iterator J = std::next(I); - for (unsigned ss = 0; J != E && ss <= Config.SearchLimit; ++J, ++ss) { - if (J == Start) - JAfterStart = true; - - // Determine if J uses I, if so, exit the loop. - bool UsesI = trackUsesOfI(Users, WriteSet, &*I, &*J, !Config.FastDep); - if (Config.FastDep) { - // Note: For this heuristic to be effective, independent operations - // must tend to be intermixed. This is likely to be true from some - // kinds of grouped loop unrolling (but not the generic LLVM pass), - // but otherwise may require some kind of reordering pass. - - // When using fast dependency analysis, - // stop searching after first use: - if (UsesI) break; - } else { - if (UsesI) continue; - } - - // J does not use I, and comes before the first use of I, so it can be - // merged with I if the instructions are compatible. - int CostSavings, FixedOrder; - if (!areInstsCompatible(&*I, &*J, IsSimpleLoadStore, NonPow2Len, - CostSavings, FixedOrder)) - continue; - - // J is a candidate for merging with I. - if (PairableInsts.empty() || - PairableInsts[PairableInsts.size() - 1] != &*I) { - PairableInsts.push_back(&*I); - } - - CandidatePairs[&*I].push_back(&*J); - ++TotalPairs; - if (TTI) - CandidatePairCostSavings.insert( - ValuePairWithCost(ValuePair(&*I, &*J), CostSavings)); - - if (FixedOrder == 1) - FixedOrderPairs.insert(ValuePair(&*I, &*J)); - else if (FixedOrder == -1) - FixedOrderPairs.insert(ValuePair(&*J, &*I)); - - // The next call to this function must start after the last instruction - // selected during this invocation. - if (JAfterStart) { - Start = std::next(J); - IAfterStart = JAfterStart = false; - } - - DEBUG(if (DebugCandidateSelection) dbgs() << "BBV: candidate pair " - << *I << " <-> " << *J << " (cost savings: " << - CostSavings << ")\n"); - - // If we have already found too many pairs, break here and this function - // will be called again starting after the last instruction selected - // during this invocation. - if (PairableInsts.size() >= Config.MaxInsts || - TotalPairs >= Config.MaxPairs) { - ShouldContinue = true; - break; - } - } - - if (ShouldContinue) - break; - } - - DEBUG(dbgs() << "BBV: found " << PairableInsts.size() - << " instructions with candidate pairs\n"); - - return ShouldContinue; - } - - // Finds candidate pairs connected to the pair P = <PI, PJ>. This means that - // it looks for pairs such that both members have an input which is an - // output of PI or PJ. - void BBVectorize::computePairsConnectedTo( - DenseMap<Value *, std::vector<Value *> > &CandidatePairs, - DenseSet<ValuePair> &CandidatePairsSet, - std::vector<Value *> &PairableInsts, - DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes, - ValuePair P) { - StoreInst *SI, *SJ; - - // For each possible pairing for this variable, look at the uses of - // the first value... - for (Value::user_iterator I = P.first->user_begin(), - E = P.first->user_end(); - I != E; ++I) { - User *UI = *I; - if (isa<LoadInst>(UI)) { - // A pair cannot be connected to a load because the load only takes one - // operand (the address) and it is a scalar even after vectorization. - continue; - } else if ((SI = dyn_cast<StoreInst>(UI)) && - P.first == SI->getPointerOperand()) { - // Similarly, a pair cannot be connected to a store through its - // pointer operand. 
- continue; - } - - // For each use of the first variable, look for uses of the second - // variable... - for (User *UJ : P.second->users()) { - if ((SJ = dyn_cast<StoreInst>(UJ)) && - P.second == SJ->getPointerOperand()) - continue; - - // Look for <I, J>: - if (CandidatePairsSet.count(ValuePair(UI, UJ))) { - VPPair VP(P, ValuePair(UI, UJ)); - ConnectedPairs[VP.first].push_back(VP.second); - PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionDirect)); - } - - // Look for <J, I>: - if (CandidatePairsSet.count(ValuePair(UJ, UI))) { - VPPair VP(P, ValuePair(UJ, UI)); - ConnectedPairs[VP.first].push_back(VP.second); - PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSwap)); - } - } - - if (Config.SplatBreaksChain) continue; - // Look for cases where just the first value in the pair is used by - // both members of another pair (splatting). - for (Value::user_iterator J = P.first->user_begin(); J != E; ++J) { - User *UJ = *J; - if ((SJ = dyn_cast<StoreInst>(UJ)) && - P.first == SJ->getPointerOperand()) - continue; - - if (CandidatePairsSet.count(ValuePair(UI, UJ))) { - VPPair VP(P, ValuePair(UI, UJ)); - ConnectedPairs[VP.first].push_back(VP.second); - PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat)); - } - } - } - - if (Config.SplatBreaksChain) return; - // Look for cases where just the second value in the pair is used by - // both members of another pair (splatting). - for (Value::user_iterator I = P.second->user_begin(), - E = P.second->user_end(); - I != E; ++I) { - User *UI = *I; - if (isa<LoadInst>(UI)) - continue; - else if ((SI = dyn_cast<StoreInst>(UI)) && - P.second == SI->getPointerOperand()) - continue; - - for (Value::user_iterator J = P.second->user_begin(); J != E; ++J) { - User *UJ = *J; - if ((SJ = dyn_cast<StoreInst>(UJ)) && - P.second == SJ->getPointerOperand()) - continue; - - if (CandidatePairsSet.count(ValuePair(UI, UJ))) { - VPPair VP(P, ValuePair(UI, UJ)); - ConnectedPairs[VP.first].push_back(VP.second); - PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat)); - } - } - } - } - - // This function figures out which pairs are connected. Two pairs are - // connected if some output of the first pair forms an input to both members - // of the second pair. - void BBVectorize::computeConnectedPairs( - DenseMap<Value *, std::vector<Value *> > &CandidatePairs, - DenseSet<ValuePair> &CandidatePairsSet, - std::vector<Value *> &PairableInsts, - DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes) { - for (std::vector<Value *>::iterator PI = PairableInsts.begin(), - PE = PairableInsts.end(); PI != PE; ++PI) { - DenseMap<Value *, std::vector<Value *> >::iterator PP = - CandidatePairs.find(*PI); - if (PP == CandidatePairs.end()) - continue; - - for (std::vector<Value *>::iterator P = PP->second.begin(), - E = PP->second.end(); P != E; ++P) - computePairsConnectedTo(CandidatePairs, CandidatePairsSet, - PairableInsts, ConnectedPairs, - PairConnectionTypes, ValuePair(*PI, *P)); - } - - DEBUG(size_t TotalPairs = 0; - for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator I = - ConnectedPairs.begin(), IE = ConnectedPairs.end(); I != IE; ++I) - TotalPairs += I->second.size(); - dbgs() << "BBV: found " << TotalPairs - << " pair connections.\n"); - } - - // This function builds a set of use tuples such that <A, B> is in the set - // if B is in the use dag of A. If B is in the use dag of A, then B - // depends on the output of A. 
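computePairsConnectedTo above records whether a candidate pair feeds another pair directly, with its operands swapped, or as a splat of a single member; those labels later determine which shuffles fusion has to insert. A standalone sketch of that classification over explicit use edges (Value, UseEdge and classify are hypothetical names):

#include <cstdio>
#include <set>
#include <utility>

enum class Connection { None, Direct, Swap, Splat };

using Value = int;
using UseEdge = std::pair<Value, Value>; // (user, used-value)

Connection classify(const std::set<UseEdge> &Uses, Value A, Value B, Value X,
                    Value Y) {
  auto uses = [&](Value User, Value V) { return Uses.count({User, V}) != 0; };
  if (uses(X, A) && uses(Y, B)) return Connection::Direct;
  if (uses(X, B) && uses(Y, A)) return Connection::Swap;
  if ((uses(X, A) && uses(Y, A)) || (uses(X, B) && uses(Y, B)))
    return Connection::Splat;
  return Connection::None;
}

int main() {
  // X uses A and Y uses B, so pair <A,B> connects to pair <X,Y> directly.
  std::set<UseEdge> Uses = {{10, 1}, {11, 2}};
  Connection C = classify(Uses, /*A=*/1, /*B=*/2, /*X=*/10, /*Y=*/11);
  std::printf("connection: %d (1=direct, 2=swap, 3=splat)\n", (int)C);
}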
- void BBVectorize::buildDepMap( - BasicBlock &BB, - DenseMap<Value *, std::vector<Value *> > &CandidatePairs, - std::vector<Value *> &PairableInsts, - DenseSet<ValuePair> &PairableInstUsers) { - DenseSet<Value *> IsInPair; - for (DenseMap<Value *, std::vector<Value *> >::iterator C = - CandidatePairs.begin(), E = CandidatePairs.end(); C != E; ++C) { - IsInPair.insert(C->first); - IsInPair.insert(C->second.begin(), C->second.end()); - } - - // Iterate through the basic block, recording all users of each - // pairable instruction. - - BasicBlock::iterator E = BB.end(), EL = - BasicBlock::iterator(cast<Instruction>(PairableInsts.back())); - for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) { - if (IsInPair.find(&*I) == IsInPair.end()) - continue; - - DenseSet<Value *> Users; - AliasSetTracker WriteSet(*AA); - if (I->mayWriteToMemory()) - WriteSet.add(&*I); - - for (BasicBlock::iterator J = std::next(I); J != E; ++J) { - (void)trackUsesOfI(Users, WriteSet, &*I, &*J); - - if (J == EL) - break; - } - - for (DenseSet<Value *>::iterator U = Users.begin(), E = Users.end(); - U != E; ++U) { - if (IsInPair.find(*U) == IsInPair.end()) continue; - PairableInstUsers.insert(ValuePair(&*I, *U)); - } - - if (I == EL) - break; - } - } - - // Returns true if an input to pair P is an output of pair Q and also an - // input of pair Q is an output of pair P. If this is the case, then these - // two pairs cannot be simultaneously fused. - bool BBVectorize::pairsConflict(ValuePair P, ValuePair Q, - DenseSet<ValuePair> &PairableInstUsers, - DenseMap<ValuePair, std::vector<ValuePair> > *PairableInstUserMap, - DenseSet<VPPair> *PairableInstUserPairSet) { - // Two pairs are in conflict if they are mutual Users of eachother. - bool QUsesP = PairableInstUsers.count(ValuePair(P.first, Q.first)) || - PairableInstUsers.count(ValuePair(P.first, Q.second)) || - PairableInstUsers.count(ValuePair(P.second, Q.first)) || - PairableInstUsers.count(ValuePair(P.second, Q.second)); - bool PUsesQ = PairableInstUsers.count(ValuePair(Q.first, P.first)) || - PairableInstUsers.count(ValuePair(Q.first, P.second)) || - PairableInstUsers.count(ValuePair(Q.second, P.first)) || - PairableInstUsers.count(ValuePair(Q.second, P.second)); - if (PairableInstUserMap) { - // FIXME: The expensive part of the cycle check is not so much the cycle - // check itself but this edge insertion procedure. This needs some - // profiling and probably a different data structure. - if (PUsesQ) { - if (PairableInstUserPairSet->insert(VPPair(Q, P)).second) - (*PairableInstUserMap)[Q].push_back(P); - } - if (QUsesP) { - if (PairableInstUserPairSet->insert(VPPair(P, Q)).second) - (*PairableInstUserMap)[P].push_back(Q); - } - } - - return (QUsesP && PUsesQ); - } - - // This function walks the use graph of current pairs to see if, starting - // from P, the walk returns to P. - bool BBVectorize::pairWillFormCycle(ValuePair P, - DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, - DenseSet<ValuePair> &CurrentPairs) { - DEBUG(if (DebugCycleCheck) - dbgs() << "BBV: starting cycle check for : " << *P.first << " <-> " - << *P.second << "\n"); - // A lookup table of visisted pairs is kept because the PairableInstUserMap - // contains non-direct associations. 
- DenseSet<ValuePair> Visited; - SmallVector<ValuePair, 32> Q; - // General depth-first post-order traversal: - Q.push_back(P); - do { - ValuePair QTop = Q.pop_back_val(); - Visited.insert(QTop); - - DEBUG(if (DebugCycleCheck) - dbgs() << "BBV: cycle check visiting: " << *QTop.first << " <-> " - << *QTop.second << "\n"); - DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ = - PairableInstUserMap.find(QTop); - if (QQ == PairableInstUserMap.end()) - continue; - - for (std::vector<ValuePair>::iterator C = QQ->second.begin(), - CE = QQ->second.end(); C != CE; ++C) { - if (*C == P) { - DEBUG(dbgs() - << "BBV: rejected to prevent non-trivial cycle formation: " - << QTop.first << " <-> " << C->second << "\n"); - return true; - } - - if (CurrentPairs.count(*C) && !Visited.count(*C)) - Q.push_back(*C); - } - } while (!Q.empty()); - - return false; - } - - // This function builds the initial dag of connected pairs with the - // pair J at the root. - void BBVectorize::buildInitialDAGFor( - DenseMap<Value *, std::vector<Value *> > &CandidatePairs, - DenseSet<ValuePair> &CandidatePairsSet, - std::vector<Value *> &PairableInsts, - DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, - DenseSet<ValuePair> &PairableInstUsers, - DenseMap<Value *, Value *> &ChosenPairs, - DenseMap<ValuePair, size_t> &DAG, ValuePair J) { - // Each of these pairs is viewed as the root node of a DAG. The DAG - // is then walked (depth-first). As this happens, we keep track of - // the pairs that compose the DAG and the maximum depth of the DAG. - SmallVector<ValuePairWithDepth, 32> Q; - // General depth-first post-order traversal: - Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first))); - do { - ValuePairWithDepth QTop = Q.back(); - - // Push each child onto the queue: - bool MoreChildren = false; - size_t MaxChildDepth = QTop.second; - DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ = - ConnectedPairs.find(QTop.first); - if (QQ != ConnectedPairs.end()) - for (std::vector<ValuePair>::iterator k = QQ->second.begin(), - ke = QQ->second.end(); k != ke; ++k) { - // Make sure that this child pair is still a candidate: - if (CandidatePairsSet.count(*k)) { - DenseMap<ValuePair, size_t>::iterator C = DAG.find(*k); - if (C == DAG.end()) { - size_t d = getDepthFactor(k->first); - Q.push_back(ValuePairWithDepth(*k, QTop.second+d)); - MoreChildren = true; - } else { - MaxChildDepth = std::max(MaxChildDepth, C->second); - } - } - } - - if (!MoreChildren) { - // Record the current pair as part of the DAG: - DAG.insert(ValuePairWithDepth(QTop.first, MaxChildDepth)); - Q.pop_back(); - } - } while (!Q.empty()); - } - - // Given some initial dag, prune it by removing conflicting pairs (pairs - // that cannot be simultaneously chosen for vectorization). 
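pairWillFormCycle above walks the pair-user graph depth-first from the candidate pair and rejects it if the walk returns to where it started, since fusing such a pair would create a pairing-induced dependency cycle. A standalone sketch of that reachability check (Pair is just an int stand-in for ValuePair):

#include <cstdio>
#include <map>
#include <set>
#include <vector>

using Pair = int;

bool pairWillFormCycle(Pair P,
                       const std::map<Pair, std::vector<Pair>> &UserMap,
                       const std::set<Pair> &CurrentPairs) {
  std::set<Pair> Visited;
  std::vector<Pair> Worklist{P};
  while (!Worklist.empty()) {
    Pair Top = Worklist.back();
    Worklist.pop_back();
    Visited.insert(Top);
    auto It = UserMap.find(Top);
    if (It == UserMap.end()) continue;
    for (Pair Next : It->second) {
      if (Next == P) return true; // walked back to the starting pair
      if (CurrentPairs.count(Next) && !Visited.count(Next))
        Worklist.push_back(Next);
    }
  }
  return false;
}

int main() {
  // Pair 0 is used by pair 1 and pair 1 is used by pair 0: mutual dependency.
  std::map<Pair, std::vector<Pair>> UserMap = {{0, {1}}, {1, {0}}};
  std::set<Pair> Current = {1};
  std::printf("fusing pair 0 forms a cycle: %d\n",
              pairWillFormCycle(0, UserMap, Current)); // prints 1
}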
- void BBVectorize::pruneDAGFor( - DenseMap<Value *, std::vector<Value *> > &CandidatePairs, - std::vector<Value *> &PairableInsts, - DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, - DenseSet<ValuePair> &PairableInstUsers, - DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, - DenseSet<VPPair> &PairableInstUserPairSet, - DenseMap<Value *, Value *> &ChosenPairs, - DenseMap<ValuePair, size_t> &DAG, - DenseSet<ValuePair> &PrunedDAG, ValuePair J, - bool UseCycleCheck) { - SmallVector<ValuePairWithDepth, 32> Q; - // General depth-first post-order traversal: - Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first))); - do { - ValuePairWithDepth QTop = Q.pop_back_val(); - PrunedDAG.insert(QTop.first); - - // Visit each child, pruning as necessary... - SmallVector<ValuePairWithDepth, 8> BestChildren; - DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ = - ConnectedPairs.find(QTop.first); - if (QQ == ConnectedPairs.end()) - continue; - - for (std::vector<ValuePair>::iterator K = QQ->second.begin(), - KE = QQ->second.end(); K != KE; ++K) { - DenseMap<ValuePair, size_t>::iterator C = DAG.find(*K); - if (C == DAG.end()) continue; - - // This child is in the DAG, now we need to make sure it is the - // best of any conflicting children. There could be multiple - // conflicting children, so first, determine if we're keeping - // this child, then delete conflicting children as necessary. - - // It is also necessary to guard against pairing-induced - // dependencies. Consider instructions a .. x .. y .. b - // such that (a,b) are to be fused and (x,y) are to be fused - // but a is an input to x and b is an output from y. This - // means that y cannot be moved after b but x must be moved - // after b for (a,b) to be fused. In other words, after - // fusing (a,b) we have y .. a/b .. x where y is an input - // to a/b and x is an output to a/b: x and y can no longer - // be legally fused. To prevent this condition, we must - // make sure that a child pair added to the DAG is not - // both an input and output of an already-selected pair. - - // Pairing-induced dependencies can also form from more complicated - // cycles. The pair vs. pair conflicts are easy to check, and so - // that is done explicitly for "fast rejection", and because for - // child vs. child conflicts, we may prefer to keep the current - // pair in preference to the already-selected child. - DenseSet<ValuePair> CurrentPairs; - - bool CanAdd = true; - for (SmallVectorImpl<ValuePairWithDepth>::iterator C2 - = BestChildren.begin(), E2 = BestChildren.end(); - C2 != E2; ++C2) { - if (C2->first.first == C->first.first || - C2->first.first == C->first.second || - C2->first.second == C->first.first || - C2->first.second == C->first.second || - pairsConflict(C2->first, C->first, PairableInstUsers, - UseCycleCheck ? &PairableInstUserMap : nullptr, - UseCycleCheck ? &PairableInstUserPairSet - : nullptr)) { - if (C2->second >= C->second) { - CanAdd = false; - break; - } - - CurrentPairs.insert(C2->first); - } - } - if (!CanAdd) continue; - - // Even worse, this child could conflict with another node already - // selected for the DAG. If that is the case, ignore this child. - for (DenseSet<ValuePair>::iterator T = PrunedDAG.begin(), - E2 = PrunedDAG.end(); T != E2; ++T) { - if (T->first == C->first.first || - T->first == C->first.second || - T->second == C->first.first || - T->second == C->first.second || - pairsConflict(*T, C->first, PairableInstUsers, - UseCycleCheck ? 
&PairableInstUserMap : nullptr, - UseCycleCheck ? &PairableInstUserPairSet - : nullptr)) { - CanAdd = false; - break; - } - - CurrentPairs.insert(*T); - } - if (!CanAdd) continue; - - // And check the queue too... - for (SmallVectorImpl<ValuePairWithDepth>::iterator C2 = Q.begin(), - E2 = Q.end(); C2 != E2; ++C2) { - if (C2->first.first == C->first.first || - C2->first.first == C->first.second || - C2->first.second == C->first.first || - C2->first.second == C->first.second || - pairsConflict(C2->first, C->first, PairableInstUsers, - UseCycleCheck ? &PairableInstUserMap : nullptr, - UseCycleCheck ? &PairableInstUserPairSet - : nullptr)) { - CanAdd = false; - break; - } - - CurrentPairs.insert(C2->first); - } - if (!CanAdd) continue; - - // Last but not least, check for a conflict with any of the - // already-chosen pairs. - for (DenseMap<Value *, Value *>::iterator C2 = - ChosenPairs.begin(), E2 = ChosenPairs.end(); - C2 != E2; ++C2) { - if (pairsConflict(*C2, C->first, PairableInstUsers, - UseCycleCheck ? &PairableInstUserMap : nullptr, - UseCycleCheck ? &PairableInstUserPairSet - : nullptr)) { - CanAdd = false; - break; - } - - CurrentPairs.insert(*C2); - } - if (!CanAdd) continue; - - // To check for non-trivial cycles formed by the addition of the - // current pair we've formed a list of all relevant pairs, now use a - // graph walk to check for a cycle. We start from the current pair and - // walk the use dag to see if we again reach the current pair. If we - // do, then the current pair is rejected. - - // FIXME: It may be more efficient to use a topological-ordering - // algorithm to improve the cycle check. This should be investigated. - if (UseCycleCheck && - pairWillFormCycle(C->first, PairableInstUserMap, CurrentPairs)) - continue; - - // This child can be added, but we may have chosen it in preference - // to an already-selected child. Check for this here, and if a - // conflict is found, then remove the previously-selected child - // before adding this one in its place. - for (SmallVectorImpl<ValuePairWithDepth>::iterator C2 - = BestChildren.begin(); C2 != BestChildren.end();) { - if (C2->first.first == C->first.first || - C2->first.first == C->first.second || - C2->first.second == C->first.first || - C2->first.second == C->first.second || - pairsConflict(C2->first, C->first, PairableInstUsers)) - C2 = BestChildren.erase(C2); - else - ++C2; - } - - BestChildren.push_back(ValuePairWithDepth(C->first, C->second)); - } - - for (SmallVectorImpl<ValuePairWithDepth>::iterator C - = BestChildren.begin(), E2 = BestChildren.end(); - C != E2; ++C) { - size_t DepthF = getDepthFactor(C->first.first); - Q.push_back(ValuePairWithDepth(C->first, QTop.second+DepthF)); - } - } while (!Q.empty()); - } - - // This function finds the best dag of mututally-compatible connected - // pairs, given the choice of root pairs as an iterator range. 
- void BBVectorize::findBestDAGFor( - DenseMap<Value *, std::vector<Value *> > &CandidatePairs, - DenseSet<ValuePair> &CandidatePairsSet, - DenseMap<ValuePair, int> &CandidatePairCostSavings, - std::vector<Value *> &PairableInsts, - DenseSet<ValuePair> &FixedOrderPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes, - DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, - DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps, - DenseSet<ValuePair> &PairableInstUsers, - DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, - DenseSet<VPPair> &PairableInstUserPairSet, - DenseMap<Value *, Value *> &ChosenPairs, - DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth, - int &BestEffSize, Value *II, std::vector<Value *>&JJ, - bool UseCycleCheck) { - for (std::vector<Value *>::iterator J = JJ.begin(), JE = JJ.end(); - J != JE; ++J) { - ValuePair IJ(II, *J); - if (!CandidatePairsSet.count(IJ)) - continue; - - // Before going any further, make sure that this pair does not - // conflict with any already-selected pairs (see comment below - // near the DAG pruning for more details). - DenseSet<ValuePair> ChosenPairSet; - bool DoesConflict = false; - for (DenseMap<Value *, Value *>::iterator C = ChosenPairs.begin(), - E = ChosenPairs.end(); C != E; ++C) { - if (pairsConflict(*C, IJ, PairableInstUsers, - UseCycleCheck ? &PairableInstUserMap : nullptr, - UseCycleCheck ? &PairableInstUserPairSet : nullptr)) { - DoesConflict = true; - break; - } - - ChosenPairSet.insert(*C); - } - if (DoesConflict) continue; - - if (UseCycleCheck && - pairWillFormCycle(IJ, PairableInstUserMap, ChosenPairSet)) - continue; - - DenseMap<ValuePair, size_t> DAG; - buildInitialDAGFor(CandidatePairs, CandidatePairsSet, - PairableInsts, ConnectedPairs, - PairableInstUsers, ChosenPairs, DAG, IJ); - - // Because we'll keep the child with the largest depth, the largest - // depth is still the same in the unpruned DAG. - size_t MaxDepth = DAG.lookup(IJ); - - DEBUG(if (DebugPairSelection) dbgs() << "BBV: found DAG for pair {" - << *IJ.first << " <-> " << *IJ.second << "} of depth " << - MaxDepth << " and size " << DAG.size() << "\n"); - - // At this point the DAG has been constructed, but, may contain - // contradictory children (meaning that different children of - // some dag node may be attempting to fuse the same instruction). - // So now we walk the dag again, in the case of a conflict, - // keep only the child with the largest depth. To break a tie, - // favor the first child. - - DenseSet<ValuePair> PrunedDAG; - pruneDAGFor(CandidatePairs, PairableInsts, ConnectedPairs, - PairableInstUsers, PairableInstUserMap, - PairableInstUserPairSet, - ChosenPairs, DAG, PrunedDAG, IJ, UseCycleCheck); - - int EffSize = 0; - if (TTI) { - DenseSet<Value *> PrunedDAGInstrs; - for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(), - E = PrunedDAG.end(); S != E; ++S) { - PrunedDAGInstrs.insert(S->first); - PrunedDAGInstrs.insert(S->second); - } - - // The set of pairs that have already contributed to the total cost. - DenseSet<ValuePair> IncomingPairs; - - // If the cost model were perfect, this might not be necessary; but we - // need to make sure that we don't get stuck vectorizing our own - // shuffle chains. - bool HasNontrivialInsts = false; - - // The node weights represent the cost savings associated with - // fusing the pair of instructions. 
- for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(), - E = PrunedDAG.end(); S != E; ++S) { - if (!isa<ShuffleVectorInst>(S->first) && - !isa<InsertElementInst>(S->first) && - !isa<ExtractElementInst>(S->first)) - HasNontrivialInsts = true; - - bool FlipOrder = false; - - if (getDepthFactor(S->first)) { - int ESContrib = CandidatePairCostSavings.find(*S)->second; - DEBUG(if (DebugPairSelection) dbgs() << "\tweight {" - << *S->first << " <-> " << *S->second << "} = " << - ESContrib << "\n"); - EffSize += ESContrib; - } - - // The edge weights contribute in a negative sense: they represent - // the cost of shuffles. - DenseMap<ValuePair, std::vector<ValuePair> >::iterator SS = - ConnectedPairDeps.find(*S); - if (SS != ConnectedPairDeps.end()) { - unsigned NumDepsDirect = 0, NumDepsSwap = 0; - for (std::vector<ValuePair>::iterator T = SS->second.begin(), - TE = SS->second.end(); T != TE; ++T) { - VPPair Q(*S, *T); - if (!PrunedDAG.count(Q.second)) - continue; - DenseMap<VPPair, unsigned>::iterator R = - PairConnectionTypes.find(VPPair(Q.second, Q.first)); - assert(R != PairConnectionTypes.end() && - "Cannot find pair connection type"); - if (R->second == PairConnectionDirect) - ++NumDepsDirect; - else if (R->second == PairConnectionSwap) - ++NumDepsSwap; - } - - // If there are more swaps than direct connections, then - // the pair order will be flipped during fusion. So the real - // number of swaps is the minimum number. - FlipOrder = !FixedOrderPairs.count(*S) && - ((NumDepsSwap > NumDepsDirect) || - FixedOrderPairs.count(ValuePair(S->second, S->first))); - - for (std::vector<ValuePair>::iterator T = SS->second.begin(), - TE = SS->second.end(); T != TE; ++T) { - VPPair Q(*S, *T); - if (!PrunedDAG.count(Q.second)) - continue; - DenseMap<VPPair, unsigned>::iterator R = - PairConnectionTypes.find(VPPair(Q.second, Q.first)); - assert(R != PairConnectionTypes.end() && - "Cannot find pair connection type"); - Type *Ty1 = Q.second.first->getType(), - *Ty2 = Q.second.second->getType(); - Type *VTy = getVecTypeForPair(Ty1, Ty2); - if ((R->second == PairConnectionDirect && FlipOrder) || - (R->second == PairConnectionSwap && !FlipOrder) || - R->second == PairConnectionSplat) { - int ESContrib = (int) getInstrCost(Instruction::ShuffleVector, - VTy, VTy); - - if (VTy->getVectorNumElements() == 2) { - if (R->second == PairConnectionSplat) - ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( - TargetTransformInfo::SK_Broadcast, VTy)); - else - ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( - TargetTransformInfo::SK_Reverse, VTy)); - } - - DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << - *Q.second.first << " <-> " << *Q.second.second << - "} -> {" << - *S->first << " <-> " << *S->second << "} = " << - ESContrib << "\n"); - EffSize -= ESContrib; - } - } - } - - // Compute the cost of outgoing edges. We assume that edges outgoing - // to shuffles, inserts or extracts can be merged, and so contribute - // no additional cost. 
- if (!S->first->getType()->isVoidTy()) { - Type *Ty1 = S->first->getType(), - *Ty2 = S->second->getType(); - Type *VTy = getVecTypeForPair(Ty1, Ty2); - - bool NeedsExtraction = false; - for (User *U : S->first->users()) { - if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(U)) { - // Shuffle can be folded if it has no other input - if (isa<UndefValue>(SI->getOperand(1))) - continue; - } - if (isa<ExtractElementInst>(U)) - continue; - if (PrunedDAGInstrs.count(U)) - continue; - NeedsExtraction = true; - break; - } - - if (NeedsExtraction) { - int ESContrib; - if (Ty1->isVectorTy()) { - ESContrib = (int) getInstrCost(Instruction::ShuffleVector, - Ty1, VTy); - ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( - TargetTransformInfo::SK_ExtractSubvector, VTy, 0, Ty1)); - } else - ESContrib = (int) TTI->getVectorInstrCost( - Instruction::ExtractElement, VTy, 0); - - DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << - *S->first << "} = " << ESContrib << "\n"); - EffSize -= ESContrib; - } - - NeedsExtraction = false; - for (User *U : S->second->users()) { - if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(U)) { - // Shuffle can be folded if it has no other input - if (isa<UndefValue>(SI->getOperand(1))) - continue; - } - if (isa<ExtractElementInst>(U)) - continue; - if (PrunedDAGInstrs.count(U)) - continue; - NeedsExtraction = true; - break; - } - - if (NeedsExtraction) { - int ESContrib; - if (Ty2->isVectorTy()) { - ESContrib = (int) getInstrCost(Instruction::ShuffleVector, - Ty2, VTy); - ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( - TargetTransformInfo::SK_ExtractSubvector, VTy, - Ty1->isVectorTy() ? Ty1->getVectorNumElements() : 1, Ty2)); - } else - ESContrib = (int) TTI->getVectorInstrCost( - Instruction::ExtractElement, VTy, 1); - DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << - *S->second << "} = " << ESContrib << "\n"); - EffSize -= ESContrib; - } - } - - // Compute the cost of incoming edges. - if (!isa<LoadInst>(S->first) && !isa<StoreInst>(S->first)) { - Instruction *S1 = cast<Instruction>(S->first), - *S2 = cast<Instruction>(S->second); - for (unsigned o = 0; o < S1->getNumOperands(); ++o) { - Value *O1 = S1->getOperand(o), *O2 = S2->getOperand(o); - - // Combining constants into vector constants (or small vector - // constants into larger ones are assumed free). - if (isa<Constant>(O1) && isa<Constant>(O2)) - continue; - - if (FlipOrder) - std::swap(O1, O2); - - ValuePair VP = ValuePair(O1, O2); - ValuePair VPR = ValuePair(O2, O1); - - // Internal edges are not handled here. - if (PrunedDAG.count(VP) || PrunedDAG.count(VPR)) - continue; - - Type *Ty1 = O1->getType(), - *Ty2 = O2->getType(); - Type *VTy = getVecTypeForPair(Ty1, Ty2); - - // Combining vector operations of the same type is also assumed - // folded with other operations. - if (Ty1 == Ty2) { - // If both are insert elements, then both can be widened. 
- InsertElementInst *IEO1 = dyn_cast<InsertElementInst>(O1), - *IEO2 = dyn_cast<InsertElementInst>(O2); - if (IEO1 && IEO2 && isPureIEChain(IEO1) && isPureIEChain(IEO2)) - continue; - // If both are extract elements, and both have the same input - // type, then they can be replaced with a shuffle - ExtractElementInst *EIO1 = dyn_cast<ExtractElementInst>(O1), - *EIO2 = dyn_cast<ExtractElementInst>(O2); - if (EIO1 && EIO2 && - EIO1->getOperand(0)->getType() == - EIO2->getOperand(0)->getType()) - continue; - // If both are a shuffle with equal operand types and only two - // unqiue operands, then they can be replaced with a single - // shuffle - ShuffleVectorInst *SIO1 = dyn_cast<ShuffleVectorInst>(O1), - *SIO2 = dyn_cast<ShuffleVectorInst>(O2); - if (SIO1 && SIO2 && - SIO1->getOperand(0)->getType() == - SIO2->getOperand(0)->getType()) { - SmallSet<Value *, 4> SIOps; - SIOps.insert(SIO1->getOperand(0)); - SIOps.insert(SIO1->getOperand(1)); - SIOps.insert(SIO2->getOperand(0)); - SIOps.insert(SIO2->getOperand(1)); - if (SIOps.size() <= 2) - continue; - } - } - - int ESContrib; - // This pair has already been formed. - if (IncomingPairs.count(VP)) { - continue; - } else if (IncomingPairs.count(VPR)) { - ESContrib = (int) getInstrCost(Instruction::ShuffleVector, - VTy, VTy); - - if (VTy->getVectorNumElements() == 2) - ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( - TargetTransformInfo::SK_Reverse, VTy)); - } else if (!Ty1->isVectorTy() && !Ty2->isVectorTy()) { - ESContrib = (int) TTI->getVectorInstrCost( - Instruction::InsertElement, VTy, 0); - ESContrib += (int) TTI->getVectorInstrCost( - Instruction::InsertElement, VTy, 1); - } else if (!Ty1->isVectorTy()) { - // O1 needs to be inserted into a vector of size O2, and then - // both need to be shuffled together. - ESContrib = (int) TTI->getVectorInstrCost( - Instruction::InsertElement, Ty2, 0); - ESContrib += (int) getInstrCost(Instruction::ShuffleVector, - VTy, Ty2); - } else if (!Ty2->isVectorTy()) { - // O2 needs to be inserted into a vector of size O1, and then - // both need to be shuffled together. 
- ESContrib = (int) TTI->getVectorInstrCost( - Instruction::InsertElement, Ty1, 0); - ESContrib += (int) getInstrCost(Instruction::ShuffleVector, - VTy, Ty1); - } else { - Type *TyBig = Ty1, *TySmall = Ty2; - if (Ty2->getVectorNumElements() > Ty1->getVectorNumElements()) - std::swap(TyBig, TySmall); - - ESContrib = (int) getInstrCost(Instruction::ShuffleVector, - VTy, TyBig); - if (TyBig != TySmall) - ESContrib += (int) getInstrCost(Instruction::ShuffleVector, - TyBig, TySmall); - } - - DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" - << *O1 << " <-> " << *O2 << "} = " << - ESContrib << "\n"); - EffSize -= ESContrib; - IncomingPairs.insert(VP); - } - } - } - - if (!HasNontrivialInsts) { - DEBUG(if (DebugPairSelection) dbgs() << - "\tNo non-trivial instructions in DAG;" - " override to zero effective size\n"); - EffSize = 0; - } - } else { - for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(), - E = PrunedDAG.end(); S != E; ++S) - EffSize += (int) getDepthFactor(S->first); - } - - DEBUG(if (DebugPairSelection) - dbgs() << "BBV: found pruned DAG for pair {" - << *IJ.first << " <-> " << *IJ.second << "} of depth " << - MaxDepth << " and size " << PrunedDAG.size() << - " (effective size: " << EffSize << ")\n"); - if (((TTI && !UseChainDepthWithTI) || - MaxDepth >= Config.ReqChainDepth) && - EffSize > 0 && EffSize > BestEffSize) { - BestMaxDepth = MaxDepth; - BestEffSize = EffSize; - BestDAG = PrunedDAG; - } - } - } - - // Given the list of candidate pairs, this function selects those - // that will be fused into vector instructions. - void BBVectorize::choosePairs( - DenseMap<Value *, std::vector<Value *> > &CandidatePairs, - DenseSet<ValuePair> &CandidatePairsSet, - DenseMap<ValuePair, int> &CandidatePairCostSavings, - std::vector<Value *> &PairableInsts, - DenseSet<ValuePair> &FixedOrderPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes, - DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, - DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps, - DenseSet<ValuePair> &PairableInstUsers, - DenseMap<Value *, Value *>& ChosenPairs) { - bool UseCycleCheck = - CandidatePairsSet.size() <= Config.MaxCandPairsForCycleCheck; - - DenseMap<Value *, std::vector<Value *> > CandidatePairs2; - for (DenseSet<ValuePair>::iterator I = CandidatePairsSet.begin(), - E = CandidatePairsSet.end(); I != E; ++I) { - std::vector<Value *> &JJ = CandidatePairs2[I->second]; - if (JJ.empty()) JJ.reserve(32); - JJ.push_back(I->first); - } - - DenseMap<ValuePair, std::vector<ValuePair> > PairableInstUserMap; - DenseSet<VPPair> PairableInstUserPairSet; - for (std::vector<Value *>::iterator I = PairableInsts.begin(), - E = PairableInsts.end(); I != E; ++I) { - // The number of possible pairings for this variable: - size_t NumChoices = CandidatePairs.lookup(*I).size(); - if (!NumChoices) continue; - - std::vector<Value *> &JJ = CandidatePairs[*I]; - - // The best pair to choose and its dag: - size_t BestMaxDepth = 0; - int BestEffSize = 0; - DenseSet<ValuePair> BestDAG; - findBestDAGFor(CandidatePairs, CandidatePairsSet, - CandidatePairCostSavings, - PairableInsts, FixedOrderPairs, PairConnectionTypes, - ConnectedPairs, ConnectedPairDeps, - PairableInstUsers, PairableInstUserMap, - PairableInstUserPairSet, ChosenPairs, - BestDAG, BestMaxDepth, BestEffSize, *I, JJ, - UseCycleCheck); - - if (BestDAG.empty()) - continue; - - // A dag has been chosen (or not) at this point. If no dag was - // chosen, then this instruction, I, cannot be paired (and is no longer - // considered). 
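The selection step in the removed choosePairs code is greedy: findBestDAGFor scores a candidate DAG rooted at each pairable instruction, the best DAG's pairs are committed, and every remaining candidate that shares a value with a committed pair is discarded so no instruction is fused twice. A minimal sketch of that conflict-pruning idea follows; it collapses the DAG score to a per-pair score and uses plain standard-library containers rather than the pass's DenseMap/DenseSet types, so it is only an illustration, not the pass's actual implementation.

#include <algorithm>
#include <map>
#include <set>
#include <utility>
#include <vector>

using Value = int;                        // stand-in for llvm::Value *
using ValuePair = std::pair<Value, Value>;

// Greedily commit candidate pairs in descending score order, skipping any
// pair that reuses a value already fused into an earlier chosen pair.
std::map<Value, Value>
choosePairsGreedy(std::vector<std::pair<int, ValuePair>> ScoredCandidates) {
  std::sort(ScoredCandidates.begin(), ScoredCandidates.end(),
            [](const auto &A, const auto &B) { return A.first > B.first; });

  std::map<Value, Value> Chosen;
  std::set<Value> Used;
  for (const auto &SC : ScoredCandidates) {
    Value First = SC.second.first, Second = SC.second.second;
    if (Used.count(First) || Used.count(Second))
      continue;                           // conflicts with an earlier choice
    Chosen[First] = Second;
    Used.insert(First);
    Used.insert(Second);
  }
  return Chosen;
}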
- - DEBUG(dbgs() << "BBV: selected pairs in the best DAG for: " - << *cast<Instruction>(*I) << "\n"); - - for (DenseSet<ValuePair>::iterator S = BestDAG.begin(), - SE2 = BestDAG.end(); S != SE2; ++S) { - // Insert the members of this dag into the list of chosen pairs. - ChosenPairs.insert(ValuePair(S->first, S->second)); - DEBUG(dbgs() << "BBV: selected pair: " << *S->first << " <-> " << - *S->second << "\n"); - - // Remove all candidate pairs that have values in the chosen dag. - std::vector<Value *> &KK = CandidatePairs[S->first]; - for (std::vector<Value *>::iterator K = KK.begin(), KE = KK.end(); - K != KE; ++K) { - if (*K == S->second) - continue; - - CandidatePairsSet.erase(ValuePair(S->first, *K)); - } - - std::vector<Value *> &LL = CandidatePairs2[S->second]; - for (std::vector<Value *>::iterator L = LL.begin(), LE = LL.end(); - L != LE; ++L) { - if (*L == S->first) - continue; - - CandidatePairsSet.erase(ValuePair(*L, S->second)); - } - - std::vector<Value *> &MM = CandidatePairs[S->second]; - for (std::vector<Value *>::iterator M = MM.begin(), ME = MM.end(); - M != ME; ++M) { - assert(*M != S->first && "Flipped pair in candidate list?"); - CandidatePairsSet.erase(ValuePair(S->second, *M)); - } - - std::vector<Value *> &NN = CandidatePairs2[S->first]; - for (std::vector<Value *>::iterator N = NN.begin(), NE = NN.end(); - N != NE; ++N) { - assert(*N != S->second && "Flipped pair in candidate list?"); - CandidatePairsSet.erase(ValuePair(*N, S->first)); - } - } - } - - DEBUG(dbgs() << "BBV: selected " << ChosenPairs.size() << " pairs.\n"); - } - - std::string getReplacementName(Instruction *I, bool IsInput, unsigned o, - unsigned n = 0) { - if (!I->hasName()) - return ""; - - return (I->getName() + (IsInput ? ".v.i" : ".v.r") + utostr(o) + - (n > 0 ? "." + utostr(n) : "")).str(); - } - - // Returns the value that is to be used as the pointer input to the vector - // instruction that fuses I with J. - Value *BBVectorize::getReplacementPointerInput(LLVMContext& Context, - Instruction *I, Instruction *J, unsigned o) { - Value *IPtr, *JPtr; - unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace; - int64_t OffsetInElmts; - - // Note: the analysis might fail here, that is why the pair order has - // been precomputed (OffsetInElmts must be unused here). - (void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, - IAddressSpace, JAddressSpace, - OffsetInElmts, false); - - // The pointer value is taken to be the one with the lowest offset. 
- Value *VPtr = IPtr; - - Type *ArgTypeI = IPtr->getType()->getPointerElementType(); - Type *ArgTypeJ = JPtr->getType()->getPointerElementType(); - Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); - Type *VArgPtrType - = PointerType::get(VArgType, - IPtr->getType()->getPointerAddressSpace()); - return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o), - /* insert before */ I); - } - - void BBVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J, - unsigned MaskOffset, unsigned NumInElem, - unsigned NumInElem1, unsigned IdxOffset, - std::vector<Constant*> &Mask) { - unsigned NumElem1 = J->getType()->getVectorNumElements(); - for (unsigned v = 0; v < NumElem1; ++v) { - int m = cast<ShuffleVectorInst>(J)->getMaskValue(v); - if (m < 0) { - Mask[v+MaskOffset] = UndefValue::get(Type::getInt32Ty(Context)); - } else { - unsigned mm = m + (int) IdxOffset; - if (m >= (int) NumInElem1) - mm += (int) NumInElem; - - Mask[v+MaskOffset] = - ConstantInt::get(Type::getInt32Ty(Context), mm); - } - } - } - - // Returns the value that is to be used as the vector-shuffle mask to the - // vector instruction that fuses I with J. - Value *BBVectorize::getReplacementShuffleMask(LLVMContext& Context, - Instruction *I, Instruction *J) { - // This is the shuffle mask. We need to append the second - // mask to the first, and the numbers need to be adjusted. - - Type *ArgTypeI = I->getType(); - Type *ArgTypeJ = J->getType(); - Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); - - unsigned NumElemI = ArgTypeI->getVectorNumElements(); - - // Get the total number of elements in the fused vector type. - // By definition, this must equal the number of elements in - // the final mask. - unsigned NumElem = VArgType->getVectorNumElements(); - std::vector<Constant*> Mask(NumElem); - - Type *OpTypeI = I->getOperand(0)->getType(); - unsigned NumInElemI = OpTypeI->getVectorNumElements(); - Type *OpTypeJ = J->getOperand(0)->getType(); - unsigned NumInElemJ = OpTypeJ->getVectorNumElements(); - - // The fused vector will be: - // ----------------------------------------------------- - // | NumInElemI | NumInElemJ | NumInElemI | NumInElemJ | - // ----------------------------------------------------- - // from which we'll extract NumElem total elements (where the first NumElemI - // of them come from the mask in I and the remainder come from the mask - // in J. - - // For the mask from the first pair... - fillNewShuffleMask(Context, I, 0, NumInElemJ, NumInElemI, - 0, Mask); - - // For the mask from the second pair... - fillNewShuffleMask(Context, J, NumElemI, NumInElemI, NumInElemJ, - NumInElemI, Mask); - - return ConstantVector::get(Mask); - } - - bool BBVectorize::expandIEChain(LLVMContext& Context, Instruction *I, - Instruction *J, unsigned o, Value *&LOp, - unsigned numElemL, - Type *ArgTypeL, Type *ArgTypeH, - bool IBeforeJ, unsigned IdxOff) { - bool ExpandedIEChain = false; - if (InsertElementInst *LIE = dyn_cast<InsertElementInst>(LOp)) { - // If we have a pure insertelement chain, then this can be rewritten - // into a chain that directly builds the larger type. 
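getReplacementShuffleMask and fillNewShuffleMask above concatenate the masks of the two fused shuffles while renumbering each entry for the fused operand layout | I.op0 | J.op0 | I.op1 | J.op1 |. A standalone sketch of that renumbering, assuming plain integer masks with -1 standing for an undef lane (the real code builds ConstantVector masks); fuseShuffleMasks is an illustrative helper, not part of the pass:

#include <vector>

// Merge the masks of two scalar shuffles I and J, whose operands have
// NumInElemI and NumInElemJ lanes respectively, into one mask over the fused
// operand layout | I.op0 | J.op0 | I.op1 | J.op1 |.
std::vector<int> fuseShuffleMasks(const std::vector<int> &MaskI,
                                  const std::vector<int> &MaskJ,
                                  unsigned NumInElemI, unsigned NumInElemJ) {
  std::vector<int> Fused;
  Fused.reserve(MaskI.size() + MaskJ.size());

  // Entries from I keep their index unless they refer to I's second operand,
  // which now sits after J's first operand.
  for (int M : MaskI)
    Fused.push_back(M < 0 ? -1
                          : M < (int)NumInElemI ? M : M + (int)NumInElemJ);

  // Entries from J are shifted past I's first operand; references to J's
  // second operand are shifted past I's second operand as well.
  for (int M : MaskJ)
    Fused.push_back(M < 0 ? -1
                          : M < (int)NumInElemJ ? M + (int)NumInElemI
                                                : M + 2 * (int)NumInElemI);
  return Fused;
}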
- if (isPureIEChain(LIE)) { - SmallVector<Value *, 8> VectElemts(numElemL, - UndefValue::get(ArgTypeL->getScalarType())); - InsertElementInst *LIENext = LIE; - do { - unsigned Idx = - cast<ConstantInt>(LIENext->getOperand(2))->getSExtValue(); - VectElemts[Idx] = LIENext->getOperand(1); - } while ((LIENext = - dyn_cast<InsertElementInst>(LIENext->getOperand(0)))); - - LIENext = nullptr; - Value *LIEPrev = UndefValue::get(ArgTypeH); - for (unsigned i = 0; i < numElemL; ++i) { - if (isa<UndefValue>(VectElemts[i])) continue; - LIENext = InsertElementInst::Create(LIEPrev, VectElemts[i], - ConstantInt::get(Type::getInt32Ty(Context), - i + IdxOff), - getReplacementName(IBeforeJ ? I : J, - true, o, i+1)); - LIENext->insertBefore(IBeforeJ ? J : I); - LIEPrev = LIENext; - } - - LOp = LIENext ? (Value*) LIENext : UndefValue::get(ArgTypeH); - ExpandedIEChain = true; - } - } - - return ExpandedIEChain; - } - - static unsigned getNumScalarElements(Type *Ty) { - if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) - return VecTy->getNumElements(); - return 1; - } - - // Returns the value to be used as the specified operand of the vector - // instruction that fuses I with J. - Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I, - Instruction *J, unsigned o, bool IBeforeJ) { - Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); - Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1); - - // Compute the fused vector type for this operand - Type *ArgTypeI = I->getOperand(o)->getType(); - Type *ArgTypeJ = J->getOperand(o)->getType(); - VectorType *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); - - Instruction *L = I, *H = J; - Type *ArgTypeL = ArgTypeI, *ArgTypeH = ArgTypeJ; - - unsigned numElemL = getNumScalarElements(ArgTypeL); - unsigned numElemH = getNumScalarElements(ArgTypeH); - - Value *LOp = L->getOperand(o); - Value *HOp = H->getOperand(o); - unsigned numElem = VArgType->getNumElements(); - - // First, we check if we can reuse the "original" vector outputs (if these - // exist). We might need a shuffle. - ExtractElementInst *LEE = dyn_cast<ExtractElementInst>(LOp); - ExtractElementInst *HEE = dyn_cast<ExtractElementInst>(HOp); - ShuffleVectorInst *LSV = dyn_cast<ShuffleVectorInst>(LOp); - ShuffleVectorInst *HSV = dyn_cast<ShuffleVectorInst>(HOp); - - // FIXME: If we're fusing shuffle instructions, then we can't apply this - // optimization. The input vectors to the shuffle might be a different - // length from the shuffle outputs. Unfortunately, the replacement - // shuffle mask has already been formed, and the mask entries are sensitive - // to the sizes of the inputs. - bool IsSizeChangeShuffle = - isa<ShuffleVectorInst>(L) && - (LOp->getType() != L->getType() || HOp->getType() != H->getType()); - - if ((LEE || LSV) && (HEE || HSV) && !IsSizeChangeShuffle) { - // We can have at most two unique vector inputs. 
- bool CanUseInputs = true; - Value *I1, *I2 = nullptr; - if (LEE) { - I1 = LEE->getOperand(0); - } else { - I1 = LSV->getOperand(0); - I2 = LSV->getOperand(1); - if (I2 == I1 || isa<UndefValue>(I2)) - I2 = nullptr; - } - - if (HEE) { - Value *I3 = HEE->getOperand(0); - if (!I2 && I3 != I1) - I2 = I3; - else if (I3 != I1 && I3 != I2) - CanUseInputs = false; - } else { - Value *I3 = HSV->getOperand(0); - if (!I2 && I3 != I1) - I2 = I3; - else if (I3 != I1 && I3 != I2) - CanUseInputs = false; - - if (CanUseInputs) { - Value *I4 = HSV->getOperand(1); - if (!isa<UndefValue>(I4)) { - if (!I2 && I4 != I1) - I2 = I4; - else if (I4 != I1 && I4 != I2) - CanUseInputs = false; - } - } - } - - if (CanUseInputs) { - unsigned LOpElem = - cast<Instruction>(LOp)->getOperand(0)->getType() - ->getVectorNumElements(); - - unsigned HOpElem = - cast<Instruction>(HOp)->getOperand(0)->getType() - ->getVectorNumElements(); - - // We have one or two input vectors. We need to map each index of the - // operands to the index of the original vector. - SmallVector<std::pair<int, int>, 8> II(numElem); - for (unsigned i = 0; i < numElemL; ++i) { - int Idx, INum; - if (LEE) { - Idx = - cast<ConstantInt>(LEE->getOperand(1))->getSExtValue(); - INum = LEE->getOperand(0) == I1 ? 0 : 1; - } else { - Idx = LSV->getMaskValue(i); - if (Idx < (int) LOpElem) { - INum = LSV->getOperand(0) == I1 ? 0 : 1; - } else { - Idx -= LOpElem; - INum = LSV->getOperand(1) == I1 ? 0 : 1; - } - } - - II[i] = std::pair<int, int>(Idx, INum); - } - for (unsigned i = 0; i < numElemH; ++i) { - int Idx, INum; - if (HEE) { - Idx = - cast<ConstantInt>(HEE->getOperand(1))->getSExtValue(); - INum = HEE->getOperand(0) == I1 ? 0 : 1; - } else { - Idx = HSV->getMaskValue(i); - if (Idx < (int) HOpElem) { - INum = HSV->getOperand(0) == I1 ? 0 : 1; - } else { - Idx -= HOpElem; - INum = HSV->getOperand(1) == I1 ? 0 : 1; - } - } - - II[i + numElemL] = std::pair<int, int>(Idx, INum); - } - - // We now have an array which tells us from which index of which - // input vector each element of the operand comes. - VectorType *I1T = cast<VectorType>(I1->getType()); - unsigned I1Elem = I1T->getNumElements(); - - if (!I2) { - // In this case there is only one underlying vector input. Check for - // the trivial case where we can use the input directly. - if (I1Elem == numElem) { - bool ElemInOrder = true; - for (unsigned i = 0; i < numElem; ++i) { - if (II[i].first != (int) i && II[i].first != -1) { - ElemInOrder = false; - break; - } - } - - if (ElemInOrder) - return I1; - } - - // A shuffle is needed. - std::vector<Constant *> Mask(numElem); - for (unsigned i = 0; i < numElem; ++i) { - int Idx = II[i].first; - if (Idx == -1) - Mask[i] = UndefValue::get(Type::getInt32Ty(Context)); - else - Mask[i] = ConstantInt::get(Type::getInt32Ty(Context), Idx); - } - - Instruction *S = - new ShuffleVectorInst(I1, UndefValue::get(I1T), - ConstantVector::get(Mask), - getReplacementName(IBeforeJ ? I : J, - true, o)); - S->insertBefore(IBeforeJ ? J : I); - return S; - } - - VectorType *I2T = cast<VectorType>(I2->getType()); - unsigned I2Elem = I2T->getNumElements(); - - // This input comes from two distinct vectors. The first step is to - // make sure that both vectors are the same length. If not, the - // smaller one will need to grow before they can be shuffled together. 
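Once both inputs have been padded to the same length (the code that follows handles the padding), the (source lane, source vector) pairs collected above translate directly into a single shufflevector mask over the concatenation of the two inputs. A self-contained sketch of that translation, again with -1 marking an undef lane; buildTwoInputMask is a hypothetical helper, not part of the pass:

#include <cstddef>
#include <utility>
#include <vector>

// II[v] = {source lane, 0 or 1 for which input vector}; I1Elem is the lane
// count of the first input.  The result indexes into <I1, I2> concatenated.
std::vector<int> buildTwoInputMask(const std::vector<std::pair<int, int>> &II,
                                   unsigned I1Elem) {
  std::vector<int> Mask(II.size());
  for (std::size_t v = 0, e = II.size(); v != e; ++v)
    Mask[v] = II[v].first == -1 ? -1
                                : II[v].first + II[v].second * (int)I1Elem;
  return Mask;
}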
- if (I1Elem < I2Elem) { - std::vector<Constant *> Mask(I2Elem); - unsigned v = 0; - for (; v < I1Elem; ++v) - Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); - for (; v < I2Elem; ++v) - Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); - - Instruction *NewI1 = - new ShuffleVectorInst(I1, UndefValue::get(I1T), - ConstantVector::get(Mask), - getReplacementName(IBeforeJ ? I : J, - true, o, 1)); - NewI1->insertBefore(IBeforeJ ? J : I); - I1 = NewI1; - I1Elem = I2Elem; - } else if (I1Elem > I2Elem) { - std::vector<Constant *> Mask(I1Elem); - unsigned v = 0; - for (; v < I2Elem; ++v) - Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); - for (; v < I1Elem; ++v) - Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); - - Instruction *NewI2 = - new ShuffleVectorInst(I2, UndefValue::get(I2T), - ConstantVector::get(Mask), - getReplacementName(IBeforeJ ? I : J, - true, o, 1)); - NewI2->insertBefore(IBeforeJ ? J : I); - I2 = NewI2; - } - - // Now that both I1 and I2 are the same length we can shuffle them - // together (and use the result). - std::vector<Constant *> Mask(numElem); - for (unsigned v = 0; v < numElem; ++v) { - if (II[v].first == -1) { - Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); - } else { - int Idx = II[v].first + II[v].second * I1Elem; - Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx); - } - } - - Instruction *NewOp = - new ShuffleVectorInst(I1, I2, ConstantVector::get(Mask), - getReplacementName(IBeforeJ ? I : J, true, o)); - NewOp->insertBefore(IBeforeJ ? J : I); - return NewOp; - } - } - - Type *ArgType = ArgTypeL; - if (numElemL < numElemH) { - if (numElemL == 1 && expandIEChain(Context, I, J, o, HOp, numElemH, - ArgTypeL, VArgType, IBeforeJ, 1)) { - // This is another short-circuit case: we're combining a scalar into - // a vector that is formed by an IE chain. We've just expanded the IE - // chain, now insert the scalar and we're done. - - Instruction *S = InsertElementInst::Create(HOp, LOp, CV0, - getReplacementName(IBeforeJ ? I : J, true, o)); - S->insertBefore(IBeforeJ ? J : I); - return S; - } else if (!expandIEChain(Context, I, J, o, LOp, numElemL, ArgTypeL, - ArgTypeH, IBeforeJ)) { - // The two vector inputs to the shuffle must be the same length, - // so extend the smaller vector to be the same length as the larger one. - Instruction *NLOp; - if (numElemL > 1) { - - std::vector<Constant *> Mask(numElemH); - unsigned v = 0; - for (; v < numElemL; ++v) - Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); - for (; v < numElemH; ++v) - Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); - - NLOp = new ShuffleVectorInst(LOp, UndefValue::get(ArgTypeL), - ConstantVector::get(Mask), - getReplacementName(IBeforeJ ? I : J, - true, o, 1)); - } else { - NLOp = InsertElementInst::Create(UndefValue::get(ArgTypeH), LOp, CV0, - getReplacementName(IBeforeJ ? I : J, - true, o, 1)); - } - - NLOp->insertBefore(IBeforeJ ? J : I); - LOp = NLOp; - } - - ArgType = ArgTypeH; - } else if (numElemL > numElemH) { - if (numElemH == 1 && expandIEChain(Context, I, J, o, LOp, numElemL, - ArgTypeH, VArgType, IBeforeJ)) { - Instruction *S = - InsertElementInst::Create(LOp, HOp, - ConstantInt::get(Type::getInt32Ty(Context), - numElemL), - getReplacementName(IBeforeJ ? I : J, - true, o)); - S->insertBefore(IBeforeJ ? 
J : I); - return S; - } else if (!expandIEChain(Context, I, J, o, HOp, numElemH, ArgTypeH, - ArgTypeL, IBeforeJ)) { - Instruction *NHOp; - if (numElemH > 1) { - std::vector<Constant *> Mask(numElemL); - unsigned v = 0; - for (; v < numElemH; ++v) - Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); - for (; v < numElemL; ++v) - Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); - - NHOp = new ShuffleVectorInst(HOp, UndefValue::get(ArgTypeH), - ConstantVector::get(Mask), - getReplacementName(IBeforeJ ? I : J, - true, o, 1)); - } else { - NHOp = InsertElementInst::Create(UndefValue::get(ArgTypeL), HOp, CV0, - getReplacementName(IBeforeJ ? I : J, - true, o, 1)); - } - - NHOp->insertBefore(IBeforeJ ? J : I); - HOp = NHOp; - } - } - - if (ArgType->isVectorTy()) { - unsigned numElem = VArgType->getVectorNumElements(); - std::vector<Constant*> Mask(numElem); - for (unsigned v = 0; v < numElem; ++v) { - unsigned Idx = v; - // If the low vector was expanded, we need to skip the extra - // undefined entries. - if (v >= numElemL && numElemH > numElemL) - Idx += (numElemH - numElemL); - Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx); - } - - Instruction *BV = new ShuffleVectorInst(LOp, HOp, - ConstantVector::get(Mask), - getReplacementName(IBeforeJ ? I : J, true, o)); - BV->insertBefore(IBeforeJ ? J : I); - return BV; - } - - Instruction *BV1 = InsertElementInst::Create( - UndefValue::get(VArgType), LOp, CV0, - getReplacementName(IBeforeJ ? I : J, - true, o, 1)); - BV1->insertBefore(IBeforeJ ? J : I); - Instruction *BV2 = InsertElementInst::Create(BV1, HOp, CV1, - getReplacementName(IBeforeJ ? I : J, - true, o, 2)); - BV2->insertBefore(IBeforeJ ? J : I); - return BV2; - } - - // This function creates an array of values that will be used as the inputs - // to the vector instruction that fuses I with J. - void BBVectorize::getReplacementInputsForPair(LLVMContext& Context, - Instruction *I, Instruction *J, - SmallVectorImpl<Value *> &ReplacedOperands, - bool IBeforeJ) { - unsigned NumOperands = I->getNumOperands(); - - for (unsigned p = 0, o = NumOperands-1; p < NumOperands; ++p, --o) { - // Iterate backward so that we look at the store pointer - // first and know whether or not we need to flip the inputs. - - if (isa<LoadInst>(I) || (o == 1 && isa<StoreInst>(I))) { - // This is the pointer for a load/store instruction. - ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o); - continue; - } else if (isa<CallInst>(I)) { - Function *F = cast<CallInst>(I)->getCalledFunction(); - Intrinsic::ID IID = F->getIntrinsicID(); - if (o == NumOperands-1) { - BasicBlock &BB = *I->getParent(); - - Module *M = BB.getParent()->getParent(); - Type *ArgTypeI = I->getType(); - Type *ArgTypeJ = J->getType(); - Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); - - ReplacedOperands[o] = Intrinsic::getDeclaration(M, IID, VArgType); - continue; - } else if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz || - IID == Intrinsic::cttz) && o == 1) { - // The second argument of powi/ctlz/cttz is a single integer/constant - // and we've already checked that both arguments are equal. - // As a result, we just keep I's second argument. 
- ReplacedOperands[o] = I->getOperand(o); - continue; - } - } else if (isa<ShuffleVectorInst>(I) && o == NumOperands-1) { - ReplacedOperands[o] = getReplacementShuffleMask(Context, I, J); - continue; - } - - ReplacedOperands[o] = getReplacementInput(Context, I, J, o, IBeforeJ); - } - } - - // This function creates two values that represent the outputs of the - // original I and J instructions. These are generally vector shuffles - // or extracts. In many cases, these will end up being unused and, thus, - // eliminated by later passes. - void BBVectorize::replaceOutputsOfPair(LLVMContext& Context, Instruction *I, - Instruction *J, Instruction *K, - Instruction *&InsertionPt, - Instruction *&K1, Instruction *&K2) { - if (isa<StoreInst>(I)) - return; - - Type *IType = I->getType(); - Type *JType = J->getType(); - - VectorType *VType = getVecTypeForPair(IType, JType); - unsigned numElem = VType->getNumElements(); - - unsigned numElemI = getNumScalarElements(IType); - unsigned numElemJ = getNumScalarElements(JType); - - if (IType->isVectorTy()) { - std::vector<Constant *> Mask1(numElemI), Mask2(numElemI); - for (unsigned v = 0; v < numElemI; ++v) { - Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); - Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemJ + v); - } - - K1 = new ShuffleVectorInst(K, UndefValue::get(VType), - ConstantVector::get(Mask1), - getReplacementName(K, false, 1)); - } else { - Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); - K1 = ExtractElementInst::Create(K, CV0, getReplacementName(K, false, 1)); - } - - if (JType->isVectorTy()) { - std::vector<Constant *> Mask1(numElemJ), Mask2(numElemJ); - for (unsigned v = 0; v < numElemJ; ++v) { - Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); - Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemI + v); - } - - K2 = new ShuffleVectorInst(K, UndefValue::get(VType), - ConstantVector::get(Mask2), - getReplacementName(K, false, 2)); - } else { - Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem - 1); - K2 = ExtractElementInst::Create(K, CV1, getReplacementName(K, false, 2)); - } - - K1->insertAfter(K); - K2->insertAfter(K1); - InsertionPt = K2; - } - - // Move all uses of the function I (including pairing-induced uses) after J. - bool BBVectorize::canMoveUsesOfIAfterJ(BasicBlock &BB, - DenseSet<ValuePair> &LoadMoveSetPairs, - Instruction *I, Instruction *J) { - // Skip to the first instruction past I. - BasicBlock::iterator L = std::next(BasicBlock::iterator(I)); - - DenseSet<Value *> Users; - AliasSetTracker WriteSet(*AA); - if (I->mayWriteToMemory()) WriteSet.add(I); - - for (; cast<Instruction>(L) != J; ++L) - (void)trackUsesOfI(Users, WriteSet, I, &*L, true, &LoadMoveSetPairs); - - assert(cast<Instruction>(L) == J && - "Tracking has not proceeded far enough to check for dependencies"); - // If J is now in the use set of I, then trackUsesOfI will return true - // and we have a dependency cycle (and the fusing operation must abort). - return !trackUsesOfI(Users, WriteSet, I, J, true, &LoadMoveSetPairs); - } - - // Move all uses of the function I (including pairing-induced uses) after J. - void BBVectorize::moveUsesOfIAfterJ(BasicBlock &BB, - DenseSet<ValuePair> &LoadMoveSetPairs, - Instruction *&InsertionPt, - Instruction *I, Instruction *J) { - // Skip to the first instruction past I. 
- BasicBlock::iterator L = std::next(BasicBlock::iterator(I)); - - DenseSet<Value *> Users; - AliasSetTracker WriteSet(*AA); - if (I->mayWriteToMemory()) WriteSet.add(I); - - for (; cast<Instruction>(L) != J;) { - if (trackUsesOfI(Users, WriteSet, I, &*L, true, &LoadMoveSetPairs)) { - // Move this instruction - Instruction *InstToMove = &*L++; - - DEBUG(dbgs() << "BBV: moving: " << *InstToMove << - " to after " << *InsertionPt << "\n"); - InstToMove->removeFromParent(); - InstToMove->insertAfter(InsertionPt); - InsertionPt = InstToMove; - } else { - ++L; - } - } - } - - // Collect all load instruction that are in the move set of a given first - // pair member. These loads depend on the first instruction, I, and so need - // to be moved after J (the second instruction) when the pair is fused. - void BBVectorize::collectPairLoadMoveSet(BasicBlock &BB, - DenseMap<Value *, Value *> &ChosenPairs, - DenseMap<Value *, std::vector<Value *> > &LoadMoveSet, - DenseSet<ValuePair> &LoadMoveSetPairs, - Instruction *I) { - // Skip to the first instruction past I. - BasicBlock::iterator L = std::next(BasicBlock::iterator(I)); - - DenseSet<Value *> Users; - AliasSetTracker WriteSet(*AA); - if (I->mayWriteToMemory()) WriteSet.add(I); - - // Note: We cannot end the loop when we reach J because J could be moved - // farther down the use chain by another instruction pairing. Also, J - // could be before I if this is an inverted input. - for (BasicBlock::iterator E = BB.end(); L != E; ++L) { - if (trackUsesOfI(Users, WriteSet, I, &*L)) { - if (L->mayReadFromMemory()) { - LoadMoveSet[&*L].push_back(I); - LoadMoveSetPairs.insert(ValuePair(&*L, I)); - } - } - } - } - - // In cases where both load/stores and the computation of their pointers - // are chosen for vectorization, we can end up in a situation where the - // aliasing analysis starts returning different query results as the - // process of fusing instruction pairs continues. Because the algorithm - // relies on finding the same use dags here as were found earlier, we'll - // need to precompute the necessary aliasing information here and then - // manually update it during the fusion process. - void BBVectorize::collectLoadMoveSet(BasicBlock &BB, - std::vector<Value *> &PairableInsts, - DenseMap<Value *, Value *> &ChosenPairs, - DenseMap<Value *, std::vector<Value *> > &LoadMoveSet, - DenseSet<ValuePair> &LoadMoveSetPairs) { - for (std::vector<Value *>::iterator PI = PairableInsts.begin(), - PIE = PairableInsts.end(); PI != PIE; ++PI) { - DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI); - if (P == ChosenPairs.end()) continue; - - Instruction *I = cast<Instruction>(P->first); - collectPairLoadMoveSet(BB, ChosenPairs, LoadMoveSet, - LoadMoveSetPairs, I); - } - } - - // This function fuses the chosen instruction pairs into vector instructions, - // taking care preserve any needed scalar outputs and, then, it reorders the - // remaining instructions as needed (users of the first member of the pair - // need to be moved to after the location of the second member of the pair - // because the vector instruction is inserted in the location of the pair's - // second member). 
- void BBVectorize::fuseChosenPairs(BasicBlock &BB, - std::vector<Value *> &PairableInsts, - DenseMap<Value *, Value *> &ChosenPairs, - DenseSet<ValuePair> &FixedOrderPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes, - DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, - DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps) { - LLVMContext& Context = BB.getContext(); - - // During the vectorization process, the order of the pairs to be fused - // could be flipped. So we'll add each pair, flipped, into the ChosenPairs - // list. After a pair is fused, the flipped pair is removed from the list. - DenseSet<ValuePair> FlippedPairs; - for (DenseMap<Value *, Value *>::iterator P = ChosenPairs.begin(), - E = ChosenPairs.end(); P != E; ++P) - FlippedPairs.insert(ValuePair(P->second, P->first)); - for (DenseSet<ValuePair>::iterator P = FlippedPairs.begin(), - E = FlippedPairs.end(); P != E; ++P) - ChosenPairs.insert(*P); - - DenseMap<Value *, std::vector<Value *> > LoadMoveSet; - DenseSet<ValuePair> LoadMoveSetPairs; - collectLoadMoveSet(BB, PairableInsts, ChosenPairs, - LoadMoveSet, LoadMoveSetPairs); - - DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n"); - - for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) { - DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(&*PI); - if (P == ChosenPairs.end()) { - ++PI; - continue; - } - - if (getDepthFactor(P->first) == 0) { - // These instructions are not really fused, but are tracked as though - // they are. Any case in which it would be interesting to fuse them - // will be taken care of by InstCombine. - --NumFusedOps; - ++PI; - continue; - } - - Instruction *I = cast<Instruction>(P->first), - *J = cast<Instruction>(P->second); - - DEBUG(dbgs() << "BBV: fusing: " << *I << - " <-> " << *J << "\n"); - - // Remove the pair and flipped pair from the list. - DenseMap<Value *, Value *>::iterator FP = ChosenPairs.find(P->second); - assert(FP != ChosenPairs.end() && "Flipped pair not found in list"); - ChosenPairs.erase(FP); - ChosenPairs.erase(P); - - if (!canMoveUsesOfIAfterJ(BB, LoadMoveSetPairs, I, J)) { - DEBUG(dbgs() << "BBV: fusion of: " << *I << - " <-> " << *J << - " aborted because of non-trivial dependency cycle\n"); - --NumFusedOps; - ++PI; - continue; - } - - // If the pair must have the other order, then flip it. - bool FlipPairOrder = FixedOrderPairs.count(ValuePair(J, I)); - if (!FlipPairOrder && !FixedOrderPairs.count(ValuePair(I, J))) { - // This pair does not have a fixed order, and so we might want to - // flip it if that will yield fewer shuffles. We count the number - // of dependencies connected via swaps, and those directly connected, - // and flip the order if the number of swaps is greater. 
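The flip decision described in the comment above boils down to a count: dependencies that reach the pair through swapped connections argue for fusing in the opposite order, because flipping turns them into direct connections and saves shuffles (splat connections are not counted). A minimal sketch of that decision, with ConnectionType standing in for the pass's PairConnectionDirect/PairConnectionSwap/PairConnectionSplat tags:

#include <vector>

enum ConnectionType { Direct, Swap, Splat };

// Flip the pair if more of its dependencies are connected via swaps than
// directly; splat connections are ignored by the count.
bool shouldFlipPair(const std::vector<ConnectionType> &DepConnections) {
  unsigned NumDirect = 0, NumSwap = 0;
  for (ConnectionType C : DepConnections) {
    if (C == Direct)
      ++NumDirect;
    else if (C == Swap)
      ++NumSwap;
  }
  return NumSwap > NumDirect;
}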
- bool OrigOrder = true; - DenseMap<ValuePair, std::vector<ValuePair> >::iterator IJ = - ConnectedPairDeps.find(ValuePair(I, J)); - if (IJ == ConnectedPairDeps.end()) { - IJ = ConnectedPairDeps.find(ValuePair(J, I)); - OrigOrder = false; - } - - if (IJ != ConnectedPairDeps.end()) { - unsigned NumDepsDirect = 0, NumDepsSwap = 0; - for (std::vector<ValuePair>::iterator T = IJ->second.begin(), - TE = IJ->second.end(); T != TE; ++T) { - VPPair Q(IJ->first, *T); - DenseMap<VPPair, unsigned>::iterator R = - PairConnectionTypes.find(VPPair(Q.second, Q.first)); - assert(R != PairConnectionTypes.end() && - "Cannot find pair connection type"); - if (R->second == PairConnectionDirect) - ++NumDepsDirect; - else if (R->second == PairConnectionSwap) - ++NumDepsSwap; - } - - if (!OrigOrder) - std::swap(NumDepsDirect, NumDepsSwap); - - if (NumDepsSwap > NumDepsDirect) { - FlipPairOrder = true; - DEBUG(dbgs() << "BBV: reordering pair: " << *I << - " <-> " << *J << "\n"); - } - } - } - - Instruction *L = I, *H = J; - if (FlipPairOrder) - std::swap(H, L); - - // If the pair being fused uses the opposite order from that in the pair - // connection map, then we need to flip the types. - DenseMap<ValuePair, std::vector<ValuePair> >::iterator HL = - ConnectedPairs.find(ValuePair(H, L)); - if (HL != ConnectedPairs.end()) - for (std::vector<ValuePair>::iterator T = HL->second.begin(), - TE = HL->second.end(); T != TE; ++T) { - VPPair Q(HL->first, *T); - DenseMap<VPPair, unsigned>::iterator R = PairConnectionTypes.find(Q); - assert(R != PairConnectionTypes.end() && - "Cannot find pair connection type"); - if (R->second == PairConnectionDirect) - R->second = PairConnectionSwap; - else if (R->second == PairConnectionSwap) - R->second = PairConnectionDirect; - } - - bool LBeforeH = !FlipPairOrder; - unsigned NumOperands = I->getNumOperands(); - SmallVector<Value *, 3> ReplacedOperands(NumOperands); - getReplacementInputsForPair(Context, L, H, ReplacedOperands, - LBeforeH); - - // Make a copy of the original operation, change its type to the vector - // type and replace its operands with the vector operands. - Instruction *K = L->clone(); - if (L->hasName()) - K->takeName(L); - else if (H->hasName()) - K->takeName(H); - - if (auto CS = CallSite(K)) { - SmallVector<Type *, 3> Tys; - FunctionType *Old = CS.getFunctionType(); - unsigned NumOld = Old->getNumParams(); - assert(NumOld <= ReplacedOperands.size()); - for (unsigned i = 0; i != NumOld; ++i) - Tys.push_back(ReplacedOperands[i]->getType()); - CS.mutateFunctionType( - FunctionType::get(getVecTypeForPair(L->getType(), H->getType()), - Tys, Old->isVarArg())); - } else if (!isa<StoreInst>(K)) - K->mutateType(getVecTypeForPair(L->getType(), H->getType())); - - unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope, - LLVMContext::MD_noalias, LLVMContext::MD_fpmath, - LLVMContext::MD_invariant_group}; - combineMetadata(K, H, KnownIDs); - K->andIRFlags(H); - - for (unsigned o = 0; o < NumOperands; ++o) - K->setOperand(o, ReplacedOperands[o]); - - K->insertAfter(J); - - // Instruction insertion point: - Instruction *InsertionPt = K; - Instruction *K1 = nullptr, *K2 = nullptr; - replaceOutputsOfPair(Context, L, H, K, InsertionPt, K1, K2); - - // The use dag of the first original instruction must be moved to after - // the location of the second instruction. The entire use dag of the - // first instruction is disjoint from the input dag of the second - // (by definition), and so commutes with it. 
- - moveUsesOfIAfterJ(BB, LoadMoveSetPairs, InsertionPt, I, J); - - if (!isa<StoreInst>(I)) { - L->replaceAllUsesWith(K1); - H->replaceAllUsesWith(K2); - } - - // Instructions that may read from memory may be in the load move set. - // Once an instruction is fused, we no longer need its move set, and so - // the values of the map never need to be updated. However, when a load - // is fused, we need to merge the entries from both instructions in the - // pair in case those instructions were in the move set of some other - // yet-to-be-fused pair. The loads in question are the keys of the map. - if (I->mayReadFromMemory()) { - std::vector<ValuePair> NewSetMembers; - DenseMap<Value *, std::vector<Value *> >::iterator II = - LoadMoveSet.find(I); - if (II != LoadMoveSet.end()) - for (std::vector<Value *>::iterator N = II->second.begin(), - NE = II->second.end(); N != NE; ++N) - NewSetMembers.push_back(ValuePair(K, *N)); - DenseMap<Value *, std::vector<Value *> >::iterator JJ = - LoadMoveSet.find(J); - if (JJ != LoadMoveSet.end()) - for (std::vector<Value *>::iterator N = JJ->second.begin(), - NE = JJ->second.end(); N != NE; ++N) - NewSetMembers.push_back(ValuePair(K, *N)); - for (std::vector<ValuePair>::iterator A = NewSetMembers.begin(), - AE = NewSetMembers.end(); A != AE; ++A) { - LoadMoveSet[A->first].push_back(A->second); - LoadMoveSetPairs.insert(*A); - } - } - - // Before removing I, set the iterator to the next instruction. - PI = std::next(BasicBlock::iterator(I)); - if (cast<Instruction>(PI) == J) - ++PI; - - SE->forgetValue(I); - SE->forgetValue(J); - I->eraseFromParent(); - J->eraseFromParent(); - - DEBUG(if (PrintAfterEveryPair) dbgs() << "BBV: block is now: \n" << - BB << "\n"); - } - - DEBUG(dbgs() << "BBV: final: \n" << BB << "\n"); - } -} - -char BBVectorize::ID = 0; -static const char bb_vectorize_name[] = "Basic-Block Vectorization"; -INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) -INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) -INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) -INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) - -BasicBlockPass *llvm::createBBVectorizePass(const VectorizeConfig &C) { - return new BBVectorize(C); -} - -bool -llvm::vectorizeBasicBlock(Pass *P, BasicBlock &BB, const VectorizeConfig &C) { - BBVectorize BBVectorizer(P, *BB.getParent(), C); - return BBVectorizer.vectorizeBB(BB); -} - -//===----------------------------------------------------------------------===// -VectorizeConfig::VectorizeConfig() { - VectorBits = ::VectorBits; - VectorizeBools = !::NoBools; - VectorizeInts = !::NoInts; - VectorizeFloats = !::NoFloats; - VectorizePointers = !::NoPointers; - VectorizeCasts = !::NoCasts; - VectorizeMath = !::NoMath; - VectorizeBitManipulations = !::NoBitManipulation; - VectorizeFMA = !::NoFMA; - VectorizeSelect = !::NoSelect; - VectorizeCmp = !::NoCmp; - VectorizeGEP = !::NoGEP; - VectorizeMemOps = !::NoMemOps; - AlignedOnly = ::AlignedOnly; - ReqChainDepth= ::ReqChainDepth; - SearchLimit = ::SearchLimit; - MaxCandPairsForCycleCheck = ::MaxCandPairsForCycleCheck; - SplatBreaksChain = ::SplatBreaksChain; - MaxInsts = ::MaxInsts; - MaxPairs = ::MaxPairs; - MaxIter = 
::MaxIter; - Pow2LenOnly = ::Pow2LenOnly; - NoMemOpBoost = ::NoMemOpBoost; - FastDep = ::FastDep; -} diff --git a/lib/Transforms/Vectorize/CMakeLists.txt b/lib/Transforms/Vectorize/CMakeLists.txt index 395f440bda47..1aea73cd4a32 100644 --- a/lib/Transforms/Vectorize/CMakeLists.txt +++ b/lib/Transforms/Vectorize/CMakeLists.txt @@ -1,5 +1,4 @@ add_llvm_library(LLVMVectorize - BBVectorize.cpp LoadStoreVectorizer.cpp LoopVectorize.cpp SLPVectorizer.cpp diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index eac2867233bc..193cc4d13787 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -114,12 +114,13 @@ static cl::opt<bool> EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, cl::desc("Enable if-conversion during vectorization.")); -/// We don't vectorize loops with a known constant trip count below this number. +/// Loops with a known constant trip count below this number are vectorized only +/// if no scalar iteration overheads are incurred. static cl::opt<unsigned> TinyTripCountVectorThreshold( "vectorizer-min-trip-count", cl::init(16), cl::Hidden, - cl::desc("Don't vectorize loops with a constant " - "trip count that is smaller than this " - "value.")); + cl::desc("Loops with a constant trip count that is smaller than this " + "value are vectorized only if no scalar iteration overheads " + "are incurred.")); static cl::opt<bool> MaximizeBandwidth( "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, @@ -532,21 +533,34 @@ protected: /// Returns true if we should generate a scalar version of \p IV. bool needsScalarInduction(Instruction *IV) const; - /// Return a constant reference to the VectorParts corresponding to \p V from - /// the original loop. If the value has already been vectorized, the - /// corresponding vector entry in VectorLoopValueMap is returned. If, + /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a + /// vector or scalar value on-demand if one is not yet available. When + /// vectorizing a loop, we visit the definition of an instruction before its + /// uses. When visiting the definition, we either vectorize or scalarize the + /// instruction, creating an entry for it in the corresponding map. (In some + /// cases, such as induction variables, we will create both vector and scalar + /// entries.) Then, as we encounter uses of the definition, we derive values + /// for each scalar or vector use unless such a value is already available. + /// For example, if we scalarize a definition and one of its uses is vector, + /// we build the required vector on-demand with an insertelement sequence + /// when visiting the use. Otherwise, if the use is scalar, we can use the + /// existing scalar definition. + /// + /// Return a value in the new loop corresponding to \p V from the original + /// loop at unroll index \p Part. If the value has already been vectorized, + /// the corresponding vector entry in VectorLoopValueMap is returned. If, /// however, the value has a scalar entry in VectorLoopValueMap, we construct - /// new vector values on-demand by inserting the scalar values into vectors + /// a new vector value on-demand by inserting the scalar values into a vector /// with an insertelement sequence. If the value has been neither vectorized /// nor scalarized, it must be loop invariant, so we simply broadcast the - /// value into vectors. - const VectorParts &getVectorValue(Value *V); + /// value into a vector. 
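The comment above describes the new on-demand widening contract for LoopVectorize: values are created lazily per unroll part, cached, and broadcast only when nothing better exists. A minimal model of that per-part map, assuming a Broadcast callback stands in for the real scalar-to-vector widening paths and omitting the per-lane scalar side; the names mirror the patch, but this is a sketch, not the LLVM class:

#include <cassert>
#include <functional>
#include <map>
#include <utility>

// Each original value may have up to UF vector parts in the new loop,
// created lazily and cached on first use.
template <typename Value> class PerPartValueMap {
public:
  PerPartValueMap(unsigned UF, std::function<Value(Value)> Broadcast)
      : UF(UF), Broadcast(std::move(Broadcast)) {}

  bool hasVectorValue(Value Key, unsigned Part) const {
    assert(Part < UF && "Queried part is too large");
    return Storage.count({Key, Part});
  }

  // Entries may be set only once; later fix-ups would go through a separate
  // reset method, as in the patch's resetVectorValue.
  void setVectorValue(Value Key, unsigned Part, Value Vec) {
    assert(!hasVectorValue(Key, Part) && "Vector value already set for part");
    Storage[{Key, Part}] = Vec;
  }

  // On-demand creation: return the cached part if present, otherwise fall
  // back to broadcasting the (loop-invariant) original value and cache that.
  Value getOrCreateVectorValue(Value Key, unsigned Part) {
    if (hasVectorValue(Key, Part))
      return Storage[{Key, Part}];
    Value Vec = Broadcast(Key);
    setVectorValue(Key, Part, Vec);
    return Vec;
  }

private:
  unsigned UF;
  std::function<Value(Value)> Broadcast;
  std::map<std::pair<Value, unsigned>, Value> Storage;
};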
+ Value *getOrCreateVectorValue(Value *V, unsigned Part); /// Return a value in the new loop corresponding to \p V from the original /// loop at unroll index \p Part and vector index \p Lane. If the value has /// been vectorized but not scalarized, the necessary extractelement /// instruction will be generated. - Value *getScalarValue(Value *V, unsigned Part, unsigned Lane); + Value *getOrCreateScalarValue(Value *V, unsigned Part, unsigned Lane); /// Try to vectorize the interleaved access group that \p Instr belongs to. void vectorizeInterleaveGroup(Instruction *Instr); @@ -601,90 +615,103 @@ protected: /// UF x VF scalar values in the new loop. UF and VF are the unroll and /// vectorization factors, respectively. /// - /// Entries can be added to either map with initVector and initScalar, which - /// initialize and return a constant reference to the new entry. If a - /// non-constant reference to a vector entry is required, getVector can be - /// used to retrieve a mutable entry. We currently directly modify the mapped - /// values during "fix-up" operations that occur once the first phase of - /// widening is complete. These operations include type truncation and the - /// second phase of recurrence widening. + /// Entries can be added to either map with setVectorValue and setScalarValue, + /// which assert that an entry was not already added before. If an entry is to + /// replace an existing one, call resetVectorValue. This is currently needed + /// to modify the mapped values during "fix-up" operations that occur once the + /// first phase of widening is complete. These operations include type + /// truncation and the second phase of recurrence widening. /// - /// Otherwise, entries from either map should be accessed using the - /// getVectorValue or getScalarValue functions from InnerLoopVectorizer. - /// getVectorValue and getScalarValue coordinate to generate a vector or - /// scalar value on-demand if one is not yet available. When vectorizing a - /// loop, we visit the definition of an instruction before its uses. When - /// visiting the definition, we either vectorize or scalarize the - /// instruction, creating an entry for it in the corresponding map. (In some - /// cases, such as induction variables, we will create both vector and scalar - /// entries.) Then, as we encounter uses of the definition, we derive values - /// for each scalar or vector use unless such a value is already available. - /// For example, if we scalarize a definition and one of its uses is vector, - /// we build the required vector on-demand with an insertelement sequence - /// when visiting the use. Otherwise, if the use is scalar, we can use the - /// existing scalar definition. + /// Entries from either map can be retrieved using the getVectorValue and + /// getScalarValue functions, which assert that the desired value exists. + struct ValueMap { /// Construct an empty map with the given unroll and vectorization factors. - ValueMap(unsigned UnrollFactor, unsigned VecWidth) - : UF(UnrollFactor), VF(VecWidth) { - // The unroll and vectorization factors are only used in asserts builds - // to verify map entries are sized appropriately. - (void)UF; - (void)VF; + ValueMap(unsigned UF, unsigned VF) : UF(UF), VF(VF) {} + + /// \return True if the map has any vector entry for \p Key. + bool hasAnyVectorValue(Value *Key) const { + return VectorMapStorage.count(Key); + } + + /// \return True if the map has a vector entry for \p Key and \p Part. 
+ bool hasVectorValue(Value *Key, unsigned Part) const { + assert(Part < UF && "Queried Vector Part is too large."); + if (!hasAnyVectorValue(Key)) + return false; + const VectorParts &Entry = VectorMapStorage.find(Key)->second; + assert(Entry.size() == UF && "VectorParts has wrong dimensions."); + return Entry[Part] != nullptr; } - /// \return True if the map has a vector entry for \p Key. - bool hasVector(Value *Key) const { return VectorMapStorage.count(Key); } - - /// \return True if the map has a scalar entry for \p Key. - bool hasScalar(Value *Key) const { return ScalarMapStorage.count(Key); } - - /// \brief Map \p Key to the given VectorParts \p Entry, and return a - /// constant reference to the new vector map entry. The given key should - /// not already be in the map, and the given VectorParts should be - /// correctly sized for the current unroll factor. - const VectorParts &initVector(Value *Key, const VectorParts &Entry) { - assert(!hasVector(Key) && "Vector entry already initialized"); - assert(Entry.size() == UF && "VectorParts has wrong dimensions"); - VectorMapStorage[Key] = Entry; - return VectorMapStorage[Key]; + /// \return True if the map has any scalar entry for \p Key. + bool hasAnyScalarValue(Value *Key) const { + return ScalarMapStorage.count(Key); } - /// \brief Map \p Key to the given ScalarParts \p Entry, and return a - /// constant reference to the new scalar map entry. The given key should - /// not already be in the map, and the given ScalarParts should be - /// correctly sized for the current unroll and vectorization factors. - const ScalarParts &initScalar(Value *Key, const ScalarParts &Entry) { - assert(!hasScalar(Key) && "Scalar entry already initialized"); - assert(Entry.size() == UF && - all_of(make_range(Entry.begin(), Entry.end()), - [&](const SmallVectorImpl<Value *> &Values) -> bool { - return Values.size() == VF; - }) && - "ScalarParts has wrong dimensions"); - ScalarMapStorage[Key] = Entry; - return ScalarMapStorage[Key]; + /// \return True if the map has a scalar entry for \p Key, \p Part and + /// \p Part. + bool hasScalarValue(Value *Key, unsigned Part, unsigned Lane) const { + assert(Part < UF && "Queried Scalar Part is too large."); + assert(Lane < VF && "Queried Scalar Lane is too large."); + if (!hasAnyScalarValue(Key)) + return false; + const ScalarParts &Entry = ScalarMapStorage.find(Key)->second; + assert(Entry.size() == UF && "ScalarParts has wrong dimensions."); + assert(Entry[Part].size() == VF && "ScalarParts has wrong dimensions."); + return Entry[Part][Lane] != nullptr; } - /// \return A reference to the vector map entry corresponding to \p Key. - /// The key should already be in the map. This function should only be used - /// when it's necessary to update values that have already been vectorized. - /// This is the case for "fix-up" operations including type truncation and - /// the second phase of recurrence vectorization. If a non-const reference - /// isn't required, getVectorValue should be used instead. - VectorParts &getVector(Value *Key) { - assert(hasVector(Key) && "Vector entry not initialized"); - return VectorMapStorage.find(Key)->second; + /// Retrieve the existing vector value that corresponds to \p Key and + /// \p Part. + Value *getVectorValue(Value *Key, unsigned Part) { + assert(hasVectorValue(Key, Part) && "Getting non-existent value."); + return VectorMapStorage[Key][Part]; } - /// Retrieve an entry from the vector or scalar maps. 
The preferred way to - /// access an existing mapped entry is with getVectorValue or - /// getScalarValue from InnerLoopVectorizer. Until those functions can be - /// moved inside ValueMap, we have to declare them as friends. - friend const VectorParts &InnerLoopVectorizer::getVectorValue(Value *V); - friend Value *InnerLoopVectorizer::getScalarValue(Value *V, unsigned Part, - unsigned Lane); + /// Retrieve the existing scalar value that corresponds to \p Key, \p Part + /// and \p Lane. + Value *getScalarValue(Value *Key, unsigned Part, unsigned Lane) { + assert(hasScalarValue(Key, Part, Lane) && "Getting non-existent value."); + return ScalarMapStorage[Key][Part][Lane]; + } + + /// Set a vector value associated with \p Key and \p Part. Assumes such a + /// value is not already set. If it is, use resetVectorValue() instead. + void setVectorValue(Value *Key, unsigned Part, Value *Vector) { + assert(!hasVectorValue(Key, Part) && "Vector value already set for part"); + if (!VectorMapStorage.count(Key)) { + VectorParts Entry(UF); + VectorMapStorage[Key] = Entry; + } + VectorMapStorage[Key][Part] = Vector; + } + + /// Set a scalar value associated with \p Key for \p Part and \p Lane. + /// Assumes such a value is not already set. + void setScalarValue(Value *Key, unsigned Part, unsigned Lane, + Value *Scalar) { + assert(!hasScalarValue(Key, Part, Lane) && "Scalar value already set"); + if (!ScalarMapStorage.count(Key)) { + ScalarParts Entry(UF); + for (unsigned Part = 0; Part < UF; ++Part) + Entry[Part].resize(VF, nullptr); + // TODO: Consider storing uniform values only per-part, as they occupy + // lane 0 only, keeping the other VF-1 redundant entries null. + ScalarMapStorage[Key] = Entry; + } + ScalarMapStorage[Key][Part][Lane] = Scalar; + } + + /// Reset the vector value associated with \p Key for the given \p Part. + /// This function can be used to update values that have already been + /// vectorized. This is the case for "fix-up" operations including type + /// truncation and the second phase of recurrence vectorization. + void resetVectorValue(Value *Key, unsigned Part, Value *Vector) { + assert(hasVectorValue(Key, Part) && "Vector value not set for part"); + VectorMapStorage[Key][Part] = Vector; + } private: /// The unroll factor. Each entry in the vector map contains UF vector @@ -1577,6 +1604,9 @@ public: /// Return the first-order recurrences found in the loop. RecurrenceSet *getFirstOrderRecurrences() { return &FirstOrderRecurrences; } + /// Return the set of instructions to sink to handle first-order recurrences. + DenseMap<Instruction *, Instruction *> &getSinkAfter() { return SinkAfter; } + /// Returns the widest induction type. Type *getWidestInductionType() { return WidestIndTy; } @@ -1779,6 +1809,9 @@ private: InductionList Inductions; /// Holds the phi nodes that are first-order recurrences. RecurrenceSet FirstOrderRecurrences; + /// Holds instructions that need to sink past other instructions to handle + /// first-order recurrences. + DenseMap<Instruction *, Instruction *> SinkAfter; /// Holds the widest induction type encountered. 
Type *WidestIndTy; @@ -2417,15 +2450,13 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", &*LoopVectorBody->getFirstInsertionPt()); Instruction *LastInduction = VecInd; - VectorParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) { - Entry[Part] = LastInduction; + VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); + if (isa<TruncInst>(EntryVal)) + addMetadata(LastInduction, EntryVal); LastInduction = cast<Instruction>(addFastMathFlag( Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); } - VectorLoopValueMap.initVector(EntryVal, Entry); - if (isa<TruncInst>(EntryVal)) - addMetadata(Entry, EntryVal); // Move the last step to the end of the latch block. This ensures consistent // placement of all induction updates. @@ -2531,13 +2562,13 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { // induction variable, and build the necessary step vectors. if (!VectorizedIV) { Value *Broadcasted = getBroadcastInstrs(ScalarIV); - VectorParts Entry(UF); - for (unsigned Part = 0; Part < UF; ++Part) - Entry[Part] = + for (unsigned Part = 0; Part < UF; ++Part) { + Value *EntryPart = getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode()); - VectorLoopValueMap.initVector(EntryVal, Entry); - if (Trunc) - addMetadata(Entry, Trunc); + VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); + if (Trunc) + addMetadata(EntryPart, Trunc); + } } // If an induction variable is only used for counting loop iterations or @@ -2637,17 +2668,14 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 : VF; // Compute the scalar steps and save the results in VectorLoopValueMap. - ScalarParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) { - Entry[Part].resize(VF); for (unsigned Lane = 0; Lane < Lanes; ++Lane) { auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane); auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); - Entry[Part][Lane] = Add; + VectorLoopValueMap.setScalarValue(EntryVal, Part, Lane, Add); } } - VectorLoopValueMap.initScalar(EntryVal, Entry); } int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { @@ -2665,8 +2693,7 @@ bool LoopVectorizationLegality::isUniform(Value *V) { return LAI->isUniform(V); } -const InnerLoopVectorizer::VectorParts & -InnerLoopVectorizer::getVectorValue(Value *V) { +Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { assert(V != Induction && "The new induction variable should not be used."); assert(!V->getType()->isVectorTy() && "Can't widen a vector"); assert(!V->getType()->isVoidTy() && "Type does not produce a value"); @@ -2675,17 +2702,16 @@ InnerLoopVectorizer::getVectorValue(Value *V) { if (Legal->hasStride(V)) V = ConstantInt::get(V->getType(), 1); - // If we have this scalar in the map, return it. - if (VectorLoopValueMap.hasVector(V)) - return VectorLoopValueMap.VectorMapStorage[V]; + // If we have a vector mapped to this value, return it. + if (VectorLoopValueMap.hasVectorValue(V, Part)) + return VectorLoopValueMap.getVectorValue(V, Part); // If the value has not been vectorized, check if it has been scalarized // instead. If it has been scalarized, and we actually need the value in // vector form, we will construct the vector values on demand. 
- if (VectorLoopValueMap.hasScalar(V)) { + if (VectorLoopValueMap.hasAnyScalarValue(V)) { - // Initialize a new vector map entry. - VectorParts Entry(UF); + Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, Part, 0); // If we've scalarized a value, that value should be an instruction. auto *I = cast<Instruction>(V); @@ -2693,17 +2719,17 @@ InnerLoopVectorizer::getVectorValue(Value *V) { // If we aren't vectorizing, we can just copy the scalar map values over to // the vector map. if (VF == 1) { - for (unsigned Part = 0; Part < UF; ++Part) - Entry[Part] = getScalarValue(V, Part, 0); - return VectorLoopValueMap.initVector(V, Entry); + VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); + return ScalarValue; } - // Get the last scalar instruction we generated for V. If the value is - // known to be uniform after vectorization, this corresponds to lane zero - // of the last unroll iteration. Otherwise, the last instruction is the one - // we created for the last vector lane of the last unroll iteration. + // Get the last scalar instruction we generated for V and Part. If the value + // is known to be uniform after vectorization, this corresponds to lane zero + // of the Part unroll iteration. Otherwise, the last instruction is the one + // we created for the last vector lane of the Part unroll iteration. unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1; - auto *LastInst = cast<Instruction>(getScalarValue(V, UF - 1, LastLane)); + auto *LastInst = + cast<Instruction>(VectorLoopValueMap.getScalarValue(V, Part, LastLane)); // Set the insert point after the last scalarized instruction. This ensures // the insertelement sequence will directly follow the scalar definitions. @@ -2717,52 +2743,50 @@ InnerLoopVectorizer::getVectorValue(Value *V) { // iteration. Otherwise, we construct the vector values using insertelement // instructions. Since the resulting vectors are stored in // VectorLoopValueMap, we will only generate the insertelements once. - for (unsigned Part = 0; Part < UF; ++Part) { - Value *VectorValue = nullptr; - if (Cost->isUniformAfterVectorization(I, VF)) { - VectorValue = getBroadcastInstrs(getScalarValue(V, Part, 0)); - } else { - VectorValue = UndefValue::get(VectorType::get(V->getType(), VF)); - for (unsigned Lane = 0; Lane < VF; ++Lane) - VectorValue = Builder.CreateInsertElement( - VectorValue, getScalarValue(V, Part, Lane), - Builder.getInt32(Lane)); - } - Entry[Part] = VectorValue; + Value *VectorValue = nullptr; + if (Cost->isUniformAfterVectorization(I, VF)) { + VectorValue = getBroadcastInstrs(ScalarValue); + } else { + VectorValue = UndefValue::get(VectorType::get(V->getType(), VF)); + for (unsigned Lane = 0; Lane < VF; ++Lane) + VectorValue = Builder.CreateInsertElement( + VectorValue, getOrCreateScalarValue(V, Part, Lane), + Builder.getInt32(Lane)); } + VectorLoopValueMap.setVectorValue(V, Part, VectorValue); Builder.restoreIP(OldIP); - return VectorLoopValueMap.initVector(V, Entry); + return VectorValue; } // If this scalar is unknown, assume that it is a constant or that it is // loop invariant. Broadcast V and save the value for future uses. 
Value *B = getBroadcastInstrs(V); - return VectorLoopValueMap.initVector(V, VectorParts(UF, B)); + VectorLoopValueMap.setVectorValue(V, Part, B); + return B; } -Value *InnerLoopVectorizer::getScalarValue(Value *V, unsigned Part, - unsigned Lane) { +Value *InnerLoopVectorizer::getOrCreateScalarValue(Value *V, unsigned Part, + unsigned Lane) { // If the value is not an instruction contained in the loop, it should // already be scalar. if (OrigLoop->isLoopInvariant(V)) return V; - assert(Lane > 0 ? - !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) - : true && "Uniform values only have lane zero"); + assert(Lane > 0 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) + : true && "Uniform values only have lane zero"); // If the value from the original loop has not been vectorized, it is // represented by UF x VF scalar values in the new loop. Return the requested // scalar value. - if (VectorLoopValueMap.hasScalar(V)) - return VectorLoopValueMap.ScalarMapStorage[V][Part][Lane]; + if (VectorLoopValueMap.hasScalarValue(V, Part, Lane)) + return VectorLoopValueMap.getScalarValue(V, Part, Lane); // If the value has not been scalarized, get its entry in VectorLoopValueMap // for the given unroll part. If this entry is not a vector type (i.e., the // vectorization factor is one), there is no need to generate an // extractelement instruction. - auto *U = getVectorValue(V)[Part]; + auto *U = getOrCreateVectorValue(V, Part); if (!U->getType()->isVectorTy()) { assert(VF == 1 && "Value not scalarized has non-vector type"); return U; @@ -2844,7 +2868,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { Index += (VF - 1) * Group->getFactor(); for (unsigned Part = 0; Part < UF; Part++) { - Value *NewPtr = getScalarValue(Ptr, Part, 0); + Value *NewPtr = getOrCreateScalarValue(Ptr, Part, 0); // Notice current instruction could be any index. Need to adjust the address // to the member of index 0. @@ -2887,7 +2911,6 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { if (!Member) continue; - VectorParts Entry(UF); Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF); for (unsigned Part = 0; Part < UF; Part++) { Value *StridedVec = Builder.CreateShuffleVector( @@ -2899,10 +2922,11 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { StridedVec = Builder.CreateBitOrPointerCast(StridedVec, OtherVTy); } - Entry[Part] = - Group->isReverse() ? reverseVector(StridedVec) : StridedVec; + if (Group->isReverse()) + StridedVec = reverseVector(StridedVec); + + VectorLoopValueMap.setVectorValue(Member, Part, StridedVec); } - VectorLoopValueMap.initVector(Member, Entry); } return; } @@ -2919,8 +2943,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { Instruction *Member = Group->getMember(i); assert(Member && "Fail to get a member from an interleaved store group"); - Value *StoredVec = - getVectorValue(cast<StoreInst>(Member)->getValueOperand())[Part]; + Value *StoredVec = getOrCreateVectorValue( + cast<StoreInst>(Member)->getValueOperand(), Part); if (Group->isReverse()) StoredVec = reverseVector(StoredVec); @@ -2981,16 +3005,14 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { bool CreateGatherScatter = (Decision == LoopVectorizationCostModel::CM_GatherScatter); - VectorParts VectorGep; + // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector + // gather/scatter. Otherwise Decision should have been to Scalarize. 
+ assert((ConsecutiveStride || CreateGatherScatter) && + "The instruction should be scalarized"); // Handle consecutive loads/stores. - if (ConsecutiveStride) { - Ptr = getScalarValue(Ptr, 0, 0); - } else { - // At this point we should vector version of GEP for Gather or Scatter - assert(CreateGatherScatter && "The instruction should be scalarized"); - VectorGep = getVectorValue(Ptr); - } + if (ConsecutiveStride) + Ptr = getOrCreateScalarValue(Ptr, 0, 0); VectorParts Mask = createBlockInMask(Instr->getParent()); // Handle Stores: @@ -2998,16 +3020,15 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { assert(!Legal->isUniform(SI->getPointerOperand()) && "We do not allow storing to uniform addresses"); setDebugLocFromInst(Builder, SI); - // We don't want to update the value in the map as it might be used in - // another expression. So don't use a reference type for "StoredVal". - VectorParts StoredVal = getVectorValue(SI->getValueOperand()); for (unsigned Part = 0; Part < UF; ++Part) { Instruction *NewSI = nullptr; + Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part); if (CreateGatherScatter) { Value *MaskPart = Legal->isMaskRequired(SI) ? Mask[Part] : nullptr; - NewSI = Builder.CreateMaskedScatter(StoredVal[Part], VectorGep[Part], - Alignment, MaskPart); + Value *VectorGep = getOrCreateVectorValue(Ptr, Part); + NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, + MaskPart); } else { // Calculate the pointer for the specific unroll-part. Value *PartPtr = @@ -3016,7 +3037,10 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { if (Reverse) { // If we store to reverse consecutive memory locations, then we need // to reverse the order of elements in the stored value. - StoredVal[Part] = reverseVector(StoredVal[Part]); + StoredVal = reverseVector(StoredVal); + // We don't want to update the value in the map as it might be used in + // another expression. So don't call resetVectorValue(StoredVal). + // If the address is consecutive but reversed, then the // wide store needs to start at the last vector element. PartPtr = @@ -3030,11 +3054,10 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); if (Legal->isMaskRequired(SI)) - NewSI = Builder.CreateMaskedStore(StoredVal[Part], VecPtr, Alignment, + NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, Mask[Part]); else - NewSI = - Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment); + NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); } addMetadata(NewSI, SI); } @@ -3044,14 +3067,14 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { // Handle loads. assert(LI && "Must have a load instruction"); setDebugLocFromInst(Builder, LI); - VectorParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) { - Instruction *NewLI; + Value *NewLI; if (CreateGatherScatter) { Value *MaskPart = Legal->isMaskRequired(LI) ? Mask[Part] : nullptr; - NewLI = Builder.CreateMaskedGather(VectorGep[Part], Alignment, MaskPart, + Value *VectorGep = getOrCreateVectorValue(Ptr, Part); + NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, nullptr, "wide.masked.gather"); - Entry[Part] = NewLI; + addMetadata(NewLI, LI); } else { // Calculate the pointer for the specific unroll-part. 
Value *PartPtr = @@ -3073,11 +3096,14 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { "wide.masked.load"); else NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load"); - Entry[Part] = Reverse ? reverseVector(NewLI) : NewLI; + + // Add metadata to the load, but setVectorValue to the reverse shuffle. + addMetadata(NewLI, LI); + if (Reverse) + NewLI = reverseVector(NewLI); } - addMetadata(NewLI, LI); + VectorLoopValueMap.setVectorValue(Instr, Part, NewLI); } - VectorLoopValueMap.initVector(Instr, Entry); } void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, @@ -3094,9 +3120,6 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, // Does this instruction return a value ? bool IsVoidRetTy = Instr->getType()->isVoidTy(); - // Initialize a new scalar map entry. - ScalarParts Entry(UF); - VectorParts Cond; if (IfPredicateInstr) Cond = createBlockInMask(Instr->getParent()); @@ -3108,7 +3131,6 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, // For each vector unroll 'part': for (unsigned Part = 0; Part < UF; ++Part) { - Entry[Part].resize(VF); // For each scalar that we create: for (unsigned Lane = 0; Lane < Lanes; ++Lane) { @@ -3129,7 +3151,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, // Replace the operands of the cloned instructions with their scalar // equivalents in the new loop. for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { - auto *NewOp = getScalarValue(Instr->getOperand(op), Part, Lane); + auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Part, Lane); Cloned->setOperand(op, NewOp); } addNewMetadata(Cloned, Instr); @@ -3138,7 +3160,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, Builder.Insert(Cloned); // Add the cloned scalar to the scalar map entry. - Entry[Part][Lane] = Cloned; + VectorLoopValueMap.setScalarValue(Instr, Part, Lane, Cloned); // If we just cloned a new assumption, add it the assumption cache. if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) @@ -3150,7 +3172,6 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp)); } } - VectorLoopValueMap.initScalar(Instr, Entry); } PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, @@ -3786,10 +3807,10 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { // If the value wasn't vectorized, we must maintain the original scalar // type. The absence of the value from VectorLoopValueMap indicates that it // wasn't vectorized. - if (!VectorLoopValueMap.hasVector(KV.first)) + if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) continue; - VectorParts &Parts = VectorLoopValueMap.getVector(KV.first); - for (Value *&I : Parts) { + for (unsigned Part = 0; Part < UF; ++Part) { + Value *I = getOrCreateVectorValue(KV.first, Part); if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) continue; Type *OriginalTy = I->getType(); @@ -3878,7 +3899,7 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { I->replaceAllUsesWith(Res); cast<Instruction>(I)->eraseFromParent(); Erased.insert(I); - I = Res; + VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); } } @@ -3887,15 +3908,15 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { // If the value wasn't vectorized, we must maintain the original scalar // type. The absence of the value from VectorLoopValueMap indicates that it // wasn't vectorized. 
- if (!VectorLoopValueMap.hasVector(KV.first)) + if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) continue; - VectorParts &Parts = VectorLoopValueMap.getVector(KV.first); - for (Value *&I : Parts) { + for (unsigned Part = 0; Part < UF; ++Part) { + Value *I = getOrCreateVectorValue(KV.first, Part); ZExtInst *Inst = dyn_cast<ZExtInst>(I); if (Inst && Inst->use_empty()) { Value *NewI = Inst->getOperand(0); Inst->eraseFromParent(); - I = NewI; + VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI); } } } @@ -4025,28 +4046,29 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // We constructed a temporary phi node in the first phase of vectorization. // This phi node will eventually be deleted. - VectorParts &PhiParts = VectorLoopValueMap.getVector(Phi); - Builder.SetInsertPoint(cast<Instruction>(PhiParts[0])); + Builder.SetInsertPoint( + cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); // Create a phi node for the new recurrence. The current value will either be // the initial value inserted into a vector or loop-varying vector value. auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); - // Get the vectorized previous value. - auto &PreviousParts = getVectorValue(Previous); + // Get the vectorized previous value of the last part UF - 1. It appears last + // among all unrolled iterations, due to the order of their construction. + Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); // Set the insertion point after the previous value if it is an instruction. // Note that the previous value may have been constant-folded so it is not // guaranteed to be an instruction in the vector loop. Also, if the previous // value is a phi node, we should insert after all the phi nodes to avoid // breaking basic block verification. - if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousParts[UF - 1]) || - isa<PHINode>(PreviousParts[UF - 1])) + if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) || + isa<PHINode>(PreviousLastPart)) Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); else Builder.SetInsertPoint( - &*++BasicBlock::iterator(cast<Instruction>(PreviousParts[UF - 1]))); + &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart))); // We will construct a vector for the recurrence by combining the values for // the current and previous iterations. This is the required shuffle mask. @@ -4061,15 +4083,16 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // Shuffle the current and previous vector and update the vector parts. for (unsigned Part = 0; Part < UF; ++Part) { + Value *PreviousPart = getOrCreateVectorValue(Previous, Part); + Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); auto *Shuffle = - VF > 1 - ? Builder.CreateShuffleVector(Incoming, PreviousParts[Part], - ConstantVector::get(ShuffleMask)) - : Incoming; - PhiParts[Part]->replaceAllUsesWith(Shuffle); - cast<Instruction>(PhiParts[Part])->eraseFromParent(); - PhiParts[Part] = Shuffle; - Incoming = PreviousParts[Part]; + VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart, + ConstantVector::get(ShuffleMask)) + : Incoming; + PhiPart->replaceAllUsesWith(Shuffle); + cast<Instruction>(PhiPart)->eraseFromParent(); + VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); + Incoming = PreviousPart; } // Fix the latch value of the new recurrence in the vector loop. 
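The shuffle mask used in the recurrence hunk above combines the last lane of the vector carried in from the previous part with the first VF-1 lanes of the current part. Only its use appears in the surrounding hunks, so the construction below is a sketch (helper name hypothetical, with Builder and VF as in the enclosing function):

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"

// Build the mask {VF-1, VF, VF+1, ..., 2*VF-2}: lane 0 reads the last element
// of the first shuffle operand, lanes 1..VF-1 read elements 0..VF-2 of the
// second operand.
static llvm::Constant *makeRecurrenceMask(llvm::IRBuilder<> &Builder,
                                          unsigned VF) {
  llvm::SmallVector<llvm::Constant *, 8> Mask(VF);
  Mask[0] = Builder.getInt32(VF - 1);
  for (unsigned I = 1; I < VF; ++I)
    Mask[I] = Builder.getInt32(I + VF - 1);
  return llvm::ConstantVector::get(Mask);
}
// Used as: Builder.CreateShuffleVector(Incoming, PreviousPart,
//                                      makeRecurrenceMask(Builder, VF));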
@@ -4097,7 +4120,7 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // `Incoming`. This is analogous to the vectorized case above: extracting the // second last element when VF > 1. else if (UF > 1) - ExtractForPhiUsedOutsideLoop = PreviousParts[UF - 2]; + ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2); // Fix the initial value of the original recurrence in the scalar loop. Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); @@ -4148,8 +4171,7 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator()); // This is the vector-clone of the value that leaves the loop. - const VectorParts &VectorExit = getVectorValue(LoopExitInst); - Type *VecTy = VectorExit[0]->getType(); + Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType(); // Find the reduction identity variable. Zero for addition, or, xor, // one for multiplication, -1 for And. @@ -4187,18 +4209,17 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // Reductions do not have to start at zero. They can start with // any loop invariant values. - const VectorParts &VecRdxPhi = getVectorValue(Phi); BasicBlock *Latch = OrigLoop->getLoopLatch(); Value *LoopVal = Phi->getIncomingValueForBlock(Latch); - const VectorParts &Val = getVectorValue(LoopVal); - for (unsigned part = 0; part < UF; ++part) { + for (unsigned Part = 0; Part < UF; ++Part) { + Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part); + Value *Val = getOrCreateVectorValue(LoopVal, Part); // Make sure to add the reduction stat value only to the // first unroll part. - Value *StartVal = (part == 0) ? VectorStart : Identity; - cast<PHINode>(VecRdxPhi[part]) - ->addIncoming(StartVal, LoopVectorPreHeader); - cast<PHINode>(VecRdxPhi[part]) - ->addIncoming(Val[part], LI->getLoopFor(LoopVectorBody)->getLoopLatch()); + Value *StartVal = (Part == 0) ? VectorStart : Identity; + cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader); + cast<PHINode>(VecRdxPhi) + ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); } // Before each round, move the insertion point right between @@ -4207,7 +4228,6 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // instructions. Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); - VectorParts &RdxParts = VectorLoopValueMap.getVector(LoopExitInst); setDebugLocFromInst(Builder, LoopExitInst); // If the vector reduction can be performed in a smaller type, we truncate @@ -4216,37 +4236,42 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) { Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); Builder.SetInsertPoint(LoopVectorBody->getTerminator()); - for (unsigned part = 0; part < UF; ++part) { - Value *Trunc = Builder.CreateTrunc(RdxParts[part], RdxVecTy); + VectorParts RdxParts(UF); + for (unsigned Part = 0; Part < UF; ++Part) { + RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); + Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); Value *Extnd = RdxDesc.isSigned() ? 
Builder.CreateSExt(Trunc, VecTy) - : Builder.CreateZExt(Trunc, VecTy); - for (Value::user_iterator UI = RdxParts[part]->user_begin(); - UI != RdxParts[part]->user_end();) + : Builder.CreateZExt(Trunc, VecTy); + for (Value::user_iterator UI = RdxParts[Part]->user_begin(); + UI != RdxParts[Part]->user_end();) if (*UI != Trunc) { - (*UI++)->replaceUsesOfWith(RdxParts[part], Extnd); - RdxParts[part] = Extnd; + (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); + RdxParts[Part] = Extnd; } else { ++UI; } } Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); - for (unsigned part = 0; part < UF; ++part) - RdxParts[part] = Builder.CreateTrunc(RdxParts[part], RdxVecTy); + for (unsigned Part = 0; Part < UF; ++Part) { + RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); + VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); + } } // Reduce all of the unrolled parts into a single vector. - Value *ReducedPartRdx = RdxParts[0]; + Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); setDebugLocFromInst(Builder, ReducedPartRdx); - for (unsigned part = 1; part < UF; ++part) { + for (unsigned Part = 1; Part < UF; ++Part) { + Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); if (Op != Instruction::ICmp && Op != Instruction::FCmp) // Floating point operations had to be 'fast' to enable the reduction. ReducedPartRdx = addFastMathFlag( - Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part], + Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx")); else ReducedPartRdx = RecurrenceDescriptor::createMinMaxOp( - Builder, MinMaxKind, ReducedPartRdx, RdxParts[part]); + Builder, MinMaxKind, ReducedPartRdx, RdxPart); } if (VF > 1) { @@ -4518,14 +4543,16 @@ InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { assert(BI && "Unexpected terminator found"); if (BI->isConditional()) { - VectorParts EdgeMask = getVectorValue(BI->getCondition()); - if (BI->getSuccessor(0) != Dst) - for (unsigned part = 0; part < UF; ++part) - EdgeMask[part] = Builder.CreateNot(EdgeMask[part]); + VectorParts EdgeMask(UF); + for (unsigned Part = 0; Part < UF; ++Part) { + auto *EdgeMaskPart = getOrCreateVectorValue(BI->getCondition(), Part); + if (BI->getSuccessor(0) != Dst) + EdgeMaskPart = Builder.CreateNot(EdgeMaskPart); - for (unsigned part = 0; part < UF; ++part) - EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]); + EdgeMaskPart = Builder.CreateAnd(EdgeMaskPart, SrcMask[Part]); + EdgeMask[Part] = EdgeMaskPart; + } EdgeMaskCache[Edge] = EdgeMask; return EdgeMask; @@ -4544,23 +4571,27 @@ InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { if (BCEntryIt != BlockMaskCache.end()) return BCEntryIt->second; + VectorParts BlockMask(UF); + // Loop incoming mask is all-one. if (OrigLoop->getHeader() == BB) { Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1); - const VectorParts &BlockMask = getVectorValue(C); + for (unsigned Part = 0; Part < UF; ++Part) + BlockMask[Part] = getOrCreateVectorValue(C, Part); BlockMaskCache[BB] = BlockMask; return BlockMask; } // This is the block mask. We OR all incoming edges, and with zero. 
Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0); - VectorParts BlockMask = getVectorValue(Zero); + for (unsigned Part = 0; Part < UF; ++Part) + BlockMask[Part] = getOrCreateVectorValue(Zero, Part); // For each pred: - for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) { - VectorParts EM = createEdgeMask(*it, BB); - for (unsigned part = 0; part < UF; ++part) - BlockMask[part] = Builder.CreateOr(BlockMask[part], EM[part]); + for (pred_iterator It = pred_begin(BB), E = pred_end(BB); It != E; ++It) { + VectorParts EM = createEdgeMask(*It, BB); + for (unsigned Part = 0; Part < UF; ++Part) + BlockMask[Part] = Builder.CreateOr(BlockMask[Part], EM[Part]); } BlockMaskCache[BB] = BlockMask; @@ -4575,15 +4606,14 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, // stage #1: We create a new vector PHI node with no incoming edges. We'll use // this value when we vectorize all of the instructions that use the PHI. if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { - VectorParts Entry(UF); - for (unsigned part = 0; part < UF; ++part) { + for (unsigned Part = 0; Part < UF; ++Part) { // This is phase one of vectorizing PHIs. Type *VecTy = (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); - Entry[part] = PHINode::Create( + Value *EntryPart = PHINode::Create( VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); + VectorLoopValueMap.setVectorValue(P, Part, EntryPart); } - VectorLoopValueMap.initVector(P, Entry); return; } @@ -4607,21 +4637,22 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, for (unsigned In = 0; In < NumIncoming; In++) { VectorParts Cond = createEdgeMask(P->getIncomingBlock(In), P->getParent()); - const VectorParts &In0 = getVectorValue(P->getIncomingValue(In)); - for (unsigned part = 0; part < UF; ++part) { + for (unsigned Part = 0; Part < UF; ++Part) { + Value *In0 = getOrCreateVectorValue(P->getIncomingValue(In), Part); // We might have single edge PHIs (blocks) - use an identity // 'select' for the first PHI operand. if (In == 0) - Entry[part] = Builder.CreateSelect(Cond[part], In0[part], In0[part]); + Entry[Part] = Builder.CreateSelect(Cond[Part], In0, In0); else // Select between the current value and the previous incoming edge // based on the incoming mask. - Entry[part] = Builder.CreateSelect(Cond[part], In0[part], Entry[part], + Entry[Part] = Builder.CreateSelect(Cond[Part], In0, Entry[Part], "predphi"); } } - VectorLoopValueMap.initVector(P, Entry); + for (unsigned Part = 0; Part < UF; ++Part) + VectorLoopValueMap.setVectorValue(P, Part, Entry[Part]); return; } @@ -4652,18 +4683,15 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF; // These are the scalar results. Notice that we don't generate vector GEPs // because scalar GEPs result in better code. 
- ScalarParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) { - Entry[Part].resize(VF); for (unsigned Lane = 0; Lane < Lanes; ++Lane) { Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF); Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); Value *SclrGep = II.transform(Builder, GlobalIdx, PSE.getSE(), DL); SclrGep->setName("next.gep"); - Entry[Part][Lane] = SclrGep; + VectorLoopValueMap.setScalarValue(P, Part, Lane, SclrGep); } } - VectorLoopValueMap.initScalar(P, Entry); return; } } @@ -4713,7 +4741,6 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) { // is vector-typed. Thus, to keep the representation compact, we only use // vector-typed operands for loop-varying values. auto *GEP = cast<GetElementPtrInst>(&I); - VectorParts Entry(UF); if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) { // If we are vectorizing, but the GEP has only loop-invariant operands, @@ -4729,8 +4756,11 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) { // collectLoopScalars() and teach getVectorValue() to broadcast // the lane-zero scalar value. auto *Clone = Builder.Insert(GEP->clone()); - for (unsigned Part = 0; Part < UF; ++Part) - Entry[Part] = Builder.CreateVectorSplat(VF, Clone); + for (unsigned Part = 0; Part < UF; ++Part) { + Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); + VectorLoopValueMap.setVectorValue(&I, Part, EntryPart); + addMetadata(EntryPart, GEP); + } } else { // If the GEP has at least one loop-varying operand, we are sure to // produce a vector of pointers. But if we are only unrolling, we want @@ -4743,9 +4773,10 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) { // The pointer operand of the new GEP. If it's loop-invariant, we // won't broadcast it. - auto *Ptr = OrigLoop->isLoopInvariant(GEP->getPointerOperand()) - ? GEP->getPointerOperand() - : getVectorValue(GEP->getPointerOperand())[Part]; + auto *Ptr = + OrigLoop->isLoopInvariant(GEP->getPointerOperand()) + ? GEP->getPointerOperand() + : getOrCreateVectorValue(GEP->getPointerOperand(), Part); // Collect all the indices for the new GEP. If any index is // loop-invariant, we won't broadcast it. @@ -4754,7 +4785,7 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) { if (OrigLoop->isLoopInvariant(U.get())) Indices.push_back(U.get()); else - Indices.push_back(getVectorValue(U.get())[Part]); + Indices.push_back(getOrCreateVectorValue(U.get(), Part)); } // Create the new GEP. Note that this GEP may be a scalar if VF == 1, @@ -4764,12 +4795,11 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) { : Builder.CreateGEP(Ptr, Indices); assert((VF == 1 || NewGEP->getType()->isVectorTy()) && "NewGEP is not a pointer vector"); - Entry[Part] = NewGEP; + VectorLoopValueMap.setVectorValue(&I, Part, NewGEP); + addMetadata(NewGEP, GEP); } } - VectorLoopValueMap.initVector(&I, Entry); - addMetadata(Entry, GEP); break; } case Instruction::UDiv: @@ -4800,22 +4830,20 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) { // Just widen binops. auto *BinOp = cast<BinaryOperator>(&I); setDebugLocFromInst(Builder, BinOp); - const VectorParts &A = getVectorValue(BinOp->getOperand(0)); - const VectorParts &B = getVectorValue(BinOp->getOperand(1)); - // Use this vector value for all users of the original instruction. 
- VectorParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) { - Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]); + Value *A = getOrCreateVectorValue(BinOp->getOperand(0), Part); + Value *B = getOrCreateVectorValue(BinOp->getOperand(1), Part); + Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B); if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V)) VecOp->copyIRFlags(BinOp); - Entry[Part] = V; + // Use this vector value for all users of the original instruction. + VectorLoopValueMap.setVectorValue(&I, Part, V); + addMetadata(V, BinOp); } - VectorLoopValueMap.initVector(&I, Entry); - addMetadata(Entry, BinOp); break; } case Instruction::Select: { @@ -4831,20 +4859,19 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) { // loop. This means that we can't just use the original 'cond' value. // We have to take the 'vectorized' value and pick the first lane. // Instcombine will make this a no-op. - const VectorParts &Cond = getVectorValue(I.getOperand(0)); - const VectorParts &Op0 = getVectorValue(I.getOperand(1)); - const VectorParts &Op1 = getVectorValue(I.getOperand(2)); - auto *ScalarCond = getScalarValue(I.getOperand(0), 0, 0); + auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), 0, 0); - VectorParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) { - Entry[Part] = Builder.CreateSelect( - InvariantCond ? ScalarCond : Cond[Part], Op0[Part], Op1[Part]); + Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part); + Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part); + Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part); + Value *Sel = + Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1); + VectorLoopValueMap.setVectorValue(&I, Part, Sel); + addMetadata(Sel, &I); } - VectorLoopValueMap.initVector(&I, Entry); - addMetadata(Entry, &I); break; } @@ -4854,22 +4881,20 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) { bool FCmp = (I.getOpcode() == Instruction::FCmp); auto *Cmp = dyn_cast<CmpInst>(&I); setDebugLocFromInst(Builder, Cmp); - const VectorParts &A = getVectorValue(Cmp->getOperand(0)); - const VectorParts &B = getVectorValue(Cmp->getOperand(1)); - VectorParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) { + Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part); + Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part); Value *C = nullptr; if (FCmp) { - C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]); + C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); cast<FCmpInst>(C)->copyFastMathFlags(Cmp); } else { - C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]); + C = Builder.CreateICmp(Cmp->getPredicate(), A, B); } - Entry[Part] = C; + VectorLoopValueMap.setVectorValue(&I, Part, C); + addMetadata(C, &I); } - VectorLoopValueMap.initVector(&I, Entry); - addMetadata(Entry, &I); break; } @@ -4906,12 +4931,12 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) { Type *DestTy = (VF == 1) ? 
CI->getType() : VectorType::get(CI->getType(), VF); - const VectorParts &A = getVectorValue(CI->getOperand(0)); - VectorParts Entry(UF); - for (unsigned Part = 0; Part < UF; ++Part) - Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy); - VectorLoopValueMap.initVector(&I, Entry); - addMetadata(Entry, &I); + for (unsigned Part = 0; Part < UF; ++Part) { + Value *A = getOrCreateVectorValue(CI->getOperand(0), Part); + Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); + VectorLoopValueMap.setVectorValue(&I, Part, Cast); + addMetadata(Cast, &I); + } break; } @@ -4949,17 +4974,14 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) { break; } - VectorParts Entry(UF); for (unsigned Part = 0; Part < UF; ++Part) { SmallVector<Value *, 4> Args; for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { Value *Arg = CI->getArgOperand(i); // Some intrinsics have a scalar argument - don't replace it with a // vector. - if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) { - const VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i)); - Arg = VectorArg[Part]; - } + if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) + Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part); Args.push_back(Arg); } @@ -4992,11 +5014,10 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) { if (isa<FPMathOperator>(V)) V->copyFastMathFlags(CI); - Entry[Part] = V; + VectorLoopValueMap.setVectorValue(&I, Part, V); + addMetadata(V, &I); } - VectorLoopValueMap.initVector(&I, Entry); - addMetadata(Entry, &I); break; } @@ -5363,7 +5384,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { continue; } - if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop, DT)) { + if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop, + SinkAfter, DT)) { FirstOrderRecurrences.insert(Phi); continue; } @@ -7636,6 +7658,15 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV) { // 2. Copy and widen instructions from the old loop into the new loop. + // Move instructions to handle first-order recurrences. + DenseMap<Instruction *, Instruction *> SinkAfter = Legal->getSinkAfter(); + for (auto &Entry : SinkAfter) { + Entry.first->removeFromParent(); + Entry.first->insertAfter(Entry.second); + DEBUG(dbgs() << "Sinking" << *Entry.first << " after" << *Entry.second + << " to vectorize a 1st order recurrence.\n"); + } + // Collect instructions from the original loop that will become trivially dead // in the vectorized loop. We don't need to vectorize these instructions. For // example, original induction update instructions can become dead because we @@ -7787,8 +7818,25 @@ bool LoopVectorizePass::processLoop(Loop *L) { return false; } - // Check the loop for a trip count threshold: - // do not vectorize loops with a tiny trip count. + PredicatedScalarEvolution PSE(*SE, *L); + + // Check if it is legal to vectorize the loop. + LoopVectorizationRequirements Requirements(*ORE); + LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, GetLAA, LI, ORE, + &Requirements, &Hints); + if (!LVL.canVectorize()) { + DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); + emitMissedWarning(F, L, Hints, ORE); + return false; + } + + // Check the function attributes to find out if this function should be + // optimized for size. 
+ bool OptForSize = + Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize(); + + // Check the loop for a trip count threshold: vectorize loops with a tiny trip + // count by optimizing for size, to minimize overheads. unsigned ExpectedTC = SE->getSmallConstantMaxTripCount(L); bool HasExpectedTC = (ExpectedTC > 0); @@ -7802,36 +7850,19 @@ bool LoopVectorizePass::processLoop(Loop *L) { if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) { DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " - << "This loop is not worth vectorizing."); + << "This loop is worth vectorizing only if no scalar " + << "iteration overheads are incurred."); if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); else { DEBUG(dbgs() << "\n"); - ORE->emit(createMissedAnalysis(Hints.vectorizeAnalysisPassName(), - "NotBeneficial", L) - << "vectorization is not beneficial " - "and is not explicitly forced"); - return false; + // Loops with a very small trip count are considered for vectorization + // under OptForSize, thereby making sure the cost of their loop body is + // dominant, free of runtime guards and scalar iteration overheads. + OptForSize = true; } } - PredicatedScalarEvolution PSE(*SE, *L); - - // Check if it is legal to vectorize the loop. - LoopVectorizationRequirements Requirements(*ORE); - LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, GetLAA, LI, ORE, - &Requirements, &Hints); - if (!LVL.canVectorize()) { - DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); - emitMissedWarning(F, L, Hints, ORE); - return false; - } - - // Check the function attributes to find out if this function should be - // optimized for size. - bool OptForSize = - Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize(); - // Check the function attributes to see if implicit floats are allowed. // FIXME: This check doesn't seem possibly correct -- what if the loop is // an integer loop and the vector instructions selected are purely integer diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index b267230d3185..b494526369d6 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -173,6 +173,11 @@ static unsigned getAltOpcode(unsigned Op) { } } +/// true if the \p Value is odd, false otherwise. +static bool isOdd(unsigned Value) { + return Value & 1; +} + ///\returns bool representing if Opcode \p Op can be part /// of an alternate sequence which can later be merged as /// a ShuffleVector instruction. @@ -190,7 +195,7 @@ static unsigned isAltInst(ArrayRef<Value *> VL) { unsigned AltOpcode = getAltOpcode(Opcode); for (int i = 1, e = VL.size(); i < e; i++) { Instruction *I = dyn_cast<Instruction>(VL[i]); - if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode)) + if (!I || I->getOpcode() != (isOdd(i) ? AltOpcode : Opcode)) return 0; } return Instruction::ShuffleVector; @@ -504,7 +509,7 @@ private: Last->NeedToGather = !Vectorized; if (Vectorized) { for (int i = 0, e = VL.size(); i != e; ++i) { - assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!"); + assert(!getTreeEntry(VL[i]) && "Scalar already in tree!"); ScalarToTreeEntry[VL[i]] = idx; } } else { @@ -521,6 +526,20 @@ private: /// Holds all of the tree entries. 
std::vector<TreeEntry> VectorizableTree; + TreeEntry *getTreeEntry(Value *V) { + auto I = ScalarToTreeEntry.find(V); + if (I != ScalarToTreeEntry.end()) + return &VectorizableTree[I->second]; + return nullptr; + } + + const TreeEntry *getTreeEntry(Value *V) const { + auto I = ScalarToTreeEntry.find(V); + if (I != ScalarToTreeEntry.end()) + return &VectorizableTree[I->second]; + return nullptr; + } + /// Maps a specific scalar to its tree entry. SmallDenseMap<Value*, int> ScalarToTreeEntry; @@ -1048,14 +1067,14 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, for (TreeEntry &EIdx : VectorizableTree) { TreeEntry *Entry = &EIdx; + // No need to handle users of gathered values. + if (Entry->NeedToGather) + continue; + // For each lane: for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; - // No need to handle users of gathered values. - if (Entry->NeedToGather) - continue; - // Check if the scalar is externally used as an extra arg. auto ExtI = ExternallyUsedValues.find(Scalar); if (ExtI != ExternallyUsedValues.end()) { @@ -1072,9 +1091,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, continue; // Skip in-tree scalars that become vectors - if (ScalarToTreeEntry.count(U)) { - int Idx = ScalarToTreeEntry[U]; - TreeEntry *UseEntry = &VectorizableTree[Idx]; + if (TreeEntry *UseEntry = getTreeEntry(U)) { Value *UseScalar = UseEntry->Scalars[0]; // Some in-tree scalars will remain as scalar in vectorized // instructions. If that is the case, the one in Lane 0 will @@ -1083,7 +1100,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U << ".\n"); - assert(!VectorizableTree[Idx].NeedToGather && "Bad state"); + assert(!UseEntry->NeedToGather && "Bad state"); continue; } } @@ -1156,9 +1173,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } // Check if this is a duplicate of another entry. - if (ScalarToTreeEntry.count(VL[0])) { - int Idx = ScalarToTreeEntry[VL[0]]; - TreeEntry *E = &VectorizableTree[Idx]; + if (TreeEntry *E = getTreeEntry(VL[0])) { for (unsigned i = 0, e = VL.size(); i != e; ++i) { DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n"); if (E->Scalars[i] != VL[i]) { @@ -1997,7 +2012,7 @@ int BoUpSLP::getSpillCost() { // Update LiveValues. LiveValues.erase(PrevInst); for (auto &J : PrevInst->operands()) { - if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J)) + if (isa<Instruction>(&*J) && getTreeEntry(&*J)) LiveValues.insert(cast<Instruction>(&*J)); } @@ -2393,9 +2408,7 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) { CSEBlocks.insert(Insrt->getParent()); // Add to our 'need-to-extract' list. - if (ScalarToTreeEntry.count(VL[i])) { - int Idx = ScalarToTreeEntry[VL[i]]; - TreeEntry *E = &VectorizableTree[Idx]; + if (TreeEntry *E = getTreeEntry(VL[i])) { // Find which lane we need to extract. 
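The getTreeEntry helper introduced above folds the old count()-then-operator[] pair into a single hash lookup that returns nullptr for scalars outside the tree, which is what allows the if (TreeEntry *E = getTreeEntry(...)) call sites in these hunks. A generic sketch of that pattern, with hypothetical names, assuming the SmallDenseMap index and std::vector side table used in this file:

#include "llvm/ADT/DenseMap.h"
#include <vector>

// One find() instead of count() + operator[]; nullptr means "not in the tree".
template <typename KeyT, typename EntryT>
static EntryT *lookupEntry(llvm::SmallDenseMap<KeyT, int> &Index,
                           std::vector<EntryT> &Table, KeyT Key) {
  auto It = Index.find(Key);
  return It == Index.end() ? nullptr : &Table[It->second];
}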
int FoundLane = -1; for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) { @@ -2415,11 +2428,7 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) { } Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const { - SmallDenseMap<Value*, int>::const_iterator Entry - = ScalarToTreeEntry.find(VL[0]); - if (Entry != ScalarToTreeEntry.end()) { - int Idx = Entry->second; - const TreeEntry *En = &VectorizableTree[Idx]; + if (const TreeEntry *En = getTreeEntry(VL[0])) { if (En->isSame(VL) && En->VectorizedValue) return En->VectorizedValue; } @@ -2427,12 +2436,9 @@ Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const { } Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { - if (ScalarToTreeEntry.count(VL[0])) { - int Idx = ScalarToTreeEntry[VL[0]]; - TreeEntry *E = &VectorizableTree[Idx]; + if (TreeEntry *E = getTreeEntry(VL[0])) if (E->isSame(VL)) return vectorizeTree(E); - } Type *ScalarTy = VL[0]->getType(); if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) @@ -2667,9 +2673,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { // The pointer operand uses an in-tree scalar so we add the new BitCast to // ExternalUses list to make sure that an extract will be generated in the // future. - if (ScalarToTreeEntry.count(LI->getPointerOperand())) - ExternalUses.push_back( - ExternalUser(LI->getPointerOperand(), cast<User>(VecPtr), 0)); + Value *PO = LI->getPointerOperand(); + if (getTreeEntry(PO)) + ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0)); unsigned Alignment = LI->getAlignment(); LI = Builder.CreateLoad(VecPtr); @@ -2700,9 +2706,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { // The pointer operand uses an in-tree scalar so we add the new BitCast to // ExternalUses list to make sure that an extract will be generated in the // future. - if (ScalarToTreeEntry.count(SI->getPointerOperand())) - ExternalUses.push_back( - ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0)); + Value *PO = SI->getPointerOperand(); + if (getTreeEntry(PO)) + ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0)); if (!Alignment) { Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType()); @@ -2783,7 +2789,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { // The scalar argument uses an in-tree scalar so we add the new vectorized // call to ExternalUses list to make sure that an extract will be // generated in the future. - if (ScalarArg && ScalarToTreeEntry.count(ScalarArg)) + if (ScalarArg && getTreeEntry(ScalarArg)) ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0)); E->VectorizedValue = V; @@ -2819,7 +2825,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { unsigned e = E->Scalars.size(); SmallVector<Constant *, 8> Mask(e); for (unsigned i = 0; i < e; ++i) { - if (i & 1) { + if (isOdd(i)) { Mask[i] = Builder.getInt32(e + i); OddScalars.push_back(E->Scalars[i]); } else { @@ -2897,10 +2903,8 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { // has multiple uses of the same value. 
if (User && !is_contained(Scalar->users(), User)) continue; - assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar"); - - int Idx = ScalarToTreeEntry[Scalar]; - TreeEntry *E = &VectorizableTree[Idx]; + TreeEntry *E = getTreeEntry(Scalar); + assert(E && "Invalid scalar"); assert(!E->NeedToGather && "Extracting from a gather list"); Value *Vec = E->VectorizedValue; @@ -2986,7 +2990,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { for (User *U : Scalar->users()) { DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); - assert((ScalarToTreeEntry.count(U) || + assert((getTreeEntry(U) || // It is legal to replace users in the ignorelist by undef. is_contained(UserIgnoreList, U)) && "Replacing out-of-tree value with undef"); @@ -3449,7 +3453,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { I = I->getNextNode()) { ScheduleData *SD = BS->getScheduleData(I); assert( - SD->isPartOfBundle() == (ScalarToTreeEntry.count(SD->Inst) != 0) && + SD->isPartOfBundle() == (getTreeEntry(SD->Inst) != nullptr) && "scheduler and vectorizer have different opinion on what is a bundle"); SD->FirstInBundle->SchedulingPriority = Idx++; if (SD->isSchedulingEntity()) { diff --git a/lib/Transforms/Vectorize/Vectorize.cpp b/lib/Transforms/Vectorize/Vectorize.cpp index a21928317888..fb2f509dcbaa 100644 --- a/lib/Transforms/Vectorize/Vectorize.cpp +++ b/lib/Transforms/Vectorize/Vectorize.cpp @@ -26,7 +26,6 @@ using namespace llvm; /// initializeVectorizationPasses - Initialize all passes linked into the /// Vectorization library. void llvm::initializeVectorization(PassRegistry &Registry) { - initializeBBVectorizePass(Registry); initializeLoopVectorizePass(Registry); initializeSLPVectorizerPass(Registry); initializeLoadStoreVectorizerPass(Registry); @@ -36,8 +35,8 @@ void LLVMInitializeVectorization(LLVMPassRegistryRef R) { initializeVectorization(*unwrap(R)); } +// DEPRECATED: Remove after the LLVM 5 release. void LLVMAddBBVectorizePass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createBBVectorizePass()); } void LLVMAddLoopVectorizePass(LLVMPassManagerRef PM) { |