diff options
Diffstat (limited to 'llvm/lib')
625 files changed, 23355 insertions, 10841 deletions
diff --git a/llvm/lib/Analysis/AliasAnalysis.cpp b/llvm/lib/Analysis/AliasAnalysis.cpp index d030f74481cf..49199060786c 100644 --- a/llvm/lib/Analysis/AliasAnalysis.cpp +++ b/llvm/lib/Analysis/AliasAnalysis.cpp @@ -249,11 +249,11 @@ ModRefInfo AAResults::getModRefInfo(const CallBase *Call, bool IsMustAlias = true; ModRefInfo AllArgsMask = ModRefInfo::NoModRef; if (doesAccessArgPointees(MRB)) { - for (auto AI = Call->arg_begin(), AE = Call->arg_end(); AI != AE; ++AI) { - const Value *Arg = *AI; + for (const auto &I : llvm::enumerate(Call->args())) { + const Value *Arg = I.value(); if (!Arg->getType()->isPointerTy()) continue; - unsigned ArgIdx = std::distance(Call->arg_begin(), AI); + unsigned ArgIdx = I.index(); MemoryLocation ArgLoc = MemoryLocation::getForArgument(Call, ArgIdx, TLI); AliasResult ArgAlias = alias(ArgLoc, Loc, AAQI); @@ -696,14 +696,16 @@ ModRefInfo AAResults::getModRefInfo(const Instruction *I, case Instruction::AtomicRMW: return getModRefInfo((const AtomicRMWInst *)I, Loc, AAQIP); case Instruction::Call: - return getModRefInfo((const CallInst *)I, Loc, AAQIP); + case Instruction::CallBr: case Instruction::Invoke: - return getModRefInfo((const InvokeInst *)I, Loc, AAQIP); + return getModRefInfo((const CallBase *)I, Loc, AAQIP); case Instruction::CatchPad: return getModRefInfo((const CatchPadInst *)I, Loc, AAQIP); case Instruction::CatchRet: return getModRefInfo((const CatchReturnInst *)I, Loc, AAQIP); default: + assert(!I->mayReadOrWriteMemory() && + "Unhandled memory access instruction!"); return ModRefInfo::NoModRef; } } diff --git a/llvm/lib/Analysis/Analysis.cpp b/llvm/lib/Analysis/Analysis.cpp index db5167061509..177f38af13d8 100644 --- a/llvm/lib/Analysis/Analysis.cpp +++ b/llvm/lib/Analysis/Analysis.cpp @@ -35,6 +35,7 @@ void llvm::initializeAnalysis(PassRegistry &Registry) { initializeCFGOnlyPrinterLegacyPassPass(Registry); initializeCFLAndersAAWrapperPassPass(Registry); initializeCFLSteensAAWrapperPassPass(Registry); + 
initializeCycleInfoWrapperPassPass(Registry); initializeDependenceAnalysisWrapperPassPass(Registry); initializeDelinearizationPass(Registry); initializeDemandedBitsWrapperPassPass(Registry); diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp index 88b0f37b1d48..5f1bf2001d47 100644 --- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp +++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp @@ -1699,6 +1699,7 @@ AliasResult BasicAAResult::aliasCheckRecursive( return Result; } else if (const GEPOperator *GV2 = dyn_cast<GEPOperator>(V2)) { AliasResult Result = aliasGEP(GV2, V2Size, V1, V1Size, O2, O1, AAQI); + Result.swap(); if (Result != AliasResult::MayAlias) return Result; } @@ -1709,6 +1710,7 @@ AliasResult BasicAAResult::aliasCheckRecursive( return Result; } else if (const PHINode *PN = dyn_cast<PHINode>(V2)) { AliasResult Result = aliasPHI(PN, V2Size, V1, V1Size, AAQI); + Result.swap(); if (Result != AliasResult::MayAlias) return Result; } @@ -1719,6 +1721,7 @@ AliasResult BasicAAResult::aliasCheckRecursive( return Result; } else if (const SelectInst *S2 = dyn_cast<SelectInst>(V2)) { AliasResult Result = aliasSelect(S2, V2Size, V1, V1Size, AAQI); + Result.swap(); if (Result != AliasResult::MayAlias) return Result; } diff --git a/llvm/lib/Analysis/CaptureTracking.cpp b/llvm/lib/Analysis/CaptureTracking.cpp index 8955658cb9e7..9b45f455be08 100644 --- a/llvm/lib/Analysis/CaptureTracking.cpp +++ b/llvm/lib/Analysis/CaptureTracking.cpp @@ -346,13 +346,16 @@ void llvm::PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker, if (Tracker->captured(U)) return; - // Not captured if only passed via 'nocapture' arguments. Note that - // calling a function pointer does not in itself cause the pointer to + // Calling a function pointer does not in itself cause the pointer to // be captured. This is a subtle point considering that (for example) // the callee might return its own address. 
It is analogous to saying // that loading a value from a pointer does not cause the pointer to be // captured, even though the loaded value might be the pointer itself // (think of self-referential objects). + if (Call->isCallee(U)) + break; + + // Not captured if only passed via 'nocapture' arguments. if (Call->isDataOperand(U) && !Call->doesNotCapture(Call->getDataOperandNo(U))) { // The parameter is not marked 'nocapture' - captured. diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index 3ed3b8902343..922b38e92785 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -352,6 +352,9 @@ Constant *llvm::ConstantFoldLoadThroughBitcast(Constant *C, Type *DestTy, const DataLayout &DL) { do { Type *SrcTy = C->getType(); + if (SrcTy == DestTy) + return C; + TypeSize DestSize = DL.getTypeSizeInBits(DestTy); TypeSize SrcSize = DL.getTypeSizeInBits(SrcTy); if (!TypeSize::isKnownGE(SrcSize, DestSize)) @@ -705,7 +708,8 @@ Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C, Type *Ty, // is all undef or zero, we know what it loads. if (auto *GV = dyn_cast<GlobalVariable>(getUnderlyingObject(C))) { if (GV->isConstant() && GV->hasDefinitiveInitializer()) { - if (GV->getInitializer()->isNullValue()) + if (GV->getInitializer()->isNullValue() && !Ty->isX86_MMXTy() && + !Ty->isX86_AMXTy()) return Constant::getNullValue(Ty); if (isa<UndefValue>(GV->getInitializer())) return UndefValue::get(Ty); @@ -881,7 +885,7 @@ Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP, InnermostGEP = GEP; InBounds &= GEP->isInBounds(); - SmallVector<Value *, 4> NestedOps(GEP->op_begin() + 1, GEP->op_end()); + SmallVector<Value *, 4> NestedOps(llvm::drop_begin(GEP->operands())); // Do not try the incorporate the sub-GEP if some index is not a number. 
bool AllConstantInt = true; @@ -1774,15 +1778,8 @@ static bool mayFoldConstrained(ConstrainedFPIntrinsic *CI, // If the operation does not change exception status flags, it is safe // to fold. - if (St == APFloat::opStatus::opOK) { - // When FP exceptions are not ignored, intrinsic call will not be - // eliminated, because it is considered as having side effect. But we - // know that its evaluation does not raise exceptions, so side effect - // is absent. To allow removing the call, mark it as not accessing memory. - if (EB && *EB != fp::ExceptionBehavior::ebIgnore) - CI->addFnAttr(Attribute::ReadNone); + if (St == APFloat::opStatus::opOK) return true; - } // If evaluation raised FP exception, the result can depend on rounding // mode. If the latter is unknown, folding is not possible. @@ -2960,10 +2957,6 @@ static Constant *ConstantFoldFixedVectorCall( if (auto *Op = dyn_cast<ConstantInt>(Operands[0])) { unsigned Lanes = FVTy->getNumElements(); uint64_t Limit = Op->getZExtValue(); - // vctp64 are currently modelled as returning a v4i1, not a v2i1. Make - // sure we get the limit right in that case and set all relevant lanes. - if (IntrinsicID == Intrinsic::arm_mve_vctp64) - Limit *= 2; SmallVector<Constant *, 16> NCs; for (unsigned i = 0; i < Lanes; i++) { diff --git a/llvm/lib/Analysis/CycleAnalysis.cpp b/llvm/lib/Analysis/CycleAnalysis.cpp new file mode 100644 index 000000000000..09c7ee67e05c --- /dev/null +++ b/llvm/lib/Analysis/CycleAnalysis.cpp @@ -0,0 +1,77 @@ +//===- CycleAnalysis.cpp - Compute CycleInfo for LLVM IR ------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/CycleAnalysis.h" +#include "llvm/ADT/GenericCycleImpl.h" +#include "llvm/IR/CFG.h" +#include "llvm/InitializePasses.h" + +using namespace llvm; + +template class llvm::GenericCycleInfo<SSAContext>; +template class llvm::GenericCycle<SSAContext>; + +CycleInfo CycleAnalysis::run(Function &F, FunctionAnalysisManager &) { + CycleInfo CI; + CI.compute(F); + return CI; +} + +AnalysisKey CycleAnalysis::Key; + +CycleInfoPrinterPass::CycleInfoPrinterPass(raw_ostream &OS) : OS(OS) {} + +PreservedAnalyses CycleInfoPrinterPass::run(Function &F, + FunctionAnalysisManager &AM) { + OS << "CycleInfo for function: " << F.getName() << "\n"; + AM.getResult<CycleAnalysis>(F).print(OS); + + return PreservedAnalyses::all(); +} + +//===----------------------------------------------------------------------===// +// CycleInfoWrapperPass Implementation +//===----------------------------------------------------------------------===// +// +// The implementation details of the wrapper pass that holds a CycleInfo +// suitable for use with the legacy pass manager. 
+// +//===----------------------------------------------------------------------===// + +char CycleInfoWrapperPass::ID = 0; + +CycleInfoWrapperPass::CycleInfoWrapperPass() : FunctionPass(ID) { + initializeCycleInfoWrapperPassPass(*PassRegistry::getPassRegistry()); +} + +INITIALIZE_PASS_BEGIN(CycleInfoWrapperPass, "cycles", "Cycle Info Analysis", + true, true) +INITIALIZE_PASS_END(CycleInfoWrapperPass, "cycles", "Cycle Info Analysis", true, + true) + +void CycleInfoWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); +} + +bool CycleInfoWrapperPass::runOnFunction(Function &Func) { + CI.clear(); + + F = &Func; + CI.compute(Func); + return false; +} + +void CycleInfoWrapperPass::print(raw_ostream &OS, const Module *) const { + OS << "CycleInfo for function: " << F->getName() << "\n"; + CI.print(OS); +} + +void CycleInfoWrapperPass::releaseMemory() { + CI.clear(); + F = nullptr; +} diff --git a/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp b/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp index d87fa849d839..31b2dafa29b4 100644 --- a/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp +++ b/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp @@ -16,6 +16,8 @@ #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/InlineSizeEstimatorAnalysis.h" #include "llvm/Analysis/MLInlineAdvisor.h" +#include "llvm/Analysis/ModelUnderTrainingRunner.h" +#include "llvm/Analysis/NoInferenceModelRunner.h" #include "llvm/Analysis/Utils/TFUtils.h" #include "llvm/IR/LLVMContext.h" #include "llvm/Support/CommandLine.h" @@ -94,7 +96,6 @@ struct InlineEvent { /// Because this is a protobuf, we cannot just stream the events as they come. /// Internally, TrainingLogger stores data in column-major format, because that /// lines up with how TF SequenceExample represents it. 
-class ModelUnderTrainingRunner; class TrainingLogger final { public: TrainingLogger(StringRef LogFileName, const ModelUnderTrainingRunner *MUTR); @@ -261,65 +262,21 @@ private: const int64_t Mandatory; }; -/// A pseudo model runner. We use it to store feature values when collecting -/// logs for the default policy, but never ask it to 'run'. -class NoInferenceModelRunner : public MLModelRunner { -public: - NoInferenceModelRunner(LLVMContext &Ctx) - : MLModelRunner(Ctx), Features(NumberOfFeatures) {} - void setFeature(FeatureIndex Index, int64_t Value) override { - Features[static_cast<int>(Index)] = Value; - } - - int64_t getFeature(int Index) const override { return Features[Index]; } - bool run() override { - llvm_unreachable("We shouldn't call run on this model runner."); - } - -private: - InlineFeatures Features; -}; - -/// ModelUnderTrainingRunner - training mode implementation. It uses TF C APIs -/// to dynamically load and evaluate a TF SavedModel -/// (https://www.tensorflow.org/guide/saved_model). Runtime performance is -/// sacrificed for ease of use while training. -class ModelUnderTrainingRunner final : public MLModelRunner { -public: - ModelUnderTrainingRunner(LLVMContext &Ctx, const std::string &ModelPath); - - bool run() override; +static const std::vector<TensorSpec> TrainingOnlyFeatures{ + TensorSpec::createSpec<int64_t>(TFFeedPrefix + "inlining_default", {1}), + TensorSpec::createSpec<float>(TFFeedPrefix + "discount", {1}), + TensorSpec::createSpec<float>(TFFeedPrefix + "reward", {1}), + TensorSpec::createSpec<int32_t>(TFFeedPrefix + "step_type", {1})}; - // Disallows copy and assign. 
- ModelUnderTrainingRunner(const ModelUnderTrainingRunner &) = delete; - ModelUnderTrainingRunner & - operator=(const ModelUnderTrainingRunner &) = delete; - - void setFeature(FeatureIndex Index, int64_t Value) override; - int64_t getFeature(int Index) const override; - bool isValid() const { return !!Evaluator; } - - const std::vector<LoggedFeatureSpec> &outputLoggedFeatureSpecs() const { - return OutputSpecs; - } - - const Optional<TFModelEvaluator::EvaluationResult> & - lastEvaluationResult() const { - return LastEvaluationResult; - } - -private: - std::unique_ptr<TFModelEvaluator> Evaluator; - std::vector<LoggedFeatureSpec> OutputSpecs; - Optional<TFModelEvaluator::EvaluationResult> LastEvaluationResult; +static const std::vector<TensorSpec> getInputFeatures() { + std::vector<TensorSpec> InputSpecs; + for (size_t I = 0; I < NumberOfFeatures; ++I) + InputSpecs.push_back( + TensorSpec::createSpec<int64_t>(TFFeedPrefix + FeatureNameMap[I], {1})); + append_range(InputSpecs, TrainingOnlyFeatures); + return InputSpecs; +} - // The training framework needs some additional features. 
- const std::vector<TensorSpec> TrainingOnlyFeatures{ - TensorSpec::createSpec<int64_t>(TFFeedPrefix + "inlining_default", {1}), - TensorSpec::createSpec<float>(TFFeedPrefix + "discount", {1}), - TensorSpec::createSpec<float>(TFFeedPrefix + "reward", {1}), - TensorSpec::createSpec<int32_t>(TFFeedPrefix + "step_type", {1})}; -}; } // namespace TrainingLogger::TrainingLogger(StringRef LogFileName, @@ -353,7 +310,7 @@ void TrainingLogger::logInlineEvent(const InlineEvent &Event, const MLModelRunner &ModelRunner) { size_t CurrentFeature = 0; for (; CurrentFeature < NumberOfFeatures; ++CurrentFeature) { - int64_t F = ModelRunner.getFeature(CurrentFeature); + int64_t F = *ModelRunner.getTensor<int64_t>(CurrentFeature); L->logInt64Value(CurrentFeature, &F); } @@ -433,7 +390,9 @@ DevelopmentModeMLInlineAdvisor::getAdviceFromModel( return MLInlineAdvisor::getAdviceFromModel(CB, ORE); bool DefaultAdvice = GetDefaultAdvice(CB); - auto Recommendation = IsDoingInference ? ModelRunner->run() : DefaultAdvice; + auto Recommendation = + IsDoingInference ? 
static_cast<bool>(ModelRunner->evaluate<int64_t>()) + : DefaultAdvice; return std::make_unique<LoggingMLInlineAdvice>( /*Advisor=*/this, /*CB=*/CB, /*ORE=*/ORE, /*Recommendation=*/Recommendation, @@ -458,49 +417,6 @@ size_t DevelopmentModeMLInlineAdvisor::getTotalSizeEstimate() { return Ret; } -ModelUnderTrainingRunner::ModelUnderTrainingRunner(LLVMContext &Ctx, - const std::string &ModelPath) - : MLModelRunner(Ctx) { - std::vector<TensorSpec> InputSpecs; - for (size_t I = 0; I < NumberOfFeatures; ++I) - InputSpecs.push_back( - TensorSpec::createSpec<int64_t>(TFFeedPrefix + FeatureNameMap[I], {1})); - append_range(InputSpecs, TrainingOnlyFeatures); - if (auto MaybeOutSpecs = - loadOutputSpecs(Ctx, DecisionName, ModelPath, TFOutputSpecOverride)) - OutputSpecs = std::move(*MaybeOutSpecs); - else - return; - - Evaluator = std::make_unique<TFModelEvaluator>( - ModelPath, InputSpecs, [&](size_t I) { return OutputSpecs[I].Spec; }, - OutputSpecs.size()); - if (!Evaluator || !Evaluator->isValid()) { - Ctx.emitError("Failed to create inliner saved model evaluator"); - Evaluator.reset(); - return; - } -} - -bool ModelUnderTrainingRunner::run() { - LastEvaluationResult = Evaluator->evaluate(); - if (!LastEvaluationResult.hasValue()) { - Ctx.emitError("Error evaluating model."); - return false; - } - int64_t Decision = *LastEvaluationResult->getTensorValue<int64_t>(0); - return static_cast<bool>(Decision); -} - -int64_t ModelUnderTrainingRunner::getFeature(int Index) const { - return *Evaluator->getInput<int64_t>(Index); -} - -void ModelUnderTrainingRunner::setFeature(FeatureIndex Index, int64_t Value) { - size_t NumericIndex = static_cast<size_t>(Index); - *(Evaluator->getInput<int64_t>(NumericIndex)) = Value; -} - std::unique_ptr<InlineAdvisor> llvm::getDevelopmentModeAdvisor( Module &M, ModuleAnalysisManager &MAM, std::function<bool(CallBase &)> GetDefaultAdvice) { @@ -509,10 +425,13 @@ std::unique_ptr<InlineAdvisor> llvm::getDevelopmentModeAdvisor( ModelUnderTrainingRunner 
*MUTRPtr = nullptr; bool IsDoingInference = false; if (TFModelUnderTrainingPath.empty()) - Runner.reset(new NoInferenceModelRunner(Ctx)); + Runner.reset(new NoInferenceModelRunner(Ctx, getInputFeatures())); else { - auto MUTR = std::make_unique<ModelUnderTrainingRunner>( - Ctx, TFModelUnderTrainingPath); + std::unique_ptr<ModelUnderTrainingRunner> MUTR; + if (auto MaybeOutputSpecs = loadOutputSpecs( + Ctx, DecisionName, TFModelUnderTrainingPath, TFOutputSpecOverride)) + MUTR = std::make_unique<ModelUnderTrainingRunner>( + Ctx, TFModelUnderTrainingPath, getInputFeatures(), *MaybeOutputSpecs); if (!MUTR || !MUTR->isValid()) { Ctx.emitError("Could not load the policy model from the provided path"); return nullptr; diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp index cfe910df4e91..f5fa6748d053 100644 --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -933,7 +933,7 @@ bool RecurrenceDescriptor::isFirstOrderRecurrence( /// This function returns the identity element (or neutral element) for /// the operation K. Value *RecurrenceDescriptor::getRecurrenceIdentity(RecurKind K, Type *Tp, - FastMathFlags FMF) { + FastMathFlags FMF) const { switch (K) { case RecurKind::Xor: case RecurKind::Add: diff --git a/llvm/lib/Analysis/InlineAdvisor.cpp b/llvm/lib/Analysis/InlineAdvisor.cpp index 73d1eff1b968..140c88eb8b0d 100644 --- a/llvm/lib/Analysis/InlineAdvisor.cpp +++ b/llvm/lib/Analysis/InlineAdvisor.cpp @@ -40,6 +40,10 @@ static cl::opt<bool> " callsites processed by inliner but decided" " to be not inlined")); +static cl::opt<bool> EnableInlineDeferral("inline-deferral", cl::init(false), + cl::Hidden, + cl::desc("Enable deferred inlining")); + // An integer used to limit the cost of inline deferral. The default negative // number tells shouldBeDeferred to only take the secondary cost into account. 
static cl::opt<int> @@ -136,8 +140,9 @@ llvm::Optional<llvm::InlineCost> static getDefaultInlineAdvice( return getInlineCost(CB, Params, CalleeTTI, GetAssumptionCache, GetTLI, GetBFI, PSI, RemarksEnabled ? &ORE : nullptr); }; - return llvm::shouldInline(CB, GetInlineCost, ORE, - Params.EnableDeferral.getValueOr(false)); + return llvm::shouldInline( + CB, GetInlineCost, ORE, + Params.EnableDeferral.getValueOr(EnableInlineDeferral)); } std::unique_ptr<InlineAdvice> @@ -409,8 +414,6 @@ llvm::shouldInline(CallBase &CB, << "' in other contexts"; }); setInlineRemark(CB, "deferred"); - // IC does not bool() to false, so get an InlineCost that will. - // This will not be inspected to make an error message. return None; } diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 22d2ce11cc90..4831b22b1d46 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -2173,6 +2173,15 @@ static Value *SimplifyAndInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, } } + // ((X | Y) ^ X ) & ((X | Y) ^ Y) --> 0 + // ((X | Y) ^ Y ) & ((X | Y) ^ X) --> 0 + BinaryOperator *Or; + if (match(Op0, m_c_Xor(m_Value(X), + m_CombineAnd(m_BinOp(Or), + m_c_Or(m_Deferred(X), m_Value(Y))))) && + match(Op1, m_c_Xor(m_Specific(Or), m_Specific(Y)))) + return Constant::getNullValue(Op0->getType()); + return nullptr; } @@ -2198,6 +2207,18 @@ static Value *simplifyOrLogic(Value *X, Value *Y) { Value *A, *B; + // (A ^ B) | (A | B) --> A | B + // (A ^ B) | (B | A) --> B | A + if (match(X, m_Xor(m_Value(A), m_Value(B))) && + match(Y, m_c_Or(m_Specific(A), m_Specific(B)))) + return Y; + + // ~(A ^ B) | (A | B) --> -1 + // ~(A ^ B) | (B | A) --> -1 + if (match(X, m_Not(m_Xor(m_Value(A), m_Value(B)))) && + match(Y, m_c_Or(m_Specific(A), m_Specific(B)))) + return ConstantInt::getAllOnesValue(Ty); + // (A & ~B) | (A ^ B) --> A ^ B // (~B & A) | (A ^ B) --> A ^ B // (A & ~B) | (B ^ A) --> B ^ A @@ -2214,18 +2235,33 @@ 
static Value *simplifyOrLogic(Value *X, Value *Y) { match(Y, m_c_And(m_Specific(A), m_Specific(B)))) return X; - // (A ^ B) | (A | B) --> A | B - // (A ^ B) | (B | A) --> B | A - if (match(X, m_Xor(m_Value(A), m_Value(B))) && - match(Y, m_c_Or(m_Specific(A), m_Specific(B)))) - return Y; - - // ~(A ^ B) | (A | B) --> -1 - // ~(A ^ B) | (B | A) --> -1 - if (match(X, m_Not(m_Xor(m_Value(A), m_Value(B)))) && - match(Y, m_c_Or(m_Specific(A), m_Specific(B)))) + // (~A | B) | (A ^ B) --> -1 + // (~A | B) | (B ^ A) --> -1 + // (B | ~A) | (A ^ B) --> -1 + // (B | ~A) | (B ^ A) --> -1 + if (match(X, m_c_Or(m_Not(m_Value(A)), m_Value(B))) && + match(Y, m_c_Xor(m_Specific(A), m_Specific(B)))) return ConstantInt::getAllOnesValue(Ty); + // (~A & B) | ~(A | B) --> ~A + // (~A & B) | ~(B | A) --> ~A + // (B & ~A) | ~(A | B) --> ~A + // (B & ~A) | ~(B | A) --> ~A + Value *NotA; + if (match(X, + m_c_And(m_CombineAnd(m_Value(NotA), m_NotForbidUndef(m_Value(A))), + m_Value(B))) && + match(Y, m_Not(m_c_Or(m_Specific(A), m_Specific(B))))) + return NotA; + + // ~(A ^ B) | (A & B) --> ~(A & B) + // ~(A ^ B) | (B & A) --> ~(A & B) + Value *NotAB; + if (match(X, m_CombineAnd(m_NotForbidUndef(m_Xor(m_Value(A), m_Value(B))), + m_Value(NotAB))) && + match(Y, m_c_And(m_Specific(A), m_Specific(B)))) + return NotAB; + return nullptr; } @@ -2259,27 +2295,6 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, if (Value *V = simplifyLogicOfAddSub(Op0, Op1, Instruction::Or)) return V; - Value *A, *B, *NotA; - - // (~A & B) | ~(A | B) --> ~A - // (~A & B) | ~(B | A) --> ~A - // (B & ~A) | ~(A | B) --> ~A - // (B & ~A) | ~(B | A) --> ~A - if (match(Op0, m_c_And(m_CombineAnd(m_Value(NotA), m_Not(m_Value(A))), - m_Value(B))) && - match(Op1, m_Not(m_c_Or(m_Specific(A), m_Specific(B))))) - return NotA; - - // Commute the 'or' operands. 
- // ~(A | B) | (~A & B) --> ~A - // ~(B | A) | (~A & B) --> ~A - // ~(A | B) | (B & ~A) --> ~A - // ~(B | A) | (B & ~A) --> ~A - if (match(Op1, m_c_And(m_CombineAnd(m_Value(NotA), m_Not(m_Value(A))), - m_Value(B))) && - match(Op0, m_Not(m_c_Or(m_Specific(A), m_Specific(B))))) - return NotA; - // Rotated -1 is still -1: // (-1 << X) | (-1 >> (C - X)) --> -1 // (-1 >> X) | (-1 << (C - X)) --> -1 @@ -2335,6 +2350,7 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, } // (A & C1)|(B & C2) + Value *A, *B; const APInt *C1, *C2; if (match(Op0, m_And(m_Value(A), m_APInt(C1))) && match(Op1, m_And(m_Value(B), m_APInt(C2)))) { @@ -2696,9 +2712,17 @@ static Value *simplifyICmpOfBools(CmpInst::Predicate Pred, Value *LHS, if (!OpTy->isIntOrIntVectorTy(1)) return nullptr; - // A boolean compared to true/false can be simplified in 14 out of the 20 - // (10 predicates * 2 constants) possible combinations. Cases not handled here - // require a 'not' of the LHS, so those must be transformed in InstCombine. + // A boolean compared to true/false can be reduced in 14 out of the 20 + // (10 predicates * 2 constants) possible combinations. The other + // 6 cases require a 'not' of the LHS. 
+ + auto ExtractNotLHS = [](Value *V) -> Value * { + Value *X; + if (match(V, m_Not(m_Value(X)))) + return X; + return nullptr; + }; + if (match(RHS, m_Zero())) { switch (Pred) { case CmpInst::ICMP_NE: // X != 0 -> X @@ -2706,6 +2730,13 @@ static Value *simplifyICmpOfBools(CmpInst::Predicate Pred, Value *LHS, case CmpInst::ICMP_SLT: // X <s 0 -> X return LHS; + case CmpInst::ICMP_EQ: // not(X) == 0 -> X != 0 -> X + case CmpInst::ICMP_ULE: // not(X) <=u 0 -> X >u 0 -> X + case CmpInst::ICMP_SGE: // not(X) >=s 0 -> X <s 0 -> X + if (Value *X = ExtractNotLHS(LHS)) + return X; + break; + case CmpInst::ICMP_ULT: // X <u 0 -> false case CmpInst::ICMP_SGT: // X >s 0 -> false return getFalse(ITy); @@ -2723,6 +2754,13 @@ static Value *simplifyICmpOfBools(CmpInst::Predicate Pred, Value *LHS, case CmpInst::ICMP_SLE: // X <=s -1 -> X return LHS; + case CmpInst::ICMP_NE: // not(X) != 1 -> X == 1 -> X + case CmpInst::ICMP_ULT: // not(X) <=u 1 -> X >=u 1 -> X + case CmpInst::ICMP_SGT: // not(X) >s 1 -> X <=s -1 -> X + if (Value *X = ExtractNotLHS(LHS)) + return X; + break; + case CmpInst::ICMP_UGT: // X >u 1 -> false case CmpInst::ICMP_SLT: // X <s -1 -> false return getFalse(ITy); @@ -5887,9 +5925,9 @@ static Value *simplifyIntrinsic(CallBase *Call, const SimplifyQuery &Q) { auto Attr = Call->getFunction()->getFnAttribute(Attribute::VScaleRange); if (!Attr.isValid()) return nullptr; - unsigned VScaleMin, VScaleMax; - std::tie(VScaleMin, VScaleMax) = Attr.getVScaleRangeArgs(); - if (VScaleMin == VScaleMax && VScaleMax != 0) + unsigned VScaleMin = Attr.getVScaleRangeMin(); + Optional<unsigned> VScaleMax = Attr.getVScaleRangeMax(); + if (VScaleMax && VScaleMin == VScaleMax) return ConstantInt::get(F->getReturnType(), VScaleMin); return nullptr; } diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 19a24ac6a484..6444518dc70c 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ 
-1568,11 +1568,12 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout(); uint64_t TypeByteSize = DL.getTypeAllocSize(ATy); + bool HasSameSize = + DL.getTypeStoreSizeInBits(ATy) == DL.getTypeStoreSizeInBits(BTy); uint64_t Stride = std::abs(StrideAPtr); const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist); if (!C) { - if (!isa<SCEVCouldNotCompute>(Dist) && - TypeByteSize == DL.getTypeAllocSize(BTy) && + if (!isa<SCEVCouldNotCompute>(Dist) && HasSameSize && isSafeDependenceDistance(DL, *(PSE.getSE()), *(PSE.getBackedgeTakenCount()), *Dist, Stride, TypeByteSize)) @@ -1587,7 +1588,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, int64_t Distance = Val.getSExtValue(); // Attempt to prove strided accesses independent. - if (std::abs(Distance) > 0 && Stride > 1 && ATy == BTy && + if (std::abs(Distance) > 0 && Stride > 1 && HasSameSize && areStridedAccessesIndependent(std::abs(Distance), Stride, TypeByteSize)) { LLVM_DEBUG(dbgs() << "LAA: Strided accesses are independent\n"); return Dependence::NoDep; @@ -1598,7 +1599,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, bool IsTrueDataDependence = (AIsWrite && !BIsWrite); if (IsTrueDataDependence && EnableForwardingConflictDetection && (couldPreventStoreLoadForward(Val.abs().getZExtValue(), TypeByteSize) || - ATy != BTy)) { + !HasSameSize)) { LLVM_DEBUG(dbgs() << "LAA: Forward but may prevent st->ld forwarding\n"); return Dependence::ForwardButPreventsForwarding; } @@ -1608,21 +1609,19 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, } // Write to the same location with the same size. - // Could be improved to assert type sizes are the same (i32 == float, etc). 
if (Val == 0) { - if (ATy == BTy) + if (HasSameSize) return Dependence::Forward; LLVM_DEBUG( - dbgs() << "LAA: Zero dependence difference but different types\n"); + dbgs() << "LAA: Zero dependence difference but different type sizes\n"); return Dependence::Unknown; } assert(Val.isStrictlyPositive() && "Expect a positive value"); - if (ATy != BTy) { - LLVM_DEBUG( - dbgs() - << "LAA: ReadWrite-Write positive dependency with different types\n"); + if (!HasSameSize) { + LLVM_DEBUG(dbgs() << "LAA: ReadWrite-Write positive dependency with " + "different type sizes\n"); return Dependence::Unknown; } diff --git a/llvm/lib/Analysis/MLInlineAdvisor.cpp b/llvm/lib/Analysis/MLInlineAdvisor.cpp index 6fc4c42bdd71..f5a65cd2b689 100644 --- a/llvm/lib/Analysis/MLInlineAdvisor.cpp +++ b/llvm/lib/Analysis/MLInlineAdvisor.cpp @@ -35,6 +35,21 @@ using namespace llvm; +#ifdef LLVM_HAVE_TF_AOT +#include "llvm/Analysis/ReleaseModeModelRunner.h" +// codegen-ed file +#include "InlinerSizeModel.h" // NOLINT +#include "llvm/Analysis/InlineModelFeatureMaps.h" + +std::unique_ptr<InlineAdvisor> +llvm::getReleaseModeAdvisor(Module &M, ModuleAnalysisManager &MAM) { + auto AOTRunner = + std::make_unique<ReleaseModeModelRunner<llvm::InlinerSizeModel>>( + M.getContext(), FeatureNameMap, DecisionName); + return std::make_unique<MLInlineAdvisor>(M, MAM, std::move(AOTRunner)); +} +#endif + #define DEBUG_TYPE "inline-ml" static cl::opt<float> SizeIncreaseThreshold( @@ -245,29 +260,32 @@ std::unique_ptr<InlineAdvice> MLInlineAdvisor::getAdviceImpl(CallBase &CB) { auto &CallerBefore = FAM.getResult<FunctionPropertiesAnalysis>(Caller); auto &CalleeBefore = FAM.getResult<FunctionPropertiesAnalysis>(Callee); - ModelRunner->setFeature(FeatureIndex::CalleeBasicBlockCount, - CalleeBefore.BasicBlockCount); - ModelRunner->setFeature(FeatureIndex::CallSiteHeight, - FunctionLevels[&Caller]); - ModelRunner->setFeature(FeatureIndex::NodeCount, NodeCount); - ModelRunner->setFeature(FeatureIndex::NrCtantParams, 
NrCtantParams); - ModelRunner->setFeature(FeatureIndex::EdgeCount, EdgeCount); - ModelRunner->setFeature(FeatureIndex::CallerUsers, CallerBefore.Uses); - ModelRunner->setFeature(FeatureIndex::CallerConditionallyExecutedBlocks, - CallerBefore.BlocksReachedFromConditionalInstruction); - ModelRunner->setFeature(FeatureIndex::CallerBasicBlockCount, - CallerBefore.BasicBlockCount); - ModelRunner->setFeature(FeatureIndex::CalleeConditionallyExecutedBlocks, - CalleeBefore.BlocksReachedFromConditionalInstruction); - ModelRunner->setFeature(FeatureIndex::CalleeUsers, CalleeBefore.Uses); - ModelRunner->setFeature(FeatureIndex::CostEstimate, CostEstimate); + *ModelRunner->getTensor<int64_t>(FeatureIndex::CalleeBasicBlockCount) = + CalleeBefore.BasicBlockCount; + *ModelRunner->getTensor<int64_t>(FeatureIndex::CallSiteHeight) = + FunctionLevels[&Caller]; + *ModelRunner->getTensor<int64_t>(FeatureIndex::NodeCount) = NodeCount; + *ModelRunner->getTensor<int64_t>(FeatureIndex::NrCtantParams) = NrCtantParams; + *ModelRunner->getTensor<int64_t>(FeatureIndex::EdgeCount) = EdgeCount; + *ModelRunner->getTensor<int64_t>(FeatureIndex::CallerUsers) = + CallerBefore.Uses; + *ModelRunner->getTensor<int64_t>( + FeatureIndex::CallerConditionallyExecutedBlocks) = + CallerBefore.BlocksReachedFromConditionalInstruction; + *ModelRunner->getTensor<int64_t>(FeatureIndex::CallerBasicBlockCount) = + CallerBefore.BasicBlockCount; + *ModelRunner->getTensor<int64_t>( + FeatureIndex::CalleeConditionallyExecutedBlocks) = + CalleeBefore.BlocksReachedFromConditionalInstruction; + *ModelRunner->getTensor<int64_t>(FeatureIndex::CalleeUsers) = + CalleeBefore.Uses; + *ModelRunner->getTensor<int64_t>(FeatureIndex::CostEstimate) = CostEstimate; // Add the cost features for (size_t I = 0; I < static_cast<size_t>(InlineCostFeatureIndex::NumberOfFeatures); ++I) { - ModelRunner->setFeature( - inlineCostFeatureToMlFeature(static_cast<InlineCostFeatureIndex>(I)), - CostFeatures->at(I)); + 
*ModelRunner->getTensor<int64_t>(inlineCostFeatureToMlFeature( + static_cast<InlineCostFeatureIndex>(I))) = CostFeatures->at(I); } return getAdviceFromModel(CB, ORE); @@ -276,7 +294,8 @@ std::unique_ptr<InlineAdvice> MLInlineAdvisor::getAdviceImpl(CallBase &CB) { std::unique_ptr<MLInlineAdvice> MLInlineAdvisor::getAdviceFromModel(CallBase &CB, OptimizationRemarkEmitter &ORE) { - return std::make_unique<MLInlineAdvice>(this, CB, ORE, ModelRunner->run()); + return std::make_unique<MLInlineAdvice>( + this, CB, ORE, static_cast<bool>(ModelRunner->evaluate<int64_t>())); } std::unique_ptr<InlineAdvice> MLInlineAdvisor::getMandatoryAdvice(CallBase &CB, @@ -302,7 +321,8 @@ void MLInlineAdvice::reportContextForRemark( using namespace ore; OR << NV("Callee", Callee->getName()); for (size_t I = 0; I < NumberOfFeatures; ++I) - OR << NV(FeatureNameMap[I], getAdvisor()->getModelRunner().getFeature(I)); + OR << NV(FeatureNameMap[I], + *getAdvisor()->getModelRunner().getTensor<int64_t>(I)); OR << NV("ShouldInline", isInliningRecommended()); } diff --git a/llvm/lib/Analysis/MemDerefPrinter.cpp b/llvm/lib/Analysis/MemDerefPrinter.cpp index 1b16e1a9bcb2..30937a2e4931 100644 --- a/llvm/lib/Analysis/MemDerefPrinter.cpp +++ b/llvm/lib/Analysis/MemDerefPrinter.cpp @@ -59,8 +59,8 @@ bool MemDerefPrinter::runOnFunction(Function &F) { Value *PO = LI->getPointerOperand(); if (isDereferenceablePointer(PO, LI->getType(), DL)) Deref.push_back(PO); - if (isDereferenceableAndAlignedPointer( - PO, LI->getType(), MaybeAlign(LI->getAlignment()), DL)) + if (isDereferenceableAndAlignedPointer(PO, LI->getType(), + MaybeAlign(LI->getAlign()), DL)) DerefAndAligned.insert(PO); } } @@ -94,8 +94,8 @@ PreservedAnalyses MemDerefPrinterPass::run(Function &F, Value *PO = LI->getPointerOperand(); if (isDereferenceablePointer(PO, LI->getType(), DL)) Deref.push_back(PO); - if (isDereferenceableAndAlignedPointer( - PO, LI->getType(), MaybeAlign(LI->getAlignment()), DL)) + if (isDereferenceableAndAlignedPointer(PO, 
LI->getType(), + MaybeAlign(LI->getAlign()), DL)) DerefAndAligned.insert(PO); } } diff --git a/llvm/lib/Analysis/MemoryBuiltins.cpp b/llvm/lib/Analysis/MemoryBuiltins.cpp index 4f2b5b34304d..ffdd7a2cfd4b 100644 --- a/llvm/lib/Analysis/MemoryBuiltins.cpp +++ b/llvm/lib/Analysis/MemoryBuiltins.cpp @@ -592,9 +592,9 @@ STATISTIC(ObjectVisitorArgument, STATISTIC(ObjectVisitorLoad, "Number of load instructions with unsolved size and offset"); -APInt ObjectSizeOffsetVisitor::align(APInt Size, uint64_t Alignment) { +APInt ObjectSizeOffsetVisitor::align(APInt Size, MaybeAlign Alignment) { if (Options.RoundToAlign && Alignment) - return APInt(IntTyBits, alignTo(Size.getZExtValue(), Align(Alignment))); + return APInt(IntTyBits, alignTo(Size.getZExtValue(), Alignment)); return Size; } @@ -669,7 +669,7 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitAllocaInst(AllocaInst &I) { APInt Size(IntTyBits, DL.getTypeAllocSize(I.getAllocatedType())); if (!I.isArrayAllocation()) - return std::make_pair(align(Size, I.getAlignment()), Zero); + return std::make_pair(align(Size, I.getAlign()), Zero); Value *ArraySize = I.getArraySize(); if (const ConstantInt *C = dyn_cast<ConstantInt>(ArraySize)) { @@ -679,8 +679,8 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitAllocaInst(AllocaInst &I) { bool Overflow; Size = Size.umul_ov(NumElems, Overflow); - return Overflow ? unknown() : std::make_pair(align(Size, I.getAlignment()), - Zero); + return Overflow ? 
unknown() + : std::make_pair(align(Size, I.getAlign()), Zero); } return unknown(); } @@ -694,7 +694,7 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitArgument(Argument &A) { } APInt Size(IntTyBits, DL.getTypeAllocSize(MemoryTy)); - return std::make_pair(align(Size, A.getParamAlignment()), Zero); + return std::make_pair(align(Size, A.getParamAlign()), Zero); } SizeOffsetType ObjectSizeOffsetVisitor::visitCallBase(CallBase &CB) { @@ -800,7 +800,7 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitGlobalVariable(GlobalVariable &GV){ return unknown(); APInt Size(IntTyBits, DL.getTypeAllocSize(GV.getValueType())); - return std::make_pair(align(Size, GV.getAlignment()), Zero); + return std::make_pair(align(Size, GV.getAlign()), Zero); } SizeOffsetType ObjectSizeOffsetVisitor::visitIntToPtrInst(IntToPtrInst&) { diff --git a/llvm/lib/Analysis/MemoryLocation.cpp b/llvm/lib/Analysis/MemoryLocation.cpp index 854ba83bd34a..a877b19df866 100644 --- a/llvm/lib/Analysis/MemoryLocation.cpp +++ b/llvm/lib/Analysis/MemoryLocation.cpp @@ -101,13 +101,8 @@ MemoryLocation MemoryLocation::getForSource(const AtomicMemTransferInst *MTI) { } MemoryLocation MemoryLocation::getForSource(const AnyMemTransferInst *MTI) { - auto Size = LocationSize::afterPointer(); - if (ConstantInt *C = dyn_cast<ConstantInt>(MTI->getLength())) - Size = LocationSize::precise(C->getValue().getZExtValue()); - - // memcpy/memmove can have AA tags. For memcpy, they apply - // to both the source and the destination. 
- return MemoryLocation(MTI->getRawSource(), Size, MTI->getAAMetadata()); + assert(MTI->getRawSource() == MTI->getArgOperand(1)); + return getForArgument(MTI, 1, nullptr); } MemoryLocation MemoryLocation::getForDest(const MemIntrinsic *MI) { @@ -119,13 +114,47 @@ MemoryLocation MemoryLocation::getForDest(const AtomicMemIntrinsic *MI) { } MemoryLocation MemoryLocation::getForDest(const AnyMemIntrinsic *MI) { - auto Size = LocationSize::afterPointer(); - if (ConstantInt *C = dyn_cast<ConstantInt>(MI->getLength())) - Size = LocationSize::precise(C->getValue().getZExtValue()); + assert(MI->getRawDest() == MI->getArgOperand(0)); + return getForArgument(MI, 0, nullptr); +} + +Optional<MemoryLocation> +MemoryLocation::getForDest(const CallBase *CB, const TargetLibraryInfo &TLI) { + if (!CB->onlyAccessesArgMemory()) + return None; + + if (CB->hasOperandBundles()) + // TODO: remove implementation restriction + return None; + + Value *UsedV = nullptr; + Optional<unsigned> UsedIdx; + for (unsigned i = 0; i < CB->arg_size(); i++) { + if (!CB->getArgOperand(i)->getType()->isPointerTy()) + continue; + if (CB->onlyReadsMemory(i)) + continue; + if (!UsedV) { + // First potentially writing parameter + UsedV = CB->getArgOperand(i); + UsedIdx = i; + continue; + } + UsedIdx = None; + if (UsedV != CB->getArgOperand(i)) + // Can't describe writing to two distinct locations. + // TODO: This results in an inprecision when two values derived from the + // same object are passed as arguments to the same function. + return None; + } + if (!UsedV) + // We don't currently have a way to represent a "does not write" result + // and thus have to be conservative and return unknown. + return None; - // memcpy/memmove can have AA tags. For memcpy, they apply - // to both the source and the destination. 
- return MemoryLocation(MI->getRawDest(), Size, MI->getAAMetadata()); + if (UsedIdx) + return getForArgument(CB, *UsedIdx, &TLI); + return MemoryLocation::getBeforeOrAfter(UsedV, CB->getAAMetadata()); } MemoryLocation MemoryLocation::getForArgument(const CallBase *Call, @@ -145,6 +174,9 @@ MemoryLocation MemoryLocation::getForArgument(const CallBase *Call, case Intrinsic::memcpy: case Intrinsic::memcpy_inline: case Intrinsic::memmove: + case Intrinsic::memcpy_element_unordered_atomic: + case Intrinsic::memmove_element_unordered_atomic: + case Intrinsic::memset_element_unordered_atomic: assert((ArgIdx == 0 || ArgIdx == 1) && "Invalid argument index for memory intrinsic"); if (ConstantInt *LenCI = dyn_cast<ConstantInt>(II->getArgOperand(2))) @@ -204,6 +236,10 @@ MemoryLocation MemoryLocation::getForArgument(const CallBase *Call, II->getArgOperand(1)->getType())), AATags); } + + assert( + !isa<AnyMemTransferInst>(II) && + "all memory transfer intrinsics should be handled by the switch above"); } // We can bound the aliasing properties of memset_pattern16 just as we can @@ -213,6 +249,12 @@ MemoryLocation MemoryLocation::getForArgument(const CallBase *Call, LibFunc F; if (TLI && TLI->getLibFunc(*Call, F) && TLI->has(F)) { switch (F) { + case LibFunc_strcpy: + case LibFunc_strcat: + case LibFunc_strncat: + assert((ArgIdx == 0 || ArgIdx == 1) && "Invalid argument index for str function"); + return MemoryLocation::getAfter(Arg, AATags); + case LibFunc_memset_chk: { assert(ArgIdx == 0 && "Invalid argument index for memset_chk"); LocationSize Size = LocationSize::afterPointer(); @@ -236,10 +278,18 @@ MemoryLocation MemoryLocation::getForArgument(const CallBase *Call, return MemoryLocation(Arg, Size, AATags); } case LibFunc_memset_pattern16: + case LibFunc_memset_pattern4: + case LibFunc_memset_pattern8: assert((ArgIdx == 0 || ArgIdx == 1) && "Invalid argument index for memset_pattern16"); - if (ArgIdx == 1) - return MemoryLocation(Arg, LocationSize::precise(16), AATags); + 
if (ArgIdx == 1) { + unsigned Size = 16; + if (F == LibFunc_memset_pattern4) + Size = 4; + else if (F == LibFunc_memset_pattern8) + Size = 8; + return MemoryLocation(Arg, LocationSize::precise(Size), AATags); + } if (const ConstantInt *LenCI = dyn_cast<ConstantInt>(Call->getArgOperand(2))) return MemoryLocation(Arg, LocationSize::precise(LenCI->getZExtValue()), @@ -274,7 +324,6 @@ MemoryLocation MemoryLocation::getForArgument(const CallBase *Call, break; }; } - // FIXME: Handle memset_pattern4 and memset_pattern8 also. return MemoryLocation::getBeforeOrAfter(Call->getArgOperand(ArgIdx), AATags); } diff --git a/llvm/lib/Analysis/ModelUnderTrainingRunner.cpp b/llvm/lib/Analysis/ModelUnderTrainingRunner.cpp new file mode 100644 index 000000000000..941458f648bc --- /dev/null +++ b/llvm/lib/Analysis/ModelUnderTrainingRunner.cpp @@ -0,0 +1,49 @@ +//===- ModelUnderTrainingRunner.cpp - 'development' mode runner -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Implementation of a MLModelRunner for 'development' mode, i.e. evaluation +// happens off a model that's provided from the command line and is interpreted. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Config/config.h" +#if defined(LLVM_HAVE_TF_API) + +#include "llvm/Analysis/ModelUnderTrainingRunner.h" + +using namespace llvm; + +ModelUnderTrainingRunner::ModelUnderTrainingRunner( + LLVMContext &Ctx, const std::string &ModelPath, + const std::vector<TensorSpec> &InputSpecs, + const std::vector<LoggedFeatureSpec> &OutputSpecs) + : MLModelRunner(Ctx), OutputSpecs(OutputSpecs) { + Evaluator = std::make_unique<TFModelEvaluator>( + ModelPath, InputSpecs, [&](size_t I) { return OutputSpecs[I].Spec; }, + OutputSpecs.size()); + if (!Evaluator || !Evaluator->isValid()) { + Ctx.emitError("Failed to create inliner saved model evaluator"); + Evaluator.reset(); + return; + } +} + +void *ModelUnderTrainingRunner::evaluateUntyped() { + LastEvaluationResult = Evaluator->evaluate(); + if (!LastEvaluationResult.hasValue()) { + Ctx.emitError("Error evaluating model."); + return nullptr; + } + return LastEvaluationResult->getUntypedTensorValue(0); +} + +void *ModelUnderTrainingRunner::getTensorUntyped(size_t Index) { + return Evaluator->getUntypedInput(Index); +} + +#endif // defined(LLVM_HAVE_TF_API) diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp index d80814852e19..2880ca62a7f8 100644 --- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp +++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp @@ -234,6 +234,18 @@ static bool isNonVolatileStore(const Instruction *I) { return false; } +// Returns true if the function definition must be unreachable. +// +// Note if this helper function returns true, `F` is guaranteed +// to be unreachable; if it returns false, `F` might still +// be unreachable but not covered by this helper function. +static bool mustBeUnreachableFunction(const Function &F) { + // A function must be unreachable if its entry block ends with an + // 'unreachable'. 
+ assert(!F.isDeclaration()); + return isa<UnreachableInst>(F.getEntryBlock().getTerminator()); +} + static void computeFunctionSummary( ModuleSummaryIndex &Index, const Module &M, const Function &F, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, DominatorTree &DT, @@ -488,7 +500,8 @@ static void computeFunctionSummary( // Don't try to import functions with noinline attribute. F.getAttributes().hasFnAttr(Attribute::NoInline), F.hasFnAttribute(Attribute::AlwaysInline), - F.hasFnAttribute(Attribute::NoUnwind), MayThrow, HasUnknownCall}; + F.hasFnAttribute(Attribute::NoUnwind), MayThrow, HasUnknownCall, + mustBeUnreachableFunction(F)}; std::vector<FunctionSummary::ParamAccess> ParamAccesses; if (auto *SSI = GetSSICallback(F)) ParamAccesses = SSI->getParamAccesses(Index); @@ -737,7 +750,8 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( F->hasFnAttribute(Attribute::AlwaysInline), F->hasFnAttribute(Attribute::NoUnwind), /* MayThrow */ true, - /* HasUnknownCall */ true}, + /* HasUnknownCall */ true, + /* MustBeUnreachable */ false}, /*EntryCount=*/0, ArrayRef<ValueInfo>{}, ArrayRef<FunctionSummary::EdgeTy>{}, ArrayRef<GlobalValue::GUID>{}, diff --git a/llvm/lib/Analysis/NoInferenceModelRunner.cpp b/llvm/lib/Analysis/NoInferenceModelRunner.cpp new file mode 100644 index 000000000000..02ece6aa3900 --- /dev/null +++ b/llvm/lib/Analysis/NoInferenceModelRunner.cpp @@ -0,0 +1,33 @@ +//===- NoInferenceModelRunner.cpp - noop ML model runner ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// A pseudo model runner. We use it to store feature values when collecting +// logs for the default policy, in 'development' mode, but never ask it to +// 'run'. 
+//===----------------------------------------------------------------------===// +#include "llvm/Config/config.h" +#if defined(LLVM_HAVE_TF_API) + +#include "llvm/Analysis/NoInferenceModelRunner.h" +#include "llvm/Analysis/Utils/TFUtils.h" + +using namespace llvm; + +NoInferenceModelRunner::NoInferenceModelRunner( + LLVMContext &Ctx, const std::vector<TensorSpec> &Inputs) + : MLModelRunner(Ctx) { + ValuesBuffer.reserve(Inputs.size()); + for (const auto &TS : Inputs) + ValuesBuffer.push_back(std::make_unique<char[]>(TS.getElementCount() * + TS.getElementByteSize())); +} + +void *NoInferenceModelRunner::getTensorUntyped(size_t Index) { + return ValuesBuffer[Index].get(); +} +#endif // defined(LLVM_HAVE_TF_API) diff --git a/llvm/lib/Analysis/ReleaseModeModelRunner.cpp b/llvm/lib/Analysis/ReleaseModeModelRunner.cpp deleted file mode 100644 index d2bf95388066..000000000000 --- a/llvm/lib/Analysis/ReleaseModeModelRunner.cpp +++ /dev/null @@ -1,90 +0,0 @@ -//===- ReleaseModeModelRunner.cpp - Fast, precompiled model runner -------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements a model runner wrapping an AOT compiled ML model. -// Only inference is supported. -// -//===----------------------------------------------------------------------===// -#include "llvm/Config/config.h" -#if defined(LLVM_HAVE_TF_AOT) - -#include "llvm/Analysis/InlineModelFeatureMaps.h" -#include "llvm/Analysis/MLInlineAdvisor.h" - -// codegen-ed file -#include "InlinerSizeModel.h" // NOLINT - -#include <memory> -#include <vector> - -using namespace llvm; -namespace { - -const char FeedPrefix[] = "feed_"; -const char FetchPrefix[] = "fetch_"; - -/// MLModelRunner - production mode implementation. 
It uses a AOT-compiled -/// SavedModel for efficient execution. -class ReleaseModeModelRunner final : public MLModelRunner { -public: - ReleaseModeModelRunner(LLVMContext &Ctx); - virtual ~ReleaseModeModelRunner() = default; - - bool run() override; - - void setFeature(FeatureIndex Index, int64_t Value) override; - int64_t getFeature(int Index) const override; - -private: - std::vector<int32_t> FeatureIndices; - int32_t ResultIndex = -1; - std::unique_ptr<llvm::InlinerSizeModel> CompiledModel; -}; -} // namespace - -ReleaseModeModelRunner::ReleaseModeModelRunner(LLVMContext &Ctx) - : MLModelRunner(Ctx), - CompiledModel(std::make_unique<llvm::InlinerSizeModel>()) { - assert(CompiledModel && "The CompiledModel should be valid"); - - FeatureIndices.resize(NumberOfFeatures); - - for (size_t I = 0; I < NumberOfFeatures; ++I) { - const int Index = - CompiledModel->LookupArgIndex(FeedPrefix + FeatureNameMap[I]); - assert(Index >= 0 && "Cannot find Feature in inlining model"); - FeatureIndices[I] = Index; - } - - ResultIndex = - CompiledModel->LookupResultIndex(std::string(FetchPrefix) + DecisionName); - assert(ResultIndex >= 0 && "Cannot find DecisionName in inlining model"); -} - -int64_t ReleaseModeModelRunner::getFeature(int Index) const { - return *static_cast<int64_t *>( - CompiledModel->arg_data(FeatureIndices[Index])); -} - -void ReleaseModeModelRunner::setFeature(FeatureIndex Index, int64_t Value) { - *static_cast<int64_t *>(CompiledModel->arg_data( - FeatureIndices[static_cast<size_t>(Index)])) = Value; -} - -bool ReleaseModeModelRunner::run() { - CompiledModel->Run(); - return static_cast<bool>( - *static_cast<int64_t *>(CompiledModel->result_data(ResultIndex))); -} - -std::unique_ptr<InlineAdvisor> -llvm::getReleaseModeAdvisor(Module &M, ModuleAnalysisManager &MAM) { - auto AOTRunner = std::make_unique<ReleaseModeModelRunner>(M.getContext()); - return std::make_unique<MLInlineAdvisor>(M, MAM, std::move(AOTRunner)); -} -#endif // defined(LLVM_HAVE_TF_AOT) diff 
--git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 7dc7f9904c70..0c3f32295ae1 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -8829,11 +8829,10 @@ const SCEV *ScalarEvolution::getSCEVAtScope(const SCEV *V, const Loop *L) { for (auto &LS : reverse(ValuesAtScopes[V])) if (LS.first == L) { LS.second = C; + if (!isa<SCEVConstant>(C)) + ValuesAtScopesUsers[C].push_back({L, V}); break; } - - if (!isa<SCEVConstant>(C)) - ValuesAtScopesUsers[C].push_back({L, V}); return C; } @@ -13058,11 +13057,13 @@ void ScalarEvolution::verify() const { Worklist.append(L->begin(), L->end()); } for (auto &KV : ValueExprMap) { +#ifndef NDEBUG // Check for SCEV expressions referencing invalid/deleted loops. if (auto *AR = dyn_cast<SCEVAddRecExpr>(KV.second)) { assert(ValidLoops.contains(AR->getLoop()) && "AddRec references invalid loop"); } +#endif // Check that the value is also part of the reverse map. auto It = ExprValueMap.find(KV.second); @@ -13122,7 +13123,7 @@ void ScalarEvolution::verify() const { is_contained(It->second, std::make_pair(L, Value))) continue; dbgs() << "Value: " << *Value << ", Loop: " << *L << ", ValueAtScope: " - << ValueAtScope << " missing in ValuesAtScopesUsers\n"; + << *ValueAtScope << " missing in ValuesAtScopesUsers\n"; std::abort(); } } @@ -13139,7 +13140,7 @@ void ScalarEvolution::verify() const { is_contained(It->second, std::make_pair(L, ValueAtScope))) continue; dbgs() << "Value: " << *Value << ", Loop: " << *L << ", ValueAtScope: " - << ValueAtScope << " missing in ValuesAtScopes\n"; + << *ValueAtScope << " missing in ValuesAtScopes\n"; std::abort(); } } @@ -13958,11 +13959,12 @@ const SCEV *ScalarEvolution::applyLoopGuards(const SCEV *Expr, const Loop *L) { ExprsToRewrite.push_back(LHS); } }; - // Starting at the loop predecessor, climb up the predecessor chain, as long - // as there are predecessors that can be found that have unique successors - // leading to 
the original header. + // First, collect conditions from dominating branches. Starting at the loop + // predecessor, climb up the predecessor chain, as long as there are + // predecessors that can be found that have unique successors leading to the + // original header. // TODO: share this logic with isLoopEntryGuardedByCond. - DenseMap<const SCEV *, const SCEV *> RewriteMap; + SmallVector<std::pair<Value *, bool>> Terms; for (std::pair<const BasicBlock *, const BasicBlock *> Pair( L->getLoopPredecessor(), L->getHeader()); Pair.first; Pair = getPredecessorWithUniqueSuccessorForBB(Pair.first)) { @@ -13972,10 +13974,20 @@ const SCEV *ScalarEvolution::applyLoopGuards(const SCEV *Expr, const Loop *L) { if (!LoopEntryPredicate || LoopEntryPredicate->isUnconditional()) continue; - bool EnterIfTrue = LoopEntryPredicate->getSuccessor(0) == Pair.second; + Terms.emplace_back(LoopEntryPredicate->getCondition(), + LoopEntryPredicate->getSuccessor(0) == Pair.second); + } + + // Now apply the information from the collected conditions to RewriteMap. + // Conditions are processed in reverse order, so the earliest conditions is + // processed first. This ensures the SCEVs with the shortest dependency chains + // are constructed first. + DenseMap<const SCEV *, const SCEV *> RewriteMap; + for (auto &E : reverse(Terms)) { + bool EnterIfTrue = E.second; SmallVector<Value *, 8> Worklist; SmallPtrSet<Value *, 8> Visited; - Worklist.push_back(LoopEntryPredicate->getCondition()); + Worklist.push_back(E.first); while (!Worklist.empty()) { Value *Cond = Worklist.pop_back_val(); if (!Visited.insert(Cond).second) diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp index 72fbd5ad3f68..02923c2c7eb1 100644 --- a/llvm/lib/Analysis/TargetLibraryInfo.cpp +++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp @@ -238,9 +238,8 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, // e.g., x86_64-pc-windows-msvc18. 
bool hasPartialC99 = true; if (T.isKnownWindowsMSVCEnvironment()) { - unsigned Major, Minor, Micro; - T.getEnvironmentVersion(Major, Minor, Micro); - hasPartialC99 = (Major == 0 || Major >= 19); + VersionTuple Version = T.getEnvironmentVersion(); + hasPartialC99 = (Version.getMajor() == 0 || Version.getMajor() >= 19); } // Latest targets support C89 math functions, in part. diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 5067f493f02d..6aa9a77391dc 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -982,10 +982,10 @@ bool TargetTransformInfo::areInlineCompatible(const Function *Caller, return TTIImpl->areInlineCompatible(Caller, Callee); } -bool TargetTransformInfo::areFunctionArgsABICompatible( +bool TargetTransformInfo::areTypesABICompatible( const Function *Caller, const Function *Callee, - SmallPtrSetImpl<Argument *> &Args) const { - return TTIImpl->areFunctionArgsABICompatible(Caller, Callee, Args); + const ArrayRef<Type *> &Types) const { + return TTIImpl->areTypesABICompatible(Caller, Callee, Types); } bool TargetTransformInfo::isIndexedLoadLegal(MemIndexedMode Mode, @@ -1072,8 +1072,13 @@ bool TargetTransformInfo::supportsScalableVectors() const { return TTIImpl->supportsScalableVectors(); } -bool TargetTransformInfo::hasActiveVectorLength() const { - return TTIImpl->hasActiveVectorLength(); +bool TargetTransformInfo::enableScalableVectorization() const { + return TTIImpl->enableScalableVectorization(); +} + +bool TargetTransformInfo::hasActiveVectorLength(unsigned Opcode, Type *DataType, + Align Alignment) const { + return TTIImpl->hasActiveVectorLength(Opcode, DataType, Alignment); } InstructionCost diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 1c41c77a8cfb..fc378f97de0b 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -1154,7 +1154,7 @@ static void 
computeKnownBitsFromOperator(const Operator *I, // If the negate has an NSW flag we can assume the sign bit of the result // will be 0 because that makes abs(INT_MIN) undefined. if (match(RHS, m_Neg(m_Specific(LHS))) && - Q.IIQ.hasNoSignedWrap(cast<Instruction>(RHS))) + Q.IIQ.hasNoSignedWrap(cast<OverflowingBinaryOperator>(RHS))) Known.Zero.setSignBit(); } @@ -1709,23 +1709,25 @@ static void computeKnownBitsFromOperator(const Operator *I, !II->getFunction()->hasFnAttribute(Attribute::VScaleRange)) break; - auto VScaleRange = II->getFunction() - ->getFnAttribute(Attribute::VScaleRange) - .getVScaleRangeArgs(); + auto Attr = II->getFunction()->getFnAttribute(Attribute::VScaleRange); + Optional<unsigned> VScaleMax = Attr.getVScaleRangeMax(); - if (VScaleRange.second == 0) + if (!VScaleMax) break; + unsigned VScaleMin = Attr.getVScaleRangeMin(); + // If vscale min = max then we know the exact value at compile time // and hence we know the exact bits. - if (VScaleRange.first == VScaleRange.second) { - Known.One = VScaleRange.first; - Known.Zero = VScaleRange.first; + if (VScaleMin == VScaleMax) { + Known.One = VScaleMin; + Known.Zero = VScaleMin; Known.Zero.flipAllBits(); break; } - unsigned FirstZeroHighBit = 32 - countLeadingZeros(VScaleRange.second); + unsigned FirstZeroHighBit = + 32 - countLeadingZeros(VScaleMax.getValue()); if (FirstZeroHighBit < BitWidth) Known.Zero.setBitsFrom(FirstZeroHighBit); @@ -4676,8 +4678,8 @@ bool llvm::isSafeToSpeculativelyExecute(const Value *V, return false; const DataLayout &DL = LI->getModule()->getDataLayout(); return isDereferenceableAndAlignedPointer( - LI->getPointerOperand(), LI->getType(), MaybeAlign(LI->getAlignment()), - DL, CtxI, DT, TLI); + LI->getPointerOperand(), LI->getType(), MaybeAlign(LI->getAlign()), DL, + CtxI, DT, TLI); } case Instruction::Call: { auto *CI = cast<const CallInst>(Inst); @@ -4975,14 +4977,6 @@ static bool canCreateUndefOrPoison(const Operator *Op, bool PoisonOnly, if (ConsiderFlags && 
Op->hasPoisonGeneratingFlags()) return true; - // TODO: this should really be under the ConsiderFlags block, but currently - // these are not dropped by dropPoisonGeneratingFlags - if (const auto *FP = dyn_cast<FPMathOperator>(Op)) { - auto FMF = FP->getFastMathFlags(); - if (FMF.noNaNs() || FMF.noInfs()) - return true; - } - unsigned Opcode = Op->getOpcode(); // Check whether opcode is a poison/undef-generating operation diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp index 41fb0b9008be..e3bf41c9721b 100644 --- a/llvm/lib/AsmParser/LLLexer.cpp +++ b/llvm/lib/AsmParser/LLLexer.cpp @@ -733,6 +733,7 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(x); KEYWORD(blockaddress); KEYWORD(dso_local_equivalent); + KEYWORD(no_cfi); // Metadata types. KEYWORD(distinct); @@ -773,6 +774,7 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(noUnwind); KEYWORD(mayThrow); KEYWORD(hasUnknownCall); + KEYWORD(mustBeUnreachable); KEYWORD(calls); KEYWORD(callee); KEYWORD(params); diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index 5feabd876e3a..35c615522fe2 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -152,28 +152,28 @@ bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) { FnAttrs.removeAttribute(Attribute::Alignment); } - AS = AS.addFnAttributes(Context, AttributeSet::get(Context, FnAttrs)); + AS = AS.addFnAttributes(Context, FnAttrs); Fn->setAttributes(AS); } else if (CallInst *CI = dyn_cast<CallInst>(V)) { AttributeList AS = CI->getAttributes(); AttrBuilder FnAttrs(AS.getFnAttrs()); AS = AS.removeFnAttributes(Context); FnAttrs.merge(B); - AS = AS.addFnAttributes(Context, AttributeSet::get(Context, FnAttrs)); + AS = AS.addFnAttributes(Context, FnAttrs); CI->setAttributes(AS); } else if (InvokeInst *II = dyn_cast<InvokeInst>(V)) { AttributeList AS = II->getAttributes(); AttrBuilder FnAttrs(AS.getFnAttrs()); AS = AS.removeFnAttributes(Context); FnAttrs.merge(B); - AS = 
AS.addFnAttributes(Context, AttributeSet::get(Context, FnAttrs)); + AS = AS.addFnAttributes(Context, FnAttrs); II->setAttributes(AS); } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(V)) { AttributeList AS = CBI->getAttributes(); AttrBuilder FnAttrs(AS.getFnAttrs()); AS = AS.removeFnAttributes(Context); FnAttrs.merge(B); - AS = AS.addFnAttributes(Context, AttributeSet::get(Context, FnAttrs)); + AS = AS.addFnAttributes(Context, FnAttrs); CBI->setAttributes(AS); } else if (auto *GV = dyn_cast<GlobalVariable>(V)) { AttrBuilder Attrs(GV->getAttributes()); @@ -1306,7 +1306,8 @@ bool LLParser::parseEnumAttribute(Attribute::AttrKind Attr, AttrBuilder &B, unsigned MinValue, MaxValue; if (parseVScaleRangeArguments(MinValue, MaxValue)) return true; - B.addVScaleRangeAttr(MinValue, MaxValue); + B.addVScaleRangeAttr(MinValue, + MaxValue > 0 ? MaxValue : Optional<unsigned>()); return false; } case Attribute::Dereferenceable: { @@ -3287,6 +3288,20 @@ bool LLParser::parseValID(ValID &ID, PerFunctionState *PFS, Type *ExpectedTy) { return false; } + case lltok::kw_no_cfi: { + // ValID ::= 'no_cfi' @foo + Lex.Lex(); + + if (parseValID(ID, PFS)) + return true; + + if (ID.Kind != ValID::t_GlobalID && ID.Kind != ValID::t_GlobalName) + return error(ID.Loc, "expected global value name in no_cfi"); + + ID.NoCFI = true; + return false; + } + case lltok::kw_trunc: case lltok::kw_zext: case lltok::kw_sext: @@ -5267,9 +5282,13 @@ bool LLParser::convertValIDToValue(Type *Ty, ValID &ID, Value *&V, } case ValID::t_GlobalName: V = getGlobalVal(ID.StrVal, Ty, ID.Loc); + if (V && ID.NoCFI) + V = NoCFIValue::get(cast<GlobalValue>(V)); return V == nullptr; case ValID::t_GlobalID: V = getGlobalVal(ID.UIntVal, Ty, ID.Loc); + if (V && ID.NoCFI) + V = NoCFIValue::get(cast<GlobalValue>(V)); return V == nullptr; case ValID::t_APSInt: if (!Ty->isIntegerTy()) @@ -8533,6 +8552,7 @@ bool LLParser::parseFlag(unsigned &Val) { /// [',' 'noUnwind' ':' Flag]? ')' /// [',' 'mayThrow' ':' Flag]? 
')' /// [',' 'hasUnknownCall' ':' Flag]? ')' +/// [',' 'mustBeUnreachable' ':' Flag]? ')' bool LLParser::parseOptionalFFlags(FunctionSummary::FFlags &FFlags) { assert(Lex.getKind() == lltok::kw_funcFlags); @@ -8599,6 +8619,12 @@ bool LLParser::parseOptionalFFlags(FunctionSummary::FFlags &FFlags) { return true; FFlags.HasUnknownCall = Val; break; + case lltok::kw_mustBeUnreachable: + Lex.Lex(); + if (parseToken(lltok::colon, "expected ':'") || parseFlag(Val)) + return true; + FFlags.MustBeUnreachable = Val; + break; default: return error(Lex.getLoc(), "expected function flag type"); } diff --git a/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp b/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp index cd1d872cc219..284e469a1d2f 100644 --- a/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp +++ b/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp @@ -57,11 +57,7 @@ bool MetadataVerifier::verifyArray( auto &Array = Node.getArray(); if (Size && Array.size() != *Size) return false; - for (auto &Item : Array) - if (!verifyNode(Item)) - return false; - - return true; + return llvm::all_of(Array, verifyNode); } bool MetadataVerifier::verifyEntry( diff --git a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp index d7bcb0d7f575..a36b256c29b6 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp @@ -107,9 +107,9 @@ static Optional<const char *> GetCodeName(unsigned CodeID, unsigned BlockID, // Check to see if we have a blockinfo record for this record, with a name. 
if (const BitstreamBlockInfo::BlockInfo *Info = BlockInfo.getBlockInfo(BlockID)) { - for (unsigned i = 0, e = Info->RecordNames.size(); i != e; ++i) - if (Info->RecordNames[i].first == CodeID) - return Info->RecordNames[i].second.c_str(); + for (const std::pair<unsigned, std::string> &RN : Info->RecordNames) + if (RN.first == CodeID) + return RN.second.c_str(); } if (CurStreamType != LLVMIRBitstream) @@ -219,6 +219,7 @@ static Optional<const char *> GetCodeName(unsigned CodeID, unsigned BlockID, STRINGIFY_CODE(CST_CODE, CE_SHUFVEC_EX) STRINGIFY_CODE(CST_CODE, CE_UNOP) STRINGIFY_CODE(CST_CODE, DSO_LOCAL_EQUIVALENT) + STRINGIFY_CODE(CST_CODE, NO_CFI_VALUE) case bitc::CST_CODE_BLOCKADDRESS: return "CST_CODE_BLOCKADDRESS"; STRINGIFY_CODE(CST_CODE, DATA) @@ -646,16 +647,14 @@ void BitcodeAnalyzer::printStats(BCDumpOptions O, // Emit per-block stats. O.OS << "Per-block Summary:\n"; - for (std::map<unsigned, PerBlockIDStats>::iterator I = BlockIDStats.begin(), - E = BlockIDStats.end(); - I != E; ++I) { - O.OS << " Block ID #" << I->first; + for (const auto &Stat : BlockIDStats) { + O.OS << " Block ID #" << Stat.first; if (Optional<const char *> BlockName = - GetBlockName(I->first, BlockInfo, CurStreamType)) + GetBlockName(Stat.first, BlockInfo, CurStreamType)) O.OS << " (" << *BlockName << ")"; O.OS << ":\n"; - const PerBlockIDStats &Stats = I->second; + const PerBlockIDStats &Stats = Stat.second; O.OS << " Num Instances: " << Stats.NumInstances << "\n"; O.OS << " Total Size: "; printSize(O.OS, Stats.NumBits); @@ -694,8 +693,8 @@ void BitcodeAnalyzer::printStats(BCDumpOptions O, O.OS << "\tRecord Histogram:\n"; O.OS << "\t\t Count # Bits b/Rec % Abv Record Kind\n"; - for (unsigned i = 0, e = FreqPairs.size(); i != e; ++i) { - const PerRecordStats &RecStats = Stats.CodeFreq[FreqPairs[i].second]; + for (const auto &FreqPair : FreqPairs) { + const PerRecordStats &RecStats = Stats.CodeFreq[FreqPair.second]; O.OS << format("\t\t%7d %9lu", RecStats.NumInstances, (unsigned 
long)RecStats.TotalBits); @@ -714,10 +713,10 @@ void BitcodeAnalyzer::printStats(BCDumpOptions O, O.OS << " "; if (Optional<const char *> CodeName = GetCodeName( - FreqPairs[i].second, I->first, BlockInfo, CurStreamType)) + FreqPair.second, Stat.first, BlockInfo, CurStreamType)) O.OS << *CodeName << "\n"; else - O.OS << "UnknownCode" << FreqPairs[i].second << "\n"; + O.OS << "UnknownCode" << FreqPair.second << "\n"; } O.OS << "\n"; } diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 993cb1de8c02..f5a878f8788a 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -488,6 +488,7 @@ class BitcodeReader : public BitcodeReaderBase, public GVMaterializer { BitcodeReaderValueList ValueList; Optional<MetadataLoader> MDLoader; std::vector<Comdat *> ComdatList; + DenseSet<GlobalObject *> ImplicitComdatObjects; SmallVector<Instruction *, 64> InstructionList; std::vector<std::pair<GlobalVariable *, unsigned>> GlobalInits; @@ -932,6 +933,7 @@ static FunctionSummary::FFlags getDecodedFFlags(uint64_t RawFlags) { Flags.NoUnwind = (RawFlags >> 6) & 0x1; Flags.MayThrow = (RawFlags >> 7) & 0x1; Flags.HasUnknownCall = (RawFlags >> 8) & 0x1; + Flags.MustBeUnreachable = (RawFlags >> 9) & 0x1; return Flags; } @@ -2037,14 +2039,8 @@ Expected<Value *> BitcodeReader::recordValue(SmallVectorImpl<uint64_t> &Record, return error("Invalid value name"); V->setName(NameStr); auto *GO = dyn_cast<GlobalObject>(V); - if (GO) { - if (GO->getComdat() == reinterpret_cast<Comdat *>(1)) { - if (TT.supportsCOMDAT()) - GO->setComdat(TheModule->getOrInsertComdat(V->getName())); - else - GO->setComdat(nullptr); - } - } + if (GO && ImplicitComdatObjects.contains(GO) && TT.supportsCOMDAT()) + GO->setComdat(TheModule->getOrInsertComdat(V->getName())); return V; } @@ -2942,6 +2938,19 @@ Error BitcodeReader::parseConstants() { V = DSOLocalEquivalent::get(GV); break; } + case bitc::CST_CODE_NO_CFI_VALUE: { + 
if (Record.size() < 2) + return error("Invalid record"); + Type *GVTy = getTypeByID(Record[0]); + if (!GVTy) + return error("Invalid record"); + GlobalValue *GV = dyn_cast_or_null<GlobalValue>( + ValueList.getConstantFwdRef(Record[1], GVTy)); + if (!GV) + return error("Invalid record"); + V = NoCFIValue::get(GV); + break; + } } ValueList.assignValue(V, NextCstNo); @@ -3292,7 +3301,7 @@ Error BitcodeReader::parseGlobalVarRecord(ArrayRef<uint64_t> Record) { NewGV->setComdat(ComdatList[ComdatID - 1]); } } else if (hasImplicitComdat(RawLinkage)) { - NewGV->setComdat(reinterpret_cast<Comdat *>(1)); + ImplicitComdatObjects.insert(NewGV); } if (Record.size() > 12) { @@ -3426,7 +3435,7 @@ Error BitcodeReader::parseFunctionRecord(ArrayRef<uint64_t> Record) { Func->setComdat(ComdatList[ComdatID - 1]); } } else if (hasImplicitComdat(RawLinkage)) { - Func->setComdat(reinterpret_cast<Comdat *>(1)); + ImplicitComdatObjects.insert(Func); } if (Record.size() > 13) @@ -6733,10 +6742,10 @@ llvm::getBitcodeFileContents(MemoryBufferRef Buffer) { // not have its own string table. A bitcode file may have multiple // string tables if it was created by binary concatenation, for example // with "llvm-cat -b". 
- for (auto I = F.Mods.rbegin(), E = F.Mods.rend(); I != E; ++I) { - if (!I->Strtab.empty()) + for (BitcodeModule &I : llvm::reverse(F.Mods)) { + if (!I.Strtab.empty()) break; - I->Strtab = *Strtab; + I.Strtab = *Strtab; } // Similarly, the string table is used by every preceding symbol table; // normally there will be just one unless the bitcode file was created diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index e2354c40844a..dc06bc10cf95 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -833,8 +833,7 @@ void ModuleBitcodeWriter::writeAttributeTable() { Stream.EnterSubblock(bitc::PARAMATTR_BLOCK_ID, 3); SmallVector<uint64_t, 64> Record; - for (unsigned i = 0, e = Attrs.size(); i != e; ++i) { - AttributeList AL = Attrs[i]; + for (const AttributeList &AL : Attrs) { for (unsigned i : AL.indexes()) { AttributeSet AS = AL.getAttributes(i); if (AS.hasAttributes()) @@ -1067,6 +1066,7 @@ static uint64_t getEncodedFFlags(FunctionSummary::FFlags Flags) { RawFlags |= (Flags.NoUnwind << 6); RawFlags |= (Flags.MayThrow << 7); RawFlags |= (Flags.HasUnknownCall << 8); + RawFlags |= (Flags.MustBeUnreachable << 9); return RawFlags; } @@ -2657,6 +2657,10 @@ void ModuleBitcodeWriter::writeConstants(unsigned FirstVal, unsigned LastVal, Code = bitc::CST_CODE_DSO_LOCAL_EQUIVALENT; Record.push_back(VE.getTypeID(Equiv->getGlobalValue()->getType())); Record.push_back(VE.getValueID(Equiv->getGlobalValue())); + } else if (const auto *NC = dyn_cast<NoCFIValue>(C)) { + Code = bitc::CST_CODE_NO_CFI_VALUE; + Record.push_back(VE.getTypeID(NC->getGlobalValue()->getType())); + Record.push_back(VE.getValueID(NC->getGlobalValue())); } else { #ifndef NDEBUG C->dump(); diff --git a/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp b/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp index 07e0708e68c3..df4f1a1873d7 100644 --- a/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp +++ 
b/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp @@ -310,8 +310,7 @@ static UseListOrderStack predictUseListOrder(const Module &M) { // We want to visit the functions backward now so we can list function-local // constants in the last Function they're used in. Module-level constants // have already been visited above. - for (auto I = M.rbegin(), E = M.rend(); I != E; ++I) { - const Function &F = *I; + for (const Function &F : llvm::reverse(M)) { if (F.isDeclaration()) continue; for (const BasicBlock &BB : F) @@ -541,9 +540,8 @@ void ValueEnumerator::print(raw_ostream &OS, const ValueMapType &Map, const char *Name) const { OS << "Map Name: " << Name << "\n"; OS << "Size: " << Map.size() << "\n"; - for (ValueMapType::const_iterator I = Map.begin(), - E = Map.end(); I != E; ++I) { - const Value *V = I->first; + for (const auto &I : Map) { + const Value *V = I.first; if (V->hasName()) OS << "Value: " << V->getName(); else @@ -569,10 +567,10 @@ void ValueEnumerator::print(raw_ostream &OS, const MetadataMapType &Map, const char *Name) const { OS << "Map Name: " << Name << "\n"; OS << "Size: " << Map.size() << "\n"; - for (auto I = Map.begin(), E = Map.end(); I != E; ++I) { - const Metadata *MD = I->first; - OS << "Metadata: slot = " << I->second.ID << "\n"; - OS << "Metadata: function = " << I->second.F << "\n"; + for (const auto &I : Map) { + const Metadata *MD = I.first; + OS << "Metadata: slot = " << I.second.ID << "\n"; + OS << "Metadata: function = " << I.second.F << "\n"; MD->print(OS); OS << "\n"; } diff --git a/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp index 5984063627b0..5c64622c7245 100644 --- a/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp +++ b/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp @@ -561,8 +561,7 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters( << ":\n"); std::map<unsigned, BitVector> RenameRegisterMap; unsigned SuperReg = 0; - for (unsigned i = 0, e = Regs.size(); i != e; ++i) { - 
unsigned Reg = Regs[i]; + for (unsigned Reg : Regs) { if ((SuperReg == 0) || TRI->isSuperRegister(SuperReg, Reg)) SuperReg = Reg; @@ -584,8 +583,7 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters( } // All group registers should be a subreg of SuperReg. - for (unsigned i = 0, e = Regs.size(); i != e; ++i) { - unsigned Reg = Regs[i]; + for (unsigned Reg : Regs) { if (Reg == SuperReg) continue; bool IsSub = TRI->isSubRegister(SuperReg, Reg); // FIXME: remove this once PR18663 has been properly fixed. For now, @@ -646,8 +644,7 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters( // For each referenced group register (which must be a SuperReg or // a subregister of SuperReg), find the corresponding subregister // of NewSuperReg and make sure it is free to be renamed. - for (unsigned i = 0, e = Regs.size(); i != e; ++i) { - unsigned Reg = Regs[i]; + for (unsigned Reg : Regs) { unsigned NewReg = 0; if (Reg == SuperReg) { NewReg = NewSuperReg; diff --git a/llvm/lib/CodeGen/Analysis.cpp b/llvm/lib/CodeGen/Analysis.cpp index 7d8a73e12d3a..7e68e5e22879 100644 --- a/llvm/lib/CodeGen/Analysis.cpp +++ b/llvm/lib/CodeGen/Analysis.cpp @@ -712,8 +712,8 @@ bool llvm::returnTypeIsEligibleForTailCall(const Function *F, // The manipulations performed when we're looking through an insertvalue or // an extractvalue would happen at the front of the RetPath list, so since // we have to copy it anyway it's more efficient to create a reversed copy. - SmallVector<unsigned, 4> TmpRetPath(RetPath.rbegin(), RetPath.rend()); - SmallVector<unsigned, 4> TmpCallPath(CallPath.rbegin(), CallPath.rend()); + SmallVector<unsigned, 4> TmpRetPath(llvm::reverse(RetPath)); + SmallVector<unsigned, 4> TmpCallPath(llvm::reverse(CallPath)); // Finally, we can check whether the value produced by the tail call at this // index is compatible with the value we return. 
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 828cb760b82e..533f20535655 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -180,7 +180,7 @@ Align AsmPrinter::getGVAlignment(const GlobalObject *GV, const DataLayout &DL, Alignment = InAlign; // If the GV has a specified alignment, take it into account. - const MaybeAlign GVAlign(GV->getAlignment()); + const MaybeAlign GVAlign(GV->getAlign()); if (!GVAlign) return Alignment; @@ -288,7 +288,11 @@ bool AsmPrinter::doInitialization(Module &M) { // use the directive, where it would need the same conditionalization // anyway. const Triple &Target = TM.getTargetTriple(); - OutStreamer->emitVersionForTarget(Target, M.getSDKVersion()); + Triple TVT(M.getDarwinTargetVariantTriple()); + OutStreamer->emitVersionForTarget( + Target, M.getSDKVersion(), + M.getDarwinTargetVariantTriple().empty() ? nullptr : &TVT, + M.getDarwinTargetVariantSDKVersion()); // Allow the target to emit any magic that it wants at the start of the file. 
emitStartOfAsmFile(M); @@ -1856,6 +1860,17 @@ bool AsmPrinter::doFinalization(Module &M) { continue; OutStreamer->emitSymbolAttribute(getSymbol(&GO), MCSA_WeakReference); } + if (shouldEmitWeakSwiftAsyncExtendedFramePointerFlags()) { + auto SymbolName = "swift_async_extendedFramePointerFlags"; + auto Global = M.getGlobalVariable(SymbolName); + if (!Global) { + auto Int8PtrTy = Type::getInt8PtrTy(M.getContext()); + Global = new GlobalVariable(M, Int8PtrTy, false, + GlobalValue::ExternalWeakLinkage, nullptr, + SymbolName); + OutStreamer->emitSymbolAttribute(getSymbol(Global), MCSA_WeakReference); + } + } } // Print aliases in topological order, that is, for each alias a = b, @@ -2502,6 +2517,9 @@ const MCExpr *AsmPrinter::lowerConstant(const Constant *CV) { if (const auto *Equiv = dyn_cast<DSOLocalEquivalent>(CV)) return getObjFileLowering().lowerDSOLocalEquivalent(Equiv, TM); + if (const NoCFIValue *NC = dyn_cast<NoCFIValue>(CV)) + return MCSymbolRefExpr::create(getSymbol(NC->getGlobalValue()), Ctx); + const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV); if (!CE) { llvm_unreachable("Unknown constant value to lower!"); diff --git a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp index 85ff84484ced..d621108408f0 100644 --- a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp @@ -611,8 +611,8 @@ static SourceLanguage MapDWLangToCVLang(unsigned DWLang) { void CodeViewDebug::beginModule(Module *M) { // If module doesn't have named metadata anchors or COFF debug section // is not available, skip any debug info related stuff. 
- if (!M->getNamedMetadata("llvm.dbg.cu") || - !Asm->getObjFileLowering().getCOFFDebugSymbolsSection()) { + NamedMDNode *CUs = M->getNamedMetadata("llvm.dbg.cu"); + if (!CUs || !Asm->getObjFileLowering().getCOFFDebugSymbolsSection()) { Asm = nullptr; return; } @@ -622,7 +622,6 @@ void CodeViewDebug::beginModule(Module *M) { TheCPU = mapArchToCVCPUType(Triple(M->getTargetTriple()).getArch()); // Get the current source language. - NamedMDNode *CUs = MMI->getModule()->getNamedMetadata("llvm.dbg.cu"); const MDNode *Node = *CUs->operands().begin(); const auto *CU = cast<DICompileUnit>(Node); @@ -650,6 +649,7 @@ void CodeViewDebug::endModule() { switchToDebugSectionForSymbol(nullptr); MCSymbol *CompilerInfo = beginCVSubsection(DebugSubsectionKind::Symbols); + emitObjName(); emitCompilerInformation(); endCVSubsection(CompilerInfo); @@ -785,6 +785,29 @@ void CodeViewDebug::emitTypeGlobalHashes() { } } +void CodeViewDebug::emitObjName() { + MCSymbol *CompilerEnd = beginSymbolRecord(SymbolKind::S_OBJNAME); + + StringRef PathRef(Asm->TM.Options.ObjectFilenameForDebug); + llvm::SmallString<256> PathStore(PathRef); + + if (PathRef.empty() || PathRef == "-") { + // Don't emit the filename if we're writing to stdout or to /dev/null. 
+ PathRef = {}; + } else { + llvm::sys::path::remove_dots(PathStore, /*remove_dot_dot=*/true); + PathRef = PathStore; + } + + OS.AddComment("Signature"); + OS.emitIntValue(0, 4); + + OS.AddComment("Object name"); + emitNullTerminatedSymbolName(OS, PathRef); + + endSymbolRecord(CompilerEnd); +} + namespace { struct Version { int Part[4]; diff --git a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h index 6f88e15ee8fe..d1fc3cdccb20 100644 --- a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h +++ b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h @@ -302,6 +302,8 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase { void emitTypeGlobalHashes(); + void emitObjName(); + void emitCompilerInformation(); void emitBuildInfo(); diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index 0d2736178f0f..9b73f0ab2f05 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -779,7 +779,7 @@ DIE *DwarfCompileUnit::constructVariableDIEImpl(const DbgVariable &DV, const TargetRegisterInfo &TRI = *Asm->MF->getSubtarget().getRegisterInfo(); auto AddEntry = [&](const DbgValueLocEntry &Entry, - DIExpressionCursor &Cursor) { + DIExpressionCursor &Cursor) { if (Entry.isLocation()) { if (!DwarfExpr.addMachineRegExpression(TRI, Cursor, Entry.getLoc().getReg())) @@ -788,11 +788,19 @@ DIE *DwarfCompileUnit::constructVariableDIEImpl(const DbgVariable &DV, // If there is an expression, emit raw unsigned bytes. DwarfExpr.addUnsignedConstant(Entry.getInt()); } else if (Entry.isConstantFP()) { + // DwarfExpression does not support arguments wider than 64 bits + // (see PR52584). + // TODO: Consider chunking expressions containing overly wide + // arguments into separate pointer-sized fragment expressions. 
APInt RawBytes = Entry.getConstantFP()->getValueAPF().bitcastToAPInt(); - DwarfExpr.addUnsignedConstant(RawBytes); + if (RawBytes.getBitWidth() > 64) + return false; + DwarfExpr.addUnsignedConstant(RawBytes.getZExtValue()); } else if (Entry.isConstantInt()) { APInt RawBytes = Entry.getConstantInt()->getValue(); - DwarfExpr.addUnsignedConstant(RawBytes); + if (RawBytes.getBitWidth() > 64) + return false; + DwarfExpr.addUnsignedConstant(RawBytes.getZExtValue()); } else if (Entry.isTargetIndexLocation()) { TargetIndexLocation Loc = Entry.getTargetIndexLocation(); // TODO TargetIndexLocation is a target-independent. Currently only the @@ -805,11 +813,12 @@ DIE *DwarfCompileUnit::constructVariableDIEImpl(const DbgVariable &DV, return true; }; - DwarfExpr.addExpression( - std::move(Cursor), - [&](unsigned Idx, DIExpressionCursor &Cursor) -> bool { - return AddEntry(DVal->getLocEntries()[Idx], Cursor); - }); + if (!DwarfExpr.addExpression( + std::move(Cursor), + [&](unsigned Idx, DIExpressionCursor &Cursor) -> bool { + return AddEntry(DVal->getLocEntries()[Idx], Cursor); + })) + return VariableDie; // Now attach the location information to the DIE. addBlock(*VariableDie, dwarf::DW_AT_location, DwarfExpr.finalize()); diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 047676d4c11e..48134f1fd774 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -1224,17 +1224,15 @@ void DwarfDebug::beginModule(Module *M) { CU.getOrCreateGlobalVariableDIE(GV, sortGlobalExprs(GVMap[GV])); } - for (auto *Ty : CUNode->getEnumTypes()) { - // The enum types array by design contains pointers to - // MDNodes rather than DIRefs. Unique them here. + for (auto *Ty : CUNode->getEnumTypes()) CU.getOrCreateTypeDIE(cast<DIType>(Ty)); - } + for (auto *Ty : CUNode->getRetainedTypes()) { // The retained types array by design contains pointers to // MDNodes rather than DIRefs. Unique them here. 
if (DIType *RT = dyn_cast<DIType>(Ty)) - // There is no point in force-emitting a forward declaration. - CU.getOrCreateTypeDIE(RT); + // There is no point in force-emitting a forward declaration. + CU.getOrCreateTypeDIE(RT); } // Emit imported_modules last so that the relevant context is already // available. diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp index 6409c39e7849..37407c98e75f 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp @@ -463,15 +463,14 @@ static bool isMemoryLocation(DIExpressionCursor ExprCursor) { return true; } -void DwarfExpression::addExpression(DIExpressionCursor &&ExprCursor, - unsigned FragmentOffsetInBits) { +void DwarfExpression::addExpression(DIExpressionCursor &&ExprCursor) { addExpression(std::move(ExprCursor), [](unsigned Idx, DIExpressionCursor &Cursor) -> bool { llvm_unreachable("unhandled opcode found in expression"); }); } -void DwarfExpression::addExpression( +bool DwarfExpression::addExpression( DIExpressionCursor &&ExprCursor, llvm::function_ref<bool(unsigned, DIExpressionCursor &)> InsertArg) { // Entry values can currently only cover the initial register location, @@ -496,7 +495,7 @@ void DwarfExpression::addExpression( case dwarf::DW_OP_LLVM_arg: if (!InsertArg(Op->getArg(0), ExprCursor)) { LocationKind = Unknown; - return; + return false; } break; case dwarf::DW_OP_LLVM_fragment: { @@ -527,7 +526,7 @@ void DwarfExpression::addExpression( setSubRegisterPiece(0, 0); // Reset the location description kind. LocationKind = Unknown; - return; + return true; } case dwarf::DW_OP_plus_uconst: assert(!isRegisterLocation()); @@ -630,6 +629,8 @@ void DwarfExpression::addExpression( if (isImplicitLocation() && !isParameterValue()) // Turn this into an implicit location description. addStackValue(); + + return true; } /// add masking operations to stencil out a subregister. 
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h index 513e9072309e..e605fe2f7d39 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h @@ -340,16 +340,17 @@ public: /// create one if necessary. unsigned getOrCreateBaseType(unsigned BitSize, dwarf::TypeKind Encoding); + /// Emit all remaining operations in the DIExpressionCursor. The + /// cursor must not contain any DW_OP_LLVM_arg operations. + void addExpression(DIExpressionCursor &&Expr); + /// Emit all remaining operations in the DIExpressionCursor. - /// - /// \param FragmentOffsetInBits If this is one fragment out of multiple - /// locations, this is the offset of the - /// fragment inside the entire variable. - void addExpression(DIExpressionCursor &&Expr, - unsigned FragmentOffsetInBits = 0); - void - addExpression(DIExpressionCursor &&Expr, - llvm::function_ref<bool(unsigned, DIExpressionCursor &)> InsertArg); + /// DW_OP_LLVM_arg operations are resolved by calling (\p InsertArg). + // + /// \return false if any call to (\p InsertArg) returns false. + bool addExpression( + DIExpressionCursor &&Expr, + llvm::function_ref<bool(unsigned, DIExpressionCursor &)> InsertArg); /// If applicable, emit an empty DW_OP_piece / DW_OP_bit_piece to advance to /// the fragment described by \c Expr. 
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index 976e35905144..6b6d63f14f87 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -536,6 +536,18 @@ void DwarfUnit::addThrownTypes(DIE &Die, DINodeArray ThrownTypes) { } } +void DwarfUnit::addAccess(DIE &Die, DINode::DIFlags Flags) { + if ((Flags & DINode::FlagAccessibility) == DINode::FlagProtected) + addUInt(Die, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1, + dwarf::DW_ACCESS_protected); + else if ((Flags & DINode::FlagAccessibility) == DINode::FlagPrivate) + addUInt(Die, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1, + dwarf::DW_ACCESS_private); + else if ((Flags & DINode::FlagAccessibility) == DINode::FlagPublic) + addUInt(Die, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1, + dwarf::DW_ACCESS_public); +} + DIE *DwarfUnit::getOrCreateContextDIE(const DIScope *Context) { if (!Context || isa<DIFile>(Context)) return &getUnitDie(); @@ -842,13 +854,17 @@ void DwarfUnit::addAnnotation(DIE &Buffer, DINodeArray Annotations) { for (const Metadata *Annotation : Annotations->operands()) { const MDNode *MD = cast<MDNode>(Annotation); const MDString *Name = cast<MDString>(MD->getOperand(0)); - - // Currently, only MDString is supported with btf_decl_tag attribute. 
- const MDString *Value = cast<MDString>(MD->getOperand(1)); + const auto &Value = MD->getOperand(1); DIE &AnnotationDie = createAndAddDIE(dwarf::DW_TAG_LLVM_annotation, Buffer); addString(AnnotationDie, dwarf::DW_AT_name, Name->getString()); - addString(AnnotationDie, dwarf::DW_AT_const_value, Value->getString()); + if (const auto *Data = dyn_cast<MDString>(Value)) + addString(AnnotationDie, dwarf::DW_AT_const_value, Data->getString()); + else if (const auto *Data = dyn_cast<ConstantAsMetadata>(Value)) + addConstantValue(AnnotationDie, Data->getValue()->getUniqueInteger(), + /*Unsigned=*/true); + else + assert(false && "Unsupported annotation value type"); } } @@ -1007,6 +1023,9 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DICompositeType *CTy) { if (CTy->isForwardDecl()) addFlag(Buffer, dwarf::DW_AT_declaration); + // Add accessibility info if available. + addAccess(Buffer, CTy->getFlags()); + // Add source line info if available. if (!CTy->isForwardDecl()) addSourceLine(Buffer, CTy); @@ -1308,15 +1327,7 @@ void DwarfUnit::applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie, if (SP->isNoReturn()) addFlag(SPDie, dwarf::DW_AT_noreturn); - if (SP->isProtected()) - addUInt(SPDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1, - dwarf::DW_ACCESS_protected); - else if (SP->isPrivate()) - addUInt(SPDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1, - dwarf::DW_ACCESS_private); - else if (SP->isPublic()) - addUInt(SPDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1, - dwarf::DW_ACCESS_public); + addAccess(SPDie, SP->getFlags()); if (SP->isExplicit()) addFlag(SPDie, dwarf::DW_AT_explicit); @@ -1666,16 +1677,8 @@ DIE &DwarfUnit::constructMemberDIE(DIE &Buffer, const DIDerivedType *DT) { } } - if (DT->isProtected()) - addUInt(MemberDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1, - dwarf::DW_ACCESS_protected); - else if (DT->isPrivate()) - addUInt(MemberDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1, - dwarf::DW_ACCESS_private); - 
// Otherwise C++ member and base classes are considered public. - else if (DT->isPublic()) - addUInt(MemberDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1, - dwarf::DW_ACCESS_public); + addAccess(MemberDie, DT->getFlags()); + if (DT->isVirtual()) addUInt(MemberDie, dwarf::DW_AT_virtuality, dwarf::DW_FORM_data1, dwarf::DW_VIRTUALITY_virtual); @@ -1717,15 +1720,7 @@ DIE *DwarfUnit::getOrCreateStaticMemberDIE(const DIDerivedType *DT) { // FIXME: We could omit private if the parent is a class_type, and // public if the parent is something else. - if (DT->isProtected()) - addUInt(StaticMemberDIE, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1, - dwarf::DW_ACCESS_protected); - else if (DT->isPrivate()) - addUInt(StaticMemberDIE, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1, - dwarf::DW_ACCESS_private); - else if (DT->isPublic()) - addUInt(StaticMemberDIE, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1, - dwarf::DW_ACCESS_public); + addAccess(StaticMemberDIE, DT->getFlags()); if (const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(DT->getConstant())) addConstantValue(StaticMemberDIE, CI, Ty); diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h index 8140279adaef..54b0079dd7ce 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h @@ -226,6 +226,9 @@ public: /// Add thrown types. void addThrownTypes(DIE &Die, DINodeArray ThrownTypes); + /// Add the accessibility attribute. + void addAccess(DIE &Die, DINode::DIFlags Flags); + /// Add a new type attribute to the specified entity. 
/// /// This takes and attribute parameter because DW_AT_friend attributes are diff --git a/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp index a9fb31d42679..3ade262d9af2 100644 --- a/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp @@ -112,16 +112,12 @@ void OcamlGCMetadataPrinter::finishAssembly(Module &M, GCModuleInfo &Info, EmitCamlGlobal(M, AP, "frametable"); int NumDescriptors = 0; - for (GCModuleInfo::FuncInfoVec::iterator I = Info.funcinfo_begin(), - IE = Info.funcinfo_end(); - I != IE; ++I) { - GCFunctionInfo &FI = **I; - if (FI.getStrategy().getName() != getStrategy().getName()) + for (std::unique_ptr<GCFunctionInfo> &FI : + llvm::make_range(Info.funcinfo_begin(), Info.funcinfo_end())) { + if (FI->getStrategy().getName() != getStrategy().getName()) // this function is managed by some other GC continue; - for (GCFunctionInfo::iterator J = FI.begin(), JE = FI.end(); J != JE; ++J) { - NumDescriptors++; - } + NumDescriptors += FI->size(); } if (NumDescriptors >= 1 << 16) { @@ -131,35 +127,34 @@ void OcamlGCMetadataPrinter::finishAssembly(Module &M, GCModuleInfo &Info, AP.emitInt16(NumDescriptors); AP.emitAlignment(IntPtrSize == 4 ? Align(4) : Align(8)); - for (GCModuleInfo::FuncInfoVec::iterator I = Info.funcinfo_begin(), - IE = Info.funcinfo_end(); - I != IE; ++I) { - GCFunctionInfo &FI = **I; - if (FI.getStrategy().getName() != getStrategy().getName()) + for (std::unique_ptr<GCFunctionInfo> &FI : + llvm::make_range(Info.funcinfo_begin(), Info.funcinfo_end())) { + if (FI->getStrategy().getName() != getStrategy().getName()) // this function is managed by some other GC continue; - uint64_t FrameSize = FI.getFrameSize(); + uint64_t FrameSize = FI->getFrameSize(); if (FrameSize >= 1 << 16) { // Very rude! 
- report_fatal_error("Function '" + FI.getFunction().getName() + + report_fatal_error("Function '" + FI->getFunction().getName() + "' is too large for the ocaml GC! " "Frame size " + Twine(FrameSize) + ">= 65536.\n" "(" + - Twine(reinterpret_cast<uintptr_t>(&FI)) + ")"); + Twine(reinterpret_cast<uintptr_t>(FI.get())) + ")"); } AP.OutStreamer->AddComment("live roots for " + - Twine(FI.getFunction().getName())); + Twine(FI->getFunction().getName())); AP.OutStreamer->AddBlankLine(); - for (GCFunctionInfo::iterator J = FI.begin(), JE = FI.end(); J != JE; ++J) { - size_t LiveCount = FI.live_size(J); + for (GCFunctionInfo::iterator J = FI->begin(), JE = FI->end(); J != JE; + ++J) { + size_t LiveCount = FI->live_size(J); if (LiveCount >= 1 << 16) { // Very rude! - report_fatal_error("Function '" + FI.getFunction().getName() + + report_fatal_error("Function '" + FI->getFunction().getName() + "' is too large for the ocaml GC! " "Live root count " + Twine(LiveCount) + " >= 65536."); @@ -169,8 +164,8 @@ void OcamlGCMetadataPrinter::finishAssembly(Module &M, GCModuleInfo &Info, AP.emitInt16(FrameSize); AP.emitInt16(LiveCount); - for (GCFunctionInfo::live_iterator K = FI.live_begin(J), - KE = FI.live_end(J); + for (GCFunctionInfo::live_iterator K = FI->live_begin(J), + KE = FI->live_end(J); K != KE; ++K) { if (K->StackOffset >= 1 << 16) { // Very rude! 
diff --git a/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.cpp index 9e6f1a537de3..bab187f46535 100644 --- a/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.cpp @@ -47,7 +47,6 @@ void PseudoProbeHandler::emitPseudoProbe(uint64_t Guid, uint64_t Index, InlinedAt = InlinedAt->getInlinedAt(); } - SmallVector<InlineSite, 8> InlineStack(ReversedInlineStack.rbegin(), - ReversedInlineStack.rend()); + SmallVector<InlineSite, 8> InlineStack(llvm::reverse(ReversedInlineStack)); Asm->OutStreamer->emitPseudoProbe(Guid, Index, Type, Attr, InlineStack); } diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp index 64dadc82b48b..0ff67f7ca00a 100644 --- a/llvm/lib/CodeGen/BranchFolding.cpp +++ b/llvm/lib/CodeGen/BranchFolding.cpp @@ -1125,8 +1125,8 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) { // If this is a large problem, avoid visiting the same basic blocks multiple // times. 
if (MergePotentials.size() == TailMergeThreshold) - for (unsigned i = 0, e = MergePotentials.size(); i != e; ++i) - TriedMerging.insert(MergePotentials[i].getBlock()); + for (MergePotentialsElt &Elt : MergePotentials) + TriedMerging.insert(Elt.getBlock()); if (MergePotentials.size() >= 2) MadeChange |= TryTailMergeBlocks(IBB, PredBB, MinCommonTailLength); diff --git a/llvm/lib/CodeGen/CalcSpillWeights.cpp b/llvm/lib/CodeGen/CalcSpillWeights.cpp index 863a0e1e0b56..5f9982cd155d 100644 --- a/llvm/lib/CodeGen/CalcSpillWeights.cpp +++ b/llvm/lib/CodeGen/CalcSpillWeights.cpp @@ -15,13 +15,13 @@ #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/StackMaps.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/CodeGen/VirtRegMap.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/CodeGen/StackMaps.h" #include <cassert> #include <tuple> @@ -35,7 +35,7 @@ void VirtRegAuxInfo::calculateSpillWeightsAndHints() { MachineRegisterInfo &MRI = MF.getRegInfo(); for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { - unsigned Reg = Register::index2VirtReg(I); + Register Reg = Register::index2VirtReg(I); if (MRI.reg_nodbg_empty(Reg)) continue; calculateSpillWeightAndHint(LIS.getInterval(Reg)); @@ -64,14 +64,14 @@ static Register copyHint(const MachineInstr *MI, unsigned Reg, if (Register::isVirtualRegister(HReg)) return Sub == HSub ? HReg : Register(); - const TargetRegisterClass *rc = MRI.getRegClass(Reg); + const TargetRegisterClass *RC = MRI.getRegClass(Reg); MCRegister CopiedPReg = HSub ? TRI.getSubReg(HReg, HSub) : HReg.asMCReg(); - if (rc->contains(CopiedPReg)) + if (RC->contains(CopiedPReg)) return CopiedPReg; // Check if reg:sub matches so that a super register could be hinted. 
if (Sub) - return TRI.getMatchingSuperReg(CopiedPReg, Sub, rc); + return TRI.getMatchingSuperReg(CopiedPReg, Sub, RC); return 0; } @@ -80,8 +80,8 @@ static Register copyHint(const MachineInstr *MI, unsigned Reg, static bool isRematerializable(const LiveInterval &LI, const LiveIntervals &LIS, const VirtRegMap &VRM, const TargetInstrInfo &TII) { - unsigned Reg = LI.reg(); - unsigned Original = VRM.getOriginal(Reg); + Register Reg = LI.reg(); + Register Original = VRM.getOriginal(Reg); for (LiveInterval::const_vni_iterator I = LI.vni_begin(), E = LI.vni_end(); I != E; ++I) { const VNInfo *VNI = *I; @@ -183,8 +183,8 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &LI, SlotIndex *Start, bool ShouldUpdateLI = !IsLocalSplitArtifact; if (IsLocalSplitArtifact) { - MachineBasicBlock *localMBB = LIS.getMBBFromIndex(*End); - assert(localMBB == LIS.getMBBFromIndex(*Start) && + MachineBasicBlock *LocalMBB = LIS.getMBBFromIndex(*End); + assert(LocalMBB == LIS.getMBBFromIndex(*Start) && "start and end are expected to be in the same basic block"); // Local split artifact will have 2 additional copy instructions and they @@ -192,8 +192,8 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &LI, SlotIndex *Start, // localLI = COPY other // ... 
// other = COPY localLI - TotalWeight += LiveIntervals::getSpillWeight(true, false, &MBFI, localMBB); - TotalWeight += LiveIntervals::getSpillWeight(false, true, &MBFI, localMBB); + TotalWeight += LiveIntervals::getSpillWeight(true, false, &MBFI, LocalMBB); + TotalWeight += LiveIntervals::getSpillWeight(false, true, &MBFI, LocalMBB); NumInstr += 2; } diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp index bbdd8aab502e..7c236a9785d8 100644 --- a/llvm/lib/CodeGen/CodeGen.cpp +++ b/llvm/lib/CodeGen/CodeGen.cpp @@ -68,6 +68,8 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeMachineCSEPass(Registry); initializeMachineCombinerPass(Registry); initializeMachineCopyPropagationPass(Registry); + initializeMachineCycleInfoPrinterPassPass(Registry); + initializeMachineCycleInfoWrapperPassPass(Registry); initializeMachineDominatorTreePass(Registry); initializeMachineFunctionPrinterPassPass(Registry); initializeMachineLICMPass(Registry); diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index ac4180c4c3ab..747f4e4fdecc 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -4831,9 +4831,7 @@ static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal, TargetLowering::AsmOperandInfoVector TargetConstraints = TLI.ParseConstraints(F->getParent()->getDataLayout(), &TRI, *CI); - for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) { - TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i]; - + for (TargetLowering::AsmOperandInfo &OpInfo : TargetConstraints) { // Compute the constraint code and ConstraintType to use. 
TLI.ComputeConstraintToUse(OpInfo, SDValue()); @@ -5617,9 +5615,7 @@ bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) { TargetLowering::AsmOperandInfoVector TargetConstraints = TLI->ParseConstraints(*DL, TRI, *CS); unsigned ArgNo = 0; - for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) { - TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i]; - + for (TargetLowering::AsmOperandInfo &OpInfo : TargetConstraints) { // Compute the constraint code and ConstraintType to use. TLI->ComputeConstraintToUse(OpInfo, SDValue()); @@ -6856,8 +6852,7 @@ bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) { // Use reverse iterator because later select may use the value of the // earlier select, and we need to propagate value through earlier select // to get the PHI operand. - for (auto It = ASI.rbegin(); It != ASI.rend(); ++It) { - SelectInst *SI = *It; + for (SelectInst *SI : llvm::reverse(ASI)) { // The select itself is replaced with a PHI Node. PHINode *PN = PHINode::Create(SI->getType(), 2, "", &EndBlock->front()); PN->takeName(SI); diff --git a/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp b/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp index 4e98d49206b5..901409ea9f8f 100644 --- a/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp +++ b/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp @@ -405,8 +405,7 @@ findSuitableFreeRegister(RegRefIter RegRefBegin, const TargetRegisterClass *RC, SmallVectorImpl<unsigned> &Forbid) { ArrayRef<MCPhysReg> Order = RegClassInfo.getOrder(RC); - for (unsigned i = 0; i != Order.size(); ++i) { - unsigned NewReg = Order[i]; + for (unsigned NewReg : Order) { // Don't replace a register with itself. 
if (NewReg == AntiDepReg) continue; // Don't replace a register with one that was recently used to repair diff --git a/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp b/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp index 0bb186a02416..5579152f1ce0 100644 --- a/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp +++ b/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp @@ -142,9 +142,9 @@ bool DeadMachineInstructionElim::eliminateDeadMI(MachineFunction &MF) { if (isDead(&MI)) { LLVM_DEBUG(dbgs() << "DeadMachineInstructionElim: DELETING: " << MI); // It is possible that some DBG_VALUE instructions refer to this - // instruction. They get marked as undef and will be deleted - // in the live debug variable analysis. - MI.eraseFromParentAndMarkDBGValuesForRemoval(); + // instruction. They will be deleted in the live debug variable + // analysis. + MI.eraseFromParent(); AnyChanges = true; ++NumDeletes; continue; diff --git a/llvm/lib/CodeGen/EarlyIfConversion.cpp b/llvm/lib/CodeGen/EarlyIfConversion.cpp index 90883212a275..0b5469b02637 100644 --- a/llvm/lib/CodeGen/EarlyIfConversion.cpp +++ b/llvm/lib/CodeGen/EarlyIfConversion.cpp @@ -210,9 +210,9 @@ bool SSAIfConv::canSpeculateInstrs(MachineBasicBlock *MBB) { // Check all instructions, except the terminators. It is assumed that // terminators never have side effects or define any used register values. - for (MachineBasicBlock::iterator I = MBB->begin(), - E = MBB->getFirstTerminator(); I != E; ++I) { - if (I->isDebugInstr()) + for (MachineInstr &MI : + llvm::make_range(MBB->begin(), MBB->getFirstTerminator())) { + if (MI.isDebugInstr()) continue; if (++InstrCount > BlockInstrLimit && !Stress) { @@ -222,28 +222,28 @@ bool SSAIfConv::canSpeculateInstrs(MachineBasicBlock *MBB) { } // There shouldn't normally be any phis in a single-predecessor block. - if (I->isPHI()) { - LLVM_DEBUG(dbgs() << "Can't hoist: " << *I); + if (MI.isPHI()) { + LLVM_DEBUG(dbgs() << "Can't hoist: " << MI); return false; } // Don't speculate loads. 
Note that it may be possible and desirable to // speculate GOT or constant pool loads that are guaranteed not to trap, // but we don't support that for now. - if (I->mayLoad()) { - LLVM_DEBUG(dbgs() << "Won't speculate load: " << *I); + if (MI.mayLoad()) { + LLVM_DEBUG(dbgs() << "Won't speculate load: " << MI); return false; } // We never speculate stores, so an AA pointer isn't necessary. bool DontMoveAcrossStore = true; - if (!I->isSafeToMove(nullptr, DontMoveAcrossStore)) { - LLVM_DEBUG(dbgs() << "Can't speculate: " << *I); + if (!MI.isSafeToMove(nullptr, DontMoveAcrossStore)) { + LLVM_DEBUG(dbgs() << "Can't speculate: " << MI); return false; } // Check for any dependencies on Head instructions. - if (!InstrDependenciesAllowIfConv(&(*I))) + if (!InstrDependenciesAllowIfConv(&MI)) return false; } return true; diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp index 17094a8e44f8..d061664e8c5d 100644 --- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp @@ -256,7 +256,7 @@ mergeVectorRegsToResultRegs(MachineIRBuilder &B, ArrayRef<Register> DstRegs, LLT PartLLT = MRI.getType(SrcRegs[0]); // Deal with v3s16 split into v2s16 - LLT LCMTy = getLCMType(LLTy, PartLLT); + LLT LCMTy = getCoverTy(LLTy, PartLLT); if (LCMTy == LLTy) { // Common case where no padding is needed. assert(DstRegs.size() == 1); @@ -267,21 +267,9 @@ mergeVectorRegsToResultRegs(MachineIRBuilder &B, ArrayRef<Register> DstRegs, // widening the original value. Register UnmergeSrcReg; if (LCMTy != PartLLT) { - // e.g. 
A <3 x s16> value was split to <2 x s16> - // %register_value0:_(<2 x s16>) - // %register_value1:_(<2 x s16>) - // %undef:_(<2 x s16>) = G_IMPLICIT_DEF - // %concat:_<6 x s16>) = G_CONCAT_VECTORS %reg_value0, %reg_value1, %undef - // %dst_reg:_(<3 x s16>), %dead:_(<3 x s16>) = G_UNMERGE_VALUES %concat - const int NumWide = LCMTy.getSizeInBits() / PartLLT.getSizeInBits(); - Register Undef = B.buildUndef(PartLLT).getReg(0); - - // Build vector of undefs. - SmallVector<Register, 8> WidenedSrcs(NumWide, Undef); - - // Replace the first sources with the real registers. - std::copy(SrcRegs.begin(), SrcRegs.end(), WidenedSrcs.begin()); - UnmergeSrcReg = B.buildConcatVectors(LCMTy, WidenedSrcs).getReg(0); + assert(DstRegs.size() == 1); + return B.buildDeleteTrailingVectorElements(DstRegs[0], + B.buildMerge(LCMTy, SrcRegs)); } else { // We don't need to widen anything if we're extracting a scalar which was // promoted to a vector e.g. s8 -> v4s8 -> s8 @@ -298,6 +286,8 @@ mergeVectorRegsToResultRegs(MachineIRBuilder &B, ArrayRef<Register> DstRegs, for (int I = DstRegs.size(); I != NumDst; ++I) PadDstRegs[I] = MRI.createGenericVirtualRegister(LLTy); + if (PadDstRegs.size() == 1) + return B.buildDeleteTrailingVectorElements(DstRegs[0], UnmergeSrcReg); return B.buildUnmerge(PadDstRegs, UnmergeSrcReg); } @@ -485,7 +475,7 @@ static void buildCopyToRegs(MachineIRBuilder &B, ArrayRef<Register> DstRegs, MachineRegisterInfo &MRI = *B.getMRI(); LLT DstTy = MRI.getType(DstRegs[0]); - LLT LCMTy = getLCMType(SrcTy, PartTy); + LLT LCMTy = getCoverTy(SrcTy, PartTy); const unsigned DstSize = DstTy.getSizeInBits(); const unsigned SrcSize = SrcTy.getSizeInBits(); @@ -493,7 +483,7 @@ static void buildCopyToRegs(MachineIRBuilder &B, ArrayRef<Register> DstRegs, Register UnmergeSrc = SrcReg; - if (CoveringSize != SrcSize) { + if (!LCMTy.isVector() && CoveringSize != SrcSize) { // For scalars, it's common to be able to use a simple extension. 
if (SrcTy.isScalar() && DstTy.isScalar()) { CoveringSize = alignTo(SrcSize, DstSize); @@ -510,14 +500,10 @@ static void buildCopyToRegs(MachineIRBuilder &B, ArrayRef<Register> DstRegs, } } - // Unmerge to the original registers and pad with dead defs. - SmallVector<Register, 8> UnmergeResults(DstRegs.begin(), DstRegs.end()); - for (unsigned Size = DstSize * DstRegs.size(); Size != CoveringSize; - Size += DstSize) { - UnmergeResults.push_back(MRI.createGenericVirtualRegister(DstTy)); - } + if (LCMTy.isVector() && CoveringSize != SrcSize) + UnmergeSrc = B.buildPadVectorWithUndefElements(LCMTy, SrcReg).getReg(0); - B.buildUnmerge(UnmergeResults, UnmergeSrc); + B.buildUnmerge(DstRegs, UnmergeSrc); } bool CallLowering::determineAndHandleAssignments( diff --git a/llvm/lib/CodeGen/GlobalISel/Combiner.cpp b/llvm/lib/CodeGen/GlobalISel/Combiner.cpp index 381c6df5c97a..dd1ef74e8ad0 100644 --- a/llvm/lib/CodeGen/GlobalISel/Combiner.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Combiner.cpp @@ -135,7 +135,7 @@ bool Combiner::combineMachineInstrs(MachineFunction &MF, // Erase dead insts before even adding to the list. if (isTriviallyDead(CurMI, *MRI)) { LLVM_DEBUG(dbgs() << CurMI << "Is dead; erasing.\n"); - CurMI.eraseFromParentAndMarkDBGValuesForRemoval(); + CurMI.eraseFromParent(); continue; } WorkList.deferred_insert(&CurMI); diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 755b3b844570..f7a634dad61a 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -1551,8 +1551,8 @@ void CombinerHelper::applyShiftOfShiftedLogic(MachineInstr &MI, Builder.buildInstr(MatchInfo.Logic->getOpcode(), {Dest}, {Shift1, Shift2}); // These were one use so it's safe to remove them. 
- MatchInfo.Shift2->eraseFromParentAndMarkDBGValuesForRemoval(); - MatchInfo.Logic->eraseFromParentAndMarkDBGValuesForRemoval(); + MatchInfo.Shift2->eraseFromParent(); + MatchInfo.Logic->eraseFromParent(); MI.eraseFromParent(); } diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 87cc60d51bc2..6d415c9c7f90 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -338,9 +338,10 @@ bool IRTranslator::translateCompare(const User &U, MIRBuilder.buildCopy( Res, getOrCreateVReg(*Constant::getAllOnesValue(U.getType()))); else { - assert(CI && "Instruction should be CmpInst"); - MIRBuilder.buildFCmp(Pred, Res, Op0, Op1, - MachineInstr::copyFlagsFromInstruction(*CI)); + uint16_t Flags = 0; + if (CI) + Flags = MachineInstr::copyFlagsFromInstruction(*CI); + MIRBuilder.buildFCmp(Pred, Res, Op0, Op1, Flags); } return true; @@ -3502,7 +3503,7 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) { // Get rid of the now empty basic block. EntryBB->removeSuccessor(&NewEntryBB); MF->remove(EntryBB); - MF->DeleteMachineBasicBlock(EntryBB); + MF->deleteMachineBasicBlock(EntryBB); assert(&MF->front() == &NewEntryBB && "New entry wasn't next in the list of basic block!"); diff --git a/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp b/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp index 9b2692486384..b10c9272a508 100644 --- a/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp +++ b/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp @@ -163,7 +163,7 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) { // If so, erase it. 
if (isTriviallyDead(MI, MRI)) { LLVM_DEBUG(dbgs() << "Is dead; erasing.\n"); - MI.eraseFromParentAndMarkDBGValuesForRemoval(); + MI.eraseFromParent(); continue; } @@ -255,8 +255,12 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) { MachineInstr *MI = nullptr; if (!MRI.def_empty(VReg)) MI = &*MRI.def_instr_begin(VReg); - else if (!MRI.use_empty(VReg)) + else if (!MRI.use_empty(VReg)) { MI = &*MRI.use_instr_begin(VReg); + // Debug value instruction is permitted to use undefined vregs. + if (MI->isDebugValue()) + continue; + } if (!MI) continue; diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index e09cd26eb0c1..e8a8efd5dad4 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -176,16 +176,18 @@ bool LegalizerHelper::extractParts(Register Reg, LLT RegTy, return true; } + // Perform irregular split. Leftover is last element of RegPieces. if (MainTy.isVector()) { - unsigned EltSize = MainTy.getScalarSizeInBits(); - if (LeftoverSize % EltSize != 0) - return false; - LeftoverTy = LLT::scalarOrVector( - ElementCount::getFixed(LeftoverSize / EltSize), EltSize); - } else { - LeftoverTy = LLT::scalar(LeftoverSize); + SmallVector<Register, 8> RegPieces; + extractVectorParts(Reg, MainTy.getNumElements(), RegPieces); + for (unsigned i = 0; i < RegPieces.size() - 1; ++i) + VRegs.push_back(RegPieces[i]); + LeftoverRegs.push_back(RegPieces[RegPieces.size() - 1]); + LeftoverTy = MRI.getType(LeftoverRegs[0]); + return true; } + LeftoverTy = LLT::scalar(LeftoverSize); // For irregular sizes, extract the individual parts. 
for (unsigned I = 0; I != NumParts; ++I) { Register NewReg = MRI.createGenericVirtualRegister(MainTy); @@ -203,6 +205,44 @@ bool LegalizerHelper::extractParts(Register Reg, LLT RegTy, return true; } +void LegalizerHelper::extractVectorParts(Register Reg, unsigned NumElts, + SmallVectorImpl<Register> &VRegs) { + LLT RegTy = MRI.getType(Reg); + assert(RegTy.isVector() && "Expected a vector type"); + + LLT EltTy = RegTy.getElementType(); + LLT NarrowTy = (NumElts == 1) ? EltTy : LLT::fixed_vector(NumElts, EltTy); + unsigned RegNumElts = RegTy.getNumElements(); + unsigned LeftoverNumElts = RegNumElts % NumElts; + unsigned NumNarrowTyPieces = RegNumElts / NumElts; + + // Perfect split without leftover + if (LeftoverNumElts == 0) + return extractParts(Reg, NarrowTy, NumNarrowTyPieces, VRegs); + + // Irregular split. Provide direct access to all elements for artifact + // combiner using unmerge to elements. Then build vectors with NumElts + // elements. Remaining element(s) will be (used to build vector) Leftover. + SmallVector<Register, 8> Elts; + extractParts(Reg, EltTy, RegNumElts, Elts); + + unsigned Offset = 0; + // Requested sub-vectors of NarrowTy. + for (unsigned i = 0; i < NumNarrowTyPieces; ++i, Offset += NumElts) { + ArrayRef<Register> Pieces(&Elts[Offset], NumElts); + VRegs.push_back(MIRBuilder.buildMerge(NarrowTy, Pieces).getReg(0)); + } + + // Leftover element(s). + if (LeftoverNumElts == 1) { + VRegs.push_back(Elts[Offset]); + } else { + LLT LeftoverTy = LLT::fixed_vector(LeftoverNumElts, EltTy); + ArrayRef<Register> Pieces(&Elts[Offset], LeftoverNumElts); + VRegs.push_back(MIRBuilder.buildMerge(LeftoverTy, Pieces).getReg(0)); + } +} + void LegalizerHelper::insertParts(Register DstReg, LLT ResultTy, LLT PartTy, ArrayRef<Register> PartRegs, @@ -223,6 +263,15 @@ void LegalizerHelper::insertParts(Register DstReg, return; } + // Merge sub-vectors with different number of elements and insert into DstReg. 
+ if (ResultTy.isVector()) { + assert(LeftoverRegs.size() == 1 && "Expected one leftover register"); + SmallVector<Register, 8> AllRegs; + for (auto Reg : concat<const Register>(PartRegs, LeftoverRegs)) + AllRegs.push_back(Reg); + return mergeMixedSubvectors(DstReg, AllRegs); + } + SmallVector<Register> GCDRegs; LLT GCDTy = getGCDType(getGCDType(ResultTy, LeftoverTy), PartTy); for (auto PartReg : concat<const Register>(PartRegs, LeftoverRegs)) @@ -231,6 +280,30 @@ void LegalizerHelper::insertParts(Register DstReg, buildWidenedRemergeToDst(DstReg, ResultLCMTy, GCDRegs); } +void LegalizerHelper::appendVectorElts(SmallVectorImpl<Register> &Elts, + Register Reg) { + LLT Ty = MRI.getType(Reg); + SmallVector<Register, 8> RegElts; + extractParts(Reg, Ty.getScalarType(), Ty.getNumElements(), RegElts); + Elts.append(RegElts); +} + +/// Merge \p PartRegs with different types into \p DstReg. +void LegalizerHelper::mergeMixedSubvectors(Register DstReg, + ArrayRef<Register> PartRegs) { + SmallVector<Register, 8> AllElts; + for (unsigned i = 0; i < PartRegs.size() - 1; ++i) + appendVectorElts(AllElts, PartRegs[i]); + + Register Leftover = PartRegs[PartRegs.size() - 1]; + if (MRI.getType(Leftover).isScalar()) + AllElts.push_back(Leftover); + else + appendVectorElts(AllElts, Leftover); + + MIRBuilder.buildMerge(DstReg, AllElts); +} + /// Append the result registers of G_UNMERGE_VALUES \p MI to \p Regs. 
static void getUnmergeResults(SmallVectorImpl<Register> &Regs, const MachineInstr &MI) { @@ -916,8 +989,26 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI, return Legalized; } - case TargetOpcode::G_FREEZE: - return reduceOperationWidth(MI, TypeIdx, NarrowTy); + case TargetOpcode::G_FREEZE: { + if (TypeIdx != 0) + return UnableToLegalize; + + LLT Ty = MRI.getType(MI.getOperand(0).getReg()); + // Should widen scalar first + if (Ty.getSizeInBits() % NarrowTy.getSizeInBits() != 0) + return UnableToLegalize; + + auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1).getReg()); + SmallVector<Register, 8> Parts; + for (unsigned i = 0; i < Unmerge->getNumDefs(); ++i) { + Parts.push_back( + MIRBuilder.buildFreeze(NarrowTy, Unmerge.getReg(i)).getReg(0)); + } + + MIRBuilder.buildMerge(MI.getOperand(0).getReg(), Parts); + MI.eraseFromParent(); + return Legalized; + } case TargetOpcode::G_ADD: case TargetOpcode::G_SUB: case TargetOpcode::G_SADDO: @@ -1372,37 +1463,17 @@ void LegalizerHelper::moreElementsVectorDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx) { MachineOperand &MO = MI.getOperand(OpIdx); MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt()); - MO.setReg(widenWithUnmerge(WideTy, MO.getReg())); + Register Dst = MO.getReg(); + Register DstExt = MRI.createGenericVirtualRegister(WideTy); + MO.setReg(DstExt); + MIRBuilder.buildDeleteTrailingVectorElements(Dst, DstExt); } void LegalizerHelper::moreElementsVectorSrc(MachineInstr &MI, LLT MoreTy, unsigned OpIdx) { MachineOperand &MO = MI.getOperand(OpIdx); - - LLT OldTy = MRI.getType(MO.getReg()); - unsigned OldElts = OldTy.getNumElements(); - unsigned NewElts = MoreTy.getNumElements(); - - unsigned NumParts = NewElts / OldElts; - - // Use concat_vectors if the result is a multiple of the number of elements. 
- if (NumParts * OldElts == NewElts) { - SmallVector<Register, 8> Parts; - Parts.push_back(MO.getReg()); - - Register ImpDef = MIRBuilder.buildUndef(OldTy).getReg(0); - for (unsigned I = 1; I != NumParts; ++I) - Parts.push_back(ImpDef); - - auto Concat = MIRBuilder.buildConcatVectors(MoreTy, Parts); - MO.setReg(Concat.getReg(0)); - return; - } - - Register MoreReg = MRI.createGenericVirtualRegister(MoreTy); - Register ImpDef = MIRBuilder.buildUndef(MoreTy).getReg(0); - MIRBuilder.buildInsert(MoreReg, ImpDef, MO.getReg(), 0); - MO.setReg(MoreReg); + SmallVector<Register, 8> Regs; + MO.setReg(MIRBuilder.buildPadVectorWithUndefElements(MoreTy, MO).getReg(0)); } void LegalizerHelper::bitcastSrc(MachineInstr &MI, LLT CastTy, unsigned OpIdx) { @@ -3558,20 +3629,83 @@ Register LegalizerHelper::getVectorElementPointer(Register VecPtr, LLT VecTy, return MIRBuilder.buildPtrAdd(PtrTy, VecPtr, Mul).getReg(0); } -LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorImplicitDef( - MachineInstr &MI, unsigned TypeIdx, LLT NarrowTy) { - Register DstReg = MI.getOperand(0).getReg(); - LLT DstTy = MRI.getType(DstReg); - LLT LCMTy = getLCMType(DstTy, NarrowTy); +#ifndef NDEBUG +/// Check that all vector operands have same number of elements. Other operands +/// should be listed in NonVecOp. 
+static bool hasSameNumEltsOnAllVectorOperands( + GenericMachineInstr &MI, MachineRegisterInfo &MRI, + std::initializer_list<unsigned> NonVecOpIndices) { + if (MI.getNumMemOperands() != 0) + return false; - unsigned NumParts = LCMTy.getSizeInBits() / NarrowTy.getSizeInBits(); + LLT VecTy = MRI.getType(MI.getReg(0)); + if (!VecTy.isVector()) + return false; + unsigned NumElts = VecTy.getNumElements(); - auto NewUndef = MIRBuilder.buildUndef(NarrowTy); - SmallVector<Register, 8> Parts(NumParts, NewUndef.getReg(0)); + for (unsigned OpIdx = 1; OpIdx < MI.getNumOperands(); ++OpIdx) { + MachineOperand &Op = MI.getOperand(OpIdx); + if (!Op.isReg()) { + if (!is_contained(NonVecOpIndices, OpIdx)) + return false; + continue; + } - buildWidenedRemergeToDst(DstReg, LCMTy, Parts); - MI.eraseFromParent(); - return Legalized; + LLT Ty = MRI.getType(Op.getReg()); + if (!Ty.isVector()) { + if (!is_contained(NonVecOpIndices, OpIdx)) + return false; + is_contained(NonVecOpIndices, OpIdx); + continue; + } + + if (Ty.getNumElements() != NumElts) + return false; + } + + return true; +} +#endif + +/// Fill \p DstOps with DstOps that have same number of elements combined as +/// the Ty. These DstOps have either scalar type when \p NumElts = 1 or are +/// vectors with \p NumElts elements. When Ty.getNumElements() is not multiple +/// of \p NumElts last DstOp (leftover) has fewer then \p NumElts elements. +static void makeDstOps(SmallVectorImpl<DstOp> &DstOps, LLT Ty, + unsigned NumElts) { + LLT LeftoverTy; + assert(Ty.isVector() && "Expected vector type"); + LLT EltTy = Ty.getElementType(); + LLT NarrowTy = (NumElts == 1) ? 
EltTy : LLT::fixed_vector(NumElts, EltTy); + int NumParts, NumLeftover; + std::tie(NumParts, NumLeftover) = + getNarrowTypeBreakDown(Ty, NarrowTy, LeftoverTy); + + assert(NumParts > 0 && "Error in getNarrowTypeBreakDown"); + for (int i = 0; i < NumParts; ++i) { + DstOps.push_back(NarrowTy); + } + + if (LeftoverTy.isValid()) { + assert(NumLeftover == 1 && "expected exactly one leftover"); + DstOps.push_back(LeftoverTy); + } +} + +/// Operand \p Op is used on \p N sub-instructions. Fill \p Ops with \p N SrcOps +/// made from \p Op depending on operand type. +static void broadcastSrcOp(SmallVectorImpl<SrcOp> &Ops, unsigned N, + MachineOperand &Op) { + for (unsigned i = 0; i < N; ++i) { + if (Op.isReg()) + Ops.push_back(Op.getReg()); + else if (Op.isImm()) + Ops.push_back(Op.getImm()); + else if (Op.isPredicate()) + Ops.push_back(static_cast<CmpInst::Predicate>(Op.getPredicate())); + else + llvm_unreachable("Unsupported type"); + } } // Handle splitting vector operations which need to have the same number of @@ -3588,335 +3722,116 @@ LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorImplicitDef( // s64 = G_SHL s64, s32 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorMultiEltType( - MachineInstr &MI, unsigned TypeIdx, LLT NarrowTyArg) { - if (TypeIdx != 0) - return UnableToLegalize; + GenericMachineInstr &MI, unsigned NumElts, + std::initializer_list<unsigned> NonVecOpIndices) { + assert(hasSameNumEltsOnAllVectorOperands(MI, MRI, NonVecOpIndices) && + "Non-compatible opcode or not specified non-vector operands"); + unsigned OrigNumElts = MRI.getType(MI.getReg(0)).getNumElements(); - const LLT NarrowTy0 = NarrowTyArg; - const Register DstReg = MI.getOperand(0).getReg(); - LLT DstTy = MRI.getType(DstReg); - LLT LeftoverTy0; - - // All of the operands need to have the same number of elements, so if we can - // determine a type breakdown for the result type, we can for all of the - // source types. 
- int NumParts = getNarrowTypeBreakDown(DstTy, NarrowTy0, LeftoverTy0).first; - if (NumParts < 0) - return UnableToLegalize; + unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs(); + unsigned NumDefs = MI.getNumDefs(); - SmallVector<MachineInstrBuilder, 4> NewInsts; - - SmallVector<Register, 4> DstRegs, LeftoverDstRegs; - SmallVector<Register, 4> PartRegs, LeftoverRegs; - - for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I) { - Register SrcReg = MI.getOperand(I).getReg(); - LLT SrcTyI = MRI.getType(SrcReg); - const auto NewEC = NarrowTy0.isVector() ? NarrowTy0.getElementCount() - : ElementCount::getFixed(1); - LLT NarrowTyI = LLT::scalarOrVector(NewEC, SrcTyI.getScalarType()); - LLT LeftoverTyI; - - // Split this operand into the requested typed registers, and any leftover - // required to reproduce the original type. - if (!extractParts(SrcReg, SrcTyI, NarrowTyI, LeftoverTyI, PartRegs, - LeftoverRegs)) - return UnableToLegalize; - - if (I == 1) { - // For the first operand, create an instruction for each part and setup - // the result. - for (Register PartReg : PartRegs) { - Register PartDstReg = MRI.createGenericVirtualRegister(NarrowTy0); - NewInsts.push_back(MIRBuilder.buildInstrNoInsert(MI.getOpcode()) - .addDef(PartDstReg) - .addUse(PartReg)); - DstRegs.push_back(PartDstReg); - } + // Create DstOps (sub-vectors with NumElts elts + Leftover) for each output. + // Build instructions with DstOps to use instruction found by CSE directly. + // CSE copies found instruction into given vreg when building with vreg dest. + SmallVector<SmallVector<DstOp, 8>, 2> OutputOpsPieces(NumDefs); + // Output registers will be taken from created instructions. 
+ SmallVector<SmallVector<Register, 8>, 2> OutputRegs(NumDefs); + for (unsigned i = 0; i < NumDefs; ++i) { + makeDstOps(OutputOpsPieces[i], MRI.getType(MI.getReg(i)), NumElts); + } - for (Register LeftoverReg : LeftoverRegs) { - Register PartDstReg = MRI.createGenericVirtualRegister(LeftoverTy0); - NewInsts.push_back(MIRBuilder.buildInstrNoInsert(MI.getOpcode()) - .addDef(PartDstReg) - .addUse(LeftoverReg)); - LeftoverDstRegs.push_back(PartDstReg); - } + // Split vector input operands into sub-vectors with NumElts elts + Leftover. + // Operands listed in NonVecOpIndices will be used as is without splitting; + // examples: compare predicate in icmp and fcmp (op 1), vector select with i1 + // scalar condition (op 1), immediate in sext_inreg (op 2). + SmallVector<SmallVector<SrcOp, 8>, 3> InputOpsPieces(NumInputs); + for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands(); + ++UseIdx, ++UseNo) { + if (is_contained(NonVecOpIndices, UseIdx)) { + broadcastSrcOp(InputOpsPieces[UseNo], OutputOpsPieces[0].size(), + MI.getOperand(UseIdx)); } else { - assert(NewInsts.size() == PartRegs.size() + LeftoverRegs.size()); - - // Add the newly created operand splits to the existing instructions. The - // odd-sized pieces are ordered after the requested NarrowTyArg sized - // pieces. - unsigned InstCount = 0; - for (unsigned J = 0, JE = PartRegs.size(); J != JE; ++J) - NewInsts[InstCount++].addUse(PartRegs[J]); - for (unsigned J = 0, JE = LeftoverRegs.size(); J != JE; ++J) - NewInsts[InstCount++].addUse(LeftoverRegs[J]); + SmallVector<Register, 8> SplitPieces; + extractVectorParts(MI.getReg(UseIdx), NumElts, SplitPieces); + for (auto Reg : SplitPieces) + InputOpsPieces[UseNo].push_back(Reg); } - - PartRegs.clear(); - LeftoverRegs.clear(); } - // Insert the newly built operations and rebuild the result register. - for (auto &MIB : NewInsts) - MIRBuilder.insertInstr(MIB); + unsigned NumLeftovers = OrigNumElts % NumElts ? 
1 : 0; - insertParts(DstReg, DstTy, NarrowTy0, DstRegs, LeftoverTy0, LeftoverDstRegs); + // Take i-th piece of each input operand split and build sub-vector/scalar + // instruction. Set i-th DstOp(s) from OutputOpsPieces as destination(s). + for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) { + SmallVector<DstOp, 2> Defs; + for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo) + Defs.push_back(OutputOpsPieces[DstNo][i]); - MI.eraseFromParent(); - return Legalized; -} + SmallVector<SrcOp, 3> Uses; + for (unsigned InputNo = 0; InputNo < NumInputs; ++InputNo) + Uses.push_back(InputOpsPieces[InputNo][i]); -LegalizerHelper::LegalizeResult -LegalizerHelper::fewerElementsVectorCasts(MachineInstr &MI, unsigned TypeIdx, - LLT NarrowTy) { - if (TypeIdx != 0) - return UnableToLegalize; - - Register DstReg = MI.getOperand(0).getReg(); - Register SrcReg = MI.getOperand(1).getReg(); - LLT DstTy = MRI.getType(DstReg); - LLT SrcTy = MRI.getType(SrcReg); - - LLT NarrowTy0 = NarrowTy; - LLT NarrowTy1; - unsigned NumParts; - - if (NarrowTy.isVector()) { - // Uneven breakdown not handled. 
- NumParts = DstTy.getNumElements() / NarrowTy.getNumElements(); - if (NumParts * NarrowTy.getNumElements() != DstTy.getNumElements()) - return UnableToLegalize; - - NarrowTy1 = LLT::vector(NarrowTy.getElementCount(), SrcTy.getElementType()); - } else { - NumParts = DstTy.getNumElements(); - NarrowTy1 = SrcTy.getElementType(); + auto I = MIRBuilder.buildInstr(MI.getOpcode(), Defs, Uses, MI.getFlags()); + for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo) + OutputRegs[DstNo].push_back(I.getReg(DstNo)); } - SmallVector<Register, 4> SrcRegs, DstRegs; - extractParts(SrcReg, NarrowTy1, NumParts, SrcRegs); - - for (unsigned I = 0; I < NumParts; ++I) { - Register DstReg = MRI.createGenericVirtualRegister(NarrowTy0); - MachineInstr *NewInst = - MIRBuilder.buildInstr(MI.getOpcode(), {DstReg}, {SrcRegs[I]}); - - NewInst->setFlags(MI.getFlags()); - DstRegs.push_back(DstReg); + // Merge small outputs into MI's output for each def operand. + if (NumLeftovers) { + for (unsigned i = 0; i < NumDefs; ++i) + mergeMixedSubvectors(MI.getReg(i), OutputRegs[i]); + } else { + for (unsigned i = 0; i < NumDefs; ++i) + MIRBuilder.buildMerge(MI.getReg(i), OutputRegs[i]); } - if (NarrowTy.isVector()) - MIRBuilder.buildConcatVectors(DstReg, DstRegs); - else - MIRBuilder.buildBuildVector(DstReg, DstRegs); - MI.eraseFromParent(); return Legalized; } LegalizerHelper::LegalizeResult -LegalizerHelper::fewerElementsVectorCmp(MachineInstr &MI, unsigned TypeIdx, - LLT NarrowTy) { - Register DstReg = MI.getOperand(0).getReg(); - Register Src0Reg = MI.getOperand(2).getReg(); - LLT DstTy = MRI.getType(DstReg); - LLT SrcTy = MRI.getType(Src0Reg); +LegalizerHelper::fewerElementsVectorPhi(GenericMachineInstr &MI, + unsigned NumElts) { + unsigned OrigNumElts = MRI.getType(MI.getReg(0)).getNumElements(); - unsigned NumParts; - LLT NarrowTy0, NarrowTy1; + unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs(); + unsigned NumDefs = MI.getNumDefs(); - if (TypeIdx == 0) { - unsigned NewElts = 
NarrowTy.isVector() ? NarrowTy.getNumElements() : 1; - unsigned OldElts = DstTy.getNumElements(); - - NarrowTy0 = NarrowTy; - NumParts = NarrowTy.isVector() ? (OldElts / NewElts) : DstTy.getNumElements(); - NarrowTy1 = NarrowTy.isVector() ? LLT::vector(NarrowTy.getElementCount(), - SrcTy.getScalarSizeInBits()) - : SrcTy.getElementType(); - - } else { - unsigned NewElts = NarrowTy.isVector() ? NarrowTy.getNumElements() : 1; - unsigned OldElts = SrcTy.getNumElements(); + SmallVector<DstOp, 8> OutputOpsPieces; + SmallVector<Register, 8> OutputRegs; + makeDstOps(OutputOpsPieces, MRI.getType(MI.getReg(0)), NumElts); - NumParts = NarrowTy.isVector() ? (OldElts / NewElts) : - NarrowTy.getNumElements(); - NarrowTy0 = - LLT::vector(NarrowTy.getElementCount(), DstTy.getScalarSizeInBits()); - NarrowTy1 = NarrowTy; + // Instructions that perform register split will be inserted in basic block + // where register is defined (basic block is in the next operand). + SmallVector<SmallVector<Register, 8>, 3> InputOpsPieces(NumInputs / 2); + for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands(); + UseIdx += 2, ++UseNo) { + MachineBasicBlock &OpMBB = *MI.getOperand(UseIdx + 1).getMBB(); + MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator()); + extractVectorParts(MI.getReg(UseIdx), NumElts, InputOpsPieces[UseNo]); } - // FIXME: Don't know how to handle the situation where the small vectors - // aren't all the same size yet. - if (NarrowTy1.isVector() && - NarrowTy1.getNumElements() * NumParts != DstTy.getNumElements()) - return UnableToLegalize; - - CmpInst::Predicate Pred - = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate()); + // Build PHIs with fewer elements. + unsigned NumLeftovers = OrigNumElts % NumElts ? 
1 : 0; + MIRBuilder.setInsertPt(*MI.getParent(), MI); + for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) { + auto Phi = MIRBuilder.buildInstr(TargetOpcode::G_PHI); + Phi.addDef( + MRI.createGenericVirtualRegister(OutputOpsPieces[i].getLLTTy(MRI))); + OutputRegs.push_back(Phi.getReg(0)); - SmallVector<Register, 2> Src1Regs, Src2Regs, DstRegs; - extractParts(MI.getOperand(2).getReg(), NarrowTy1, NumParts, Src1Regs); - extractParts(MI.getOperand(3).getReg(), NarrowTy1, NumParts, Src2Regs); - - for (unsigned I = 0; I < NumParts; ++I) { - Register DstReg = MRI.createGenericVirtualRegister(NarrowTy0); - DstRegs.push_back(DstReg); - - if (MI.getOpcode() == TargetOpcode::G_ICMP) - MIRBuilder.buildICmp(Pred, DstReg, Src1Regs[I], Src2Regs[I]); - else { - MachineInstr *NewCmp - = MIRBuilder.buildFCmp(Pred, DstReg, Src1Regs[I], Src2Regs[I]); - NewCmp->setFlags(MI.getFlags()); + for (unsigned j = 0; j < NumInputs / 2; ++j) { + Phi.addUse(InputOpsPieces[j][i]); + Phi.add(MI.getOperand(1 + j * 2 + 1)); } } - if (NarrowTy1.isVector()) - MIRBuilder.buildConcatVectors(DstReg, DstRegs); - else - MIRBuilder.buildBuildVector(DstReg, DstRegs); - - MI.eraseFromParent(); - return Legalized; -} - -LegalizerHelper::LegalizeResult -LegalizerHelper::fewerElementsVectorSelect(MachineInstr &MI, unsigned TypeIdx, - LLT NarrowTy) { - Register DstReg = MI.getOperand(0).getReg(); - Register CondReg = MI.getOperand(1).getReg(); - - unsigned NumParts = 0; - LLT NarrowTy0, NarrowTy1; - - LLT DstTy = MRI.getType(DstReg); - LLT CondTy = MRI.getType(CondReg); - unsigned Size = DstTy.getSizeInBits(); - - assert(TypeIdx == 0 || CondTy.isVector()); - - if (TypeIdx == 0) { - NarrowTy0 = NarrowTy; - NarrowTy1 = CondTy; - - unsigned NarrowSize = NarrowTy0.getSizeInBits(); - // FIXME: Don't know how to handle the situation where the small vectors - // aren't all the same size yet. 
- if (Size % NarrowSize != 0) - return UnableToLegalize; - - NumParts = Size / NarrowSize; - - // Need to break down the condition type - if (CondTy.isVector()) { - if (CondTy.getNumElements() == NumParts) - NarrowTy1 = CondTy.getElementType(); - else - NarrowTy1 = - LLT::vector(CondTy.getElementCount().divideCoefficientBy(NumParts), - CondTy.getScalarSizeInBits()); - } + // Merge small outputs into MI's def. + if (NumLeftovers) { + mergeMixedSubvectors(MI.getReg(0), OutputRegs); } else { - NumParts = CondTy.getNumElements(); - if (NarrowTy.isVector()) { - // TODO: Handle uneven breakdown. - if (NumParts * NarrowTy.getNumElements() != CondTy.getNumElements()) - return UnableToLegalize; - - return UnableToLegalize; - } else { - NarrowTy0 = DstTy.getElementType(); - NarrowTy1 = NarrowTy; - } - } - - SmallVector<Register, 2> DstRegs, Src0Regs, Src1Regs, Src2Regs; - if (CondTy.isVector()) - extractParts(MI.getOperand(1).getReg(), NarrowTy1, NumParts, Src0Regs); - - extractParts(MI.getOperand(2).getReg(), NarrowTy0, NumParts, Src1Regs); - extractParts(MI.getOperand(3).getReg(), NarrowTy0, NumParts, Src2Regs); - - for (unsigned i = 0; i < NumParts; ++i) { - Register DstReg = MRI.createGenericVirtualRegister(NarrowTy0); - MIRBuilder.buildSelect(DstReg, CondTy.isVector() ? Src0Regs[i] : CondReg, - Src1Regs[i], Src2Regs[i]); - DstRegs.push_back(DstReg); - } - - if (NarrowTy0.isVector()) - MIRBuilder.buildConcatVectors(DstReg, DstRegs); - else - MIRBuilder.buildBuildVector(DstReg, DstRegs); - - MI.eraseFromParent(); - return Legalized; -} - -LegalizerHelper::LegalizeResult -LegalizerHelper::fewerElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx, - LLT NarrowTy) { - const Register DstReg = MI.getOperand(0).getReg(); - LLT PhiTy = MRI.getType(DstReg); - LLT LeftoverTy; - - // All of the operands need to have the same number of elements, so if we can - // determine a type breakdown for the result type, we can for all of the - // source types. 
- int NumParts, NumLeftover; - std::tie(NumParts, NumLeftover) - = getNarrowTypeBreakDown(PhiTy, NarrowTy, LeftoverTy); - if (NumParts < 0) - return UnableToLegalize; - - SmallVector<Register, 4> DstRegs, LeftoverDstRegs; - SmallVector<MachineInstrBuilder, 4> NewInsts; - - const int TotalNumParts = NumParts + NumLeftover; - - // Insert the new phis in the result block first. - for (int I = 0; I != TotalNumParts; ++I) { - LLT Ty = I < NumParts ? NarrowTy : LeftoverTy; - Register PartDstReg = MRI.createGenericVirtualRegister(Ty); - NewInsts.push_back(MIRBuilder.buildInstr(TargetOpcode::G_PHI) - .addDef(PartDstReg)); - if (I < NumParts) - DstRegs.push_back(PartDstReg); - else - LeftoverDstRegs.push_back(PartDstReg); - } - - MachineBasicBlock *MBB = MI.getParent(); - MIRBuilder.setInsertPt(*MBB, MBB->getFirstNonPHI()); - insertParts(DstReg, PhiTy, NarrowTy, DstRegs, LeftoverTy, LeftoverDstRegs); - - SmallVector<Register, 4> PartRegs, LeftoverRegs; - - // Insert code to extract the incoming values in each predecessor block. - for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { - PartRegs.clear(); - LeftoverRegs.clear(); - - Register SrcReg = MI.getOperand(I).getReg(); - MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB(); - MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator()); - - LLT Unused; - if (!extractParts(SrcReg, PhiTy, NarrowTy, Unused, PartRegs, - LeftoverRegs)) - return UnableToLegalize; - - // Add the newly created operand splits to the existing instructions. The - // odd-sized pieces are ordered after the requested NarrowTyArg sized - // pieces. - for (int J = 0; J != TotalNumParts; ++J) { - MachineInstrBuilder MIB = NewInsts[J]; - MIB.addUse(J < NumParts ? 
PartRegs[J] : LeftoverRegs[J - NumParts]); - MIB.addMBB(&OpMBB); - } + MIRBuilder.buildMerge(MI.getReg(0), OutputRegs); } MI.eraseFromParent(); @@ -3927,27 +3842,36 @@ LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorUnmergeValues(MachineInstr &MI, unsigned TypeIdx, LLT NarrowTy) { - if (TypeIdx != 1) - return UnableToLegalize; - const int NumDst = MI.getNumOperands() - 1; const Register SrcReg = MI.getOperand(NumDst).getReg(); - LLT SrcTy = MRI.getType(SrcReg); - LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + LLT SrcTy = MRI.getType(SrcReg); - // TODO: Create sequence of extracts. - if (DstTy == NarrowTy) + if (TypeIdx != 1 || NarrowTy == DstTy) return UnableToLegalize; - LLT GCDTy = getGCDType(SrcTy, NarrowTy); - if (DstTy == GCDTy) { - // This would just be a copy of the same unmerge. - // TODO: Create extracts, pad with undef and create intermediate merges. + // Requires compatible types. Otherwise SrcReg should have been defined by + // merge-like instruction that would get artifact combined. Most likely + // instruction that defines SrcReg has to perform more/fewer elements + // legalization compatible with NarrowTy. + assert(SrcTy.isVector() && NarrowTy.isVector() && "Expected vector types"); + assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type"); + + if ((SrcTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) || + (NarrowTy.getSizeInBits() % DstTy.getSizeInBits() != 0)) return UnableToLegalize; - } - auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg); + // This is most likely DstTy (smaller then register size) packed in SrcTy + // (larger then register size) and since unmerge was not combined it will be + // lowered to bit sequence extracts from register. Unpack SrcTy to NarrowTy + // (register size) pieces first. Then unpack each of NarrowTy pieces to DstTy. 
+ + // %1:_(DstTy), %2, %3, %4 = G_UNMERGE_VALUES %0:_(SrcTy) + // + // %5:_(NarrowTy), %6 = G_UNMERGE_VALUES %0:_(SrcTy) - reg sequence + // %1:_(DstTy), %2 = G_UNMERGE_VALUES %5:_(NarrowTy) - sequence of bits in reg + // %3:_(DstTy), %4 = G_UNMERGE_VALUES %6:_(NarrowTy) + auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, SrcReg); const int NumUnmerge = Unmerge->getNumOperands() - 1; const int PartsPerUnmerge = NumDst / NumUnmerge; @@ -3964,89 +3888,87 @@ LegalizerHelper::fewerElementsVectorUnmergeValues(MachineInstr &MI, } LegalizerHelper::LegalizeResult -LegalizerHelper::fewerElementsVectorMulo(MachineInstr &MI, unsigned TypeIdx, - LLT NarrowTy) { - Register Result = MI.getOperand(0).getReg(); - Register Overflow = MI.getOperand(1).getReg(); - Register LHS = MI.getOperand(2).getReg(); - Register RHS = MI.getOperand(3).getReg(); - - LLT SrcTy = MRI.getType(LHS); - if (!SrcTy.isVector()) +LegalizerHelper::fewerElementsVectorMerge(MachineInstr &MI, unsigned TypeIdx, + LLT NarrowTy) { + Register DstReg = MI.getOperand(0).getReg(); + LLT DstTy = MRI.getType(DstReg); + LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); + // Requires compatible types. Otherwise user of DstReg did not perform unmerge + // that should have been artifact combined. Most likely instruction that uses + // DstReg has to do more/fewer elements legalization compatible with NarrowTy. + assert(DstTy.isVector() && NarrowTy.isVector() && "Expected vector types"); + assert((DstTy.getScalarType() == NarrowTy.getScalarType()) && "bad type"); + if (NarrowTy == SrcTy) return UnableToLegalize; - LLT ElementType = SrcTy.getElementType(); - LLT OverflowElementTy = MRI.getType(Overflow).getElementType(); - const ElementCount NumResult = SrcTy.getElementCount(); - LLT GCDTy = getGCDType(SrcTy, NarrowTy); + // This attempts to lower part of LCMTy merge/unmerge sequence. Intended use + // is for old mir tests. 
Since the changes to more/fewer elements it should no + // longer be possible to generate MIR like this when starting from llvm-ir + // because LCMTy approach was replaced with merge/unmerge to vector elements. + if (TypeIdx == 1) { + assert(SrcTy.isVector() && "Expected vector types"); + assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type"); + if ((DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) || + (NarrowTy.getNumElements() >= SrcTy.getNumElements())) + return UnableToLegalize; + // %2:_(DstTy) = G_CONCAT_VECTORS %0:_(SrcTy), %1:_(SrcTy) + // + // %3:_(EltTy), %4, %5 = G_UNMERGE_VALUES %0:_(SrcTy) + // %6:_(EltTy), %7, %8 = G_UNMERGE_VALUES %1:_(SrcTy) + // %9:_(NarrowTy) = G_BUILD_VECTOR %3:_(EltTy), %4 + // %10:_(NarrowTy) = G_BUILD_VECTOR %5:_(EltTy), %6 + // %11:_(NarrowTy) = G_BUILD_VECTOR %7:_(EltTy), %8 + // %2:_(DstTy) = G_CONCAT_VECTORS %9:_(NarrowTy), %10, %11 - // Unmerge the operands to smaller parts of GCD type. - auto UnmergeLHS = MIRBuilder.buildUnmerge(GCDTy, LHS); - auto UnmergeRHS = MIRBuilder.buildUnmerge(GCDTy, RHS); + SmallVector<Register, 8> Elts; + LLT EltTy = MRI.getType(MI.getOperand(1).getReg()).getScalarType(); + for (unsigned i = 1; i < MI.getNumOperands(); ++i) { + auto Unmerge = MIRBuilder.buildUnmerge(EltTy, MI.getOperand(i).getReg()); + for (unsigned j = 0; j < Unmerge->getNumDefs(); ++j) + Elts.push_back(Unmerge.getReg(j)); + } - const int NumOps = UnmergeLHS->getNumOperands() - 1; - const ElementCount PartsPerUnmerge = NumResult.divideCoefficientBy(NumOps); - LLT OverflowTy = LLT::scalarOrVector(PartsPerUnmerge, OverflowElementTy); - LLT ResultTy = LLT::scalarOrVector(PartsPerUnmerge, ElementType); + SmallVector<Register, 8> NarrowTyElts; + unsigned NumNarrowTyElts = NarrowTy.getNumElements(); + unsigned NumNarrowTyPieces = DstTy.getNumElements() / NumNarrowTyElts; + for (unsigned i = 0, Offset = 0; i < NumNarrowTyPieces; + ++i, Offset += NumNarrowTyElts) { + ArrayRef<Register> Pieces(&Elts[Offset], 
NumNarrowTyElts); + NarrowTyElts.push_back(MIRBuilder.buildMerge(NarrowTy, Pieces).getReg(0)); + } - // Perform the operation over unmerged parts. - SmallVector<Register, 8> ResultParts; - SmallVector<Register, 8> OverflowParts; - for (int I = 0; I != NumOps; ++I) { - Register Operand1 = UnmergeLHS->getOperand(I).getReg(); - Register Operand2 = UnmergeRHS->getOperand(I).getReg(); - auto PartMul = MIRBuilder.buildInstr(MI.getOpcode(), {ResultTy, OverflowTy}, - {Operand1, Operand2}); - ResultParts.push_back(PartMul->getOperand(0).getReg()); - OverflowParts.push_back(PartMul->getOperand(1).getReg()); + MIRBuilder.buildMerge(DstReg, NarrowTyElts); + MI.eraseFromParent(); + return Legalized; } - LLT ResultLCMTy = buildLCMMergePieces(SrcTy, NarrowTy, GCDTy, ResultParts); - LLT OverflowLCMTy = - LLT::scalarOrVector(ResultLCMTy.getElementCount(), OverflowElementTy); - - // Recombine the pieces to the original result and overflow registers. - buildWidenedRemergeToDst(Result, ResultLCMTy, ResultParts); - buildWidenedRemergeToDst(Overflow, OverflowLCMTy, OverflowParts); - MI.eraseFromParent(); - return Legalized; -} - -// Handle FewerElementsVector a G_BUILD_VECTOR or G_CONCAT_VECTORS that produces -// a vector -// -// Create a G_BUILD_VECTOR or G_CONCAT_VECTORS of NarrowTy pieces, padding with -// undef as necessary. 
-// -// %3:_(<3 x s16>) = G_BUILD_VECTOR %0, %1, %2 -// -> <2 x s16> -// -// %4:_(s16) = G_IMPLICIT_DEF -// %5:_(<2 x s16>) = G_BUILD_VECTOR %0, %1 -// %6:_(<2 x s16>) = G_BUILD_VECTOR %2, %4 -// %7:_(<2 x s16>) = G_IMPLICIT_DEF -// %8:_(<6 x s16>) = G_CONCAT_VECTORS %5, %6, %7 -// %3:_(<3 x s16>), %8:_(<3 x s16>) = G_UNMERGE_VALUES %8 -LegalizerHelper::LegalizeResult -LegalizerHelper::fewerElementsVectorMerge(MachineInstr &MI, unsigned TypeIdx, - LLT NarrowTy) { - Register DstReg = MI.getOperand(0).getReg(); - LLT DstTy = MRI.getType(DstReg); - LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); - LLT GCDTy = getGCDType(getGCDType(SrcTy, NarrowTy), DstTy); - - // Break into a common type - SmallVector<Register, 16> Parts; - for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) - extractGCDType(Parts, GCDTy, MO.getReg()); + assert(TypeIdx == 0 && "Bad type index"); + if ((NarrowTy.getSizeInBits() % SrcTy.getSizeInBits() != 0) || + (DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0)) + return UnableToLegalize; - // Build the requested new merge, padding with undef. - LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts, - TargetOpcode::G_ANYEXT); + // This is most likely SrcTy (smaller then register size) packed in DstTy + // (larger then register size) and since merge was not combined it will be + // lowered to bit sequence packing into register. Merge SrcTy to NarrowTy + // (register size) pieces first. Then merge each of NarrowTy pieces to DstTy. - // Pack into the original result register. 
- buildWidenedRemergeToDst(DstReg, LCMTy, Parts); + // %0:_(DstTy) = G_MERGE_VALUES %1:_(SrcTy), %2, %3, %4 + // + // %5:_(NarrowTy) = G_MERGE_VALUES %1:_(SrcTy), %2 - sequence of bits in reg + // %6:_(NarrowTy) = G_MERGE_VALUES %3:_(SrcTy), %4 + // %0:_(DstTy) = G_MERGE_VALUES %5:_(NarrowTy), %6 - reg sequence + SmallVector<Register, 8> NarrowTyElts; + unsigned NumParts = DstTy.getNumElements() / NarrowTy.getNumElements(); + unsigned NumSrcElts = SrcTy.isVector() ? SrcTy.getNumElements() : 1; + unsigned NumElts = NarrowTy.getNumElements() / NumSrcElts; + for (unsigned i = 0; i < NumParts; ++i) { + SmallVector<Register, 8> Sources; + for (unsigned j = 0; j < NumElts; ++j) + Sources.push_back(MI.getOperand(1 + i * NumElts + j).getReg()); + NarrowTyElts.push_back(MIRBuilder.buildMerge(NarrowTy, Sources).getReg(0)); + } + MIRBuilder.buildMerge(DstReg, NarrowTyElts); MI.eraseFromParent(); return Legalized; } @@ -4218,163 +4140,14 @@ LegalizerHelper::reduceLoadStoreWidth(GLoadStore &LdStMI, unsigned TypeIdx, } LegalizerHelper::LegalizeResult -LegalizerHelper::reduceOperationWidth(MachineInstr &MI, unsigned int TypeIdx, - LLT NarrowTy) { - assert(TypeIdx == 0 && "only one type index expected"); - - const unsigned Opc = MI.getOpcode(); - const int NumDefOps = MI.getNumExplicitDefs(); - const int NumSrcOps = MI.getNumOperands() - NumDefOps; - const unsigned Flags = MI.getFlags(); - const unsigned NarrowSize = NarrowTy.getSizeInBits(); - const LLT NarrowScalarTy = LLT::scalar(NarrowSize); - - assert(MI.getNumOperands() <= 4 && "expected instruction with either 1 " - "result and 1-3 sources or 2 results and " - "1-2 sources"); - - SmallVector<Register, 2> DstRegs; - for (int I = 0; I < NumDefOps; ++I) - DstRegs.push_back(MI.getOperand(I).getReg()); - - // First of all check whether we are narrowing (changing the element type) - // or reducing the vector elements - const LLT DstTy = MRI.getType(DstRegs[0]); - const bool IsNarrow = NarrowTy.getScalarType() != 
DstTy.getScalarType(); - - SmallVector<Register, 8> ExtractedRegs[3]; - SmallVector<Register, 8> Parts; - - // Break down all the sources into NarrowTy pieces we can operate on. This may - // involve creating merges to a wider type, padded with undef. - for (int I = 0; I != NumSrcOps; ++I) { - Register SrcReg = MI.getOperand(I + NumDefOps).getReg(); - LLT SrcTy = MRI.getType(SrcReg); - - // The type to narrow SrcReg to. For narrowing, this is a smaller scalar. - // For fewerElements, this is a smaller vector with the same element type. - LLT OpNarrowTy; - if (IsNarrow) { - OpNarrowTy = NarrowScalarTy; - - // In case of narrowing, we need to cast vectors to scalars for this to - // work properly - // FIXME: Can we do without the bitcast here if we're narrowing? - if (SrcTy.isVector()) { - SrcTy = LLT::scalar(SrcTy.getSizeInBits()); - SrcReg = MIRBuilder.buildBitcast(SrcTy, SrcReg).getReg(0); - } - } else { - auto NarrowEC = NarrowTy.isVector() ? NarrowTy.getElementCount() - : ElementCount::getFixed(1); - OpNarrowTy = LLT::scalarOrVector(NarrowEC, SrcTy.getScalarType()); - } - - LLT GCDTy = extractGCDType(ExtractedRegs[I], SrcTy, OpNarrowTy, SrcReg); - - // Build a sequence of NarrowTy pieces in ExtractedRegs for this operand. - buildLCMMergePieces(SrcTy, OpNarrowTy, GCDTy, ExtractedRegs[I], - TargetOpcode::G_ANYEXT); - } - - SmallVector<Register, 8> ResultRegs[2]; - - // Input operands for each sub-instruction. - SmallVector<SrcOp, 4> InputRegs(NumSrcOps, Register()); - - int NumParts = ExtractedRegs[0].size(); - const unsigned DstSize = DstTy.getSizeInBits(); - const LLT DstScalarTy = LLT::scalar(DstSize); - - // Narrowing needs to use scalar types - LLT DstLCMTy, NarrowDstTy; - if (IsNarrow) { - DstLCMTy = getLCMType(DstScalarTy, NarrowScalarTy); - NarrowDstTy = NarrowScalarTy; - } else { - DstLCMTy = getLCMType(DstTy, NarrowTy); - NarrowDstTy = NarrowTy; - } - - // We widened the source registers to satisfy merge/unmerge size - // constraints. 
We'll have some extra fully undef parts. - const int NumRealParts = (DstSize + NarrowSize - 1) / NarrowSize; - - for (int I = 0; I != NumRealParts; ++I) { - // Emit this instruction on each of the split pieces. - for (int J = 0; J != NumSrcOps; ++J) - InputRegs[J] = ExtractedRegs[J][I]; - - MachineInstrBuilder Inst; - if (NumDefOps == 1) - Inst = MIRBuilder.buildInstr(Opc, {NarrowDstTy}, InputRegs, Flags); - else - Inst = MIRBuilder.buildInstr(Opc, {NarrowDstTy, NarrowDstTy}, InputRegs, - Flags); - - for (int J = 0; J != NumDefOps; ++J) - ResultRegs[J].push_back(Inst.getReg(J)); - } - - // Fill out the widened result with undef instead of creating instructions - // with undef inputs. - int NumUndefParts = NumParts - NumRealParts; - if (NumUndefParts != 0) { - Register Undef = MIRBuilder.buildUndef(NarrowDstTy).getReg(0); - for (int I = 0; I != NumDefOps; ++I) - ResultRegs[I].append(NumUndefParts, Undef); - } - - // Extract the possibly padded result. Use a scratch register if we need to do - // a final bitcast, otherwise use the original result register. 
- Register MergeDstReg; - for (int I = 0; I != NumDefOps; ++I) { - if (IsNarrow && DstTy.isVector()) - MergeDstReg = MRI.createGenericVirtualRegister(DstScalarTy); - else - MergeDstReg = DstRegs[I]; - - buildWidenedRemergeToDst(MergeDstReg, DstLCMTy, ResultRegs[I]); - - // Recast to vector if we narrowed a vector - if (IsNarrow && DstTy.isVector()) - MIRBuilder.buildBitcast(DstRegs[I], MergeDstReg); - } - - MI.eraseFromParent(); - return Legalized; -} - -LegalizerHelper::LegalizeResult -LegalizerHelper::fewerElementsVectorSextInReg(MachineInstr &MI, unsigned TypeIdx, - LLT NarrowTy) { - Register DstReg = MI.getOperand(0).getReg(); - Register SrcReg = MI.getOperand(1).getReg(); - int64_t Imm = MI.getOperand(2).getImm(); - - LLT DstTy = MRI.getType(DstReg); - - SmallVector<Register, 8> Parts; - LLT GCDTy = extractGCDType(Parts, DstTy, NarrowTy, SrcReg); - LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts); - - for (Register &R : Parts) - R = MIRBuilder.buildSExtInReg(NarrowTy, R, Imm).getReg(0); - - buildWidenedRemergeToDst(DstReg, LCMTy, Parts); - - MI.eraseFromParent(); - return Legalized; -} - -LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, LLT NarrowTy) { using namespace TargetOpcode; + GenericMachineInstr &GMI = cast<GenericMachineInstr>(MI); + unsigned NumElts = NarrowTy.isVector() ? 
NarrowTy.getNumElements() : 1; switch (MI.getOpcode()) { case G_IMPLICIT_DEF: - return fewerElementsVectorImplicitDef(MI, TypeIdx, NarrowTy); case G_TRUNC: case G_AND: case G_OR: @@ -4439,10 +4212,8 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, case G_SSUBSAT: case G_UADDSAT: case G_USUBSAT: - return reduceOperationWidth(MI, TypeIdx, NarrowTy); case G_UMULO: case G_SMULO: - return fewerElementsVectorMulo(MI, TypeIdx, NarrowTy); case G_SHL: case G_LSHR: case G_ASHR: @@ -4454,7 +4225,6 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, case G_CTTZ_ZERO_UNDEF: case G_CTPOP: case G_FCOPYSIGN: - return fewerElementsVectorMultiEltType(MI, TypeIdx, NarrowTy); case G_ZEXT: case G_SEXT: case G_ANYEXT: @@ -4467,14 +4237,16 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, case G_INTTOPTR: case G_PTRTOINT: case G_ADDRSPACE_CAST: - return fewerElementsVectorCasts(MI, TypeIdx, NarrowTy); + return fewerElementsVectorMultiEltType(GMI, NumElts); case G_ICMP: case G_FCMP: - return fewerElementsVectorCmp(MI, TypeIdx, NarrowTy); + return fewerElementsVectorMultiEltType(GMI, NumElts, {1 /*cpm predicate*/}); case G_SELECT: - return fewerElementsVectorSelect(MI, TypeIdx, NarrowTy); + if (MRI.getType(MI.getOperand(1).getReg()).isVector()) + return fewerElementsVectorMultiEltType(GMI, NumElts); + return fewerElementsVectorMultiEltType(GMI, NumElts, {1 /*scalar cond*/}); case G_PHI: - return fewerElementsVectorPhi(MI, TypeIdx, NarrowTy); + return fewerElementsVectorPhi(GMI, NumElts); case G_UNMERGE_VALUES: return fewerElementsVectorUnmergeValues(MI, TypeIdx, NarrowTy); case G_BUILD_VECTOR: @@ -4491,7 +4263,7 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, case G_STORE: return reduceLoadStoreWidth(cast<GLoadStore>(MI), TypeIdx, NarrowTy); case G_SEXT_INREG: - return fewerElementsVectorSextInReg(MI, TypeIdx, NarrowTy); + return fewerElementsVectorMultiEltType(GMI, NumElts, {2 
/*imm*/}); GISEL_VECREDUCE_CASES_NONSEQ return fewerElementsVectorReductions(MI, TypeIdx, NarrowTy); case G_SHUFFLE_VECTOR: @@ -5053,6 +4825,15 @@ LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx, case TargetOpcode::G_AND: case TargetOpcode::G_OR: case TargetOpcode::G_XOR: + case TargetOpcode::G_ADD: + case TargetOpcode::G_SUB: + case TargetOpcode::G_MUL: + case TargetOpcode::G_FADD: + case TargetOpcode::G_FMUL: + case TargetOpcode::G_UADDSAT: + case TargetOpcode::G_USUBSAT: + case TargetOpcode::G_SADDSAT: + case TargetOpcode::G_SSUBSAT: case TargetOpcode::G_SMIN: case TargetOpcode::G_SMAX: case TargetOpcode::G_UMIN: @@ -5070,6 +4851,17 @@ LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx, Observer.changedInstr(MI); return Legalized; } + case TargetOpcode::G_FMA: + case TargetOpcode::G_FSHR: + case TargetOpcode::G_FSHL: { + Observer.changingInstr(MI); + moreElementsVectorSrc(MI, MoreTy, 1); + moreElementsVectorSrc(MI, MoreTy, 2); + moreElementsVectorSrc(MI, MoreTy, 3); + moreElementsVectorDst(MI, MoreTy, 0); + Observer.changedInstr(MI); + return Legalized; + } case TargetOpcode::G_EXTRACT: if (TypeIdx != 1) return UnableToLegalize; @@ -5079,6 +4871,11 @@ LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx, return Legalized; case TargetOpcode::G_INSERT: case TargetOpcode::G_FREEZE: + case TargetOpcode::G_FNEG: + case TargetOpcode::G_FABS: + case TargetOpcode::G_BSWAP: + case TargetOpcode::G_FCANONICALIZE: + case TargetOpcode::G_SEXT_INREG: if (TypeIdx != 0) return UnableToLegalize; Observer.changingInstr(MI); @@ -5098,30 +4895,34 @@ LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx, moreElementsVectorDst(MI, MoreTy, 0); Observer.changedInstr(MI); return Legalized; - case TargetOpcode::G_UNMERGE_VALUES: { - if (TypeIdx != 1) - return UnableToLegalize; - - LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); - int NumDst = MI.getNumOperands() - 1; - moreElementsVectorSrc(MI, MoreTy, 
NumDst); - - auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES); - for (int I = 0; I != NumDst; ++I) - MIB.addDef(MI.getOperand(I).getReg()); + case TargetOpcode::G_UNMERGE_VALUES: + return UnableToLegalize; + case TargetOpcode::G_PHI: + return moreElementsVectorPhi(MI, TypeIdx, MoreTy); + case TargetOpcode::G_SHUFFLE_VECTOR: + return moreElementsVectorShuffle(MI, TypeIdx, MoreTy); + case TargetOpcode::G_BUILD_VECTOR: { + SmallVector<SrcOp, 8> Elts; + for (auto Op : MI.uses()) { + Elts.push_back(Op.getReg()); + } - int NewNumDst = MoreTy.getSizeInBits() / DstTy.getSizeInBits(); - for (int I = NumDst; I != NewNumDst; ++I) - MIB.addDef(MRI.createGenericVirtualRegister(DstTy)); + for (unsigned i = Elts.size(); i < MoreTy.getNumElements(); ++i) { + Elts.push_back(MIRBuilder.buildUndef(MoreTy.getScalarType())); + } - MIB.addUse(MI.getOperand(NumDst).getReg()); + MIRBuilder.buildDeleteTrailingVectorElements( + MI.getOperand(0).getReg(), MIRBuilder.buildInstr(Opc, {MoreTy}, Elts)); MI.eraseFromParent(); return Legalized; } - case TargetOpcode::G_PHI: - return moreElementsVectorPhi(MI, TypeIdx, MoreTy); - case TargetOpcode::G_SHUFFLE_VECTOR: - return moreElementsVectorShuffle(MI, TypeIdx, MoreTy); + case TargetOpcode::G_TRUNC: { + Observer.changingInstr(MI); + moreElementsVectorSrc(MI, MoreTy, 1); + moreElementsVectorDst(MI, MoreTy, 0); + Observer.changedInstr(MI); + return Legalized; + } default: return UnableToLegalize; } @@ -6778,6 +6579,24 @@ LegalizerHelper::lowerExtractInsertVectorElt(MachineInstr &MI) { LLT VecTy = MRI.getType(SrcVec); LLT EltTy = VecTy.getElementType(); + unsigned NumElts = VecTy.getNumElements(); + + int64_t IdxVal; + if (mi_match(Idx, MRI, m_ICst(IdxVal)) && IdxVal <= NumElts) { + SmallVector<Register, 8> SrcRegs; + extractParts(SrcVec, EltTy, NumElts, SrcRegs); + + if (InsertVal) { + SrcRegs[IdxVal] = MI.getOperand(2).getReg(); + MIRBuilder.buildMerge(DstReg, SrcRegs); + } else { + MIRBuilder.buildCopy(DstReg, SrcRegs[IdxVal]); + } 
+ + MI.eraseFromParent(); + return Legalized; + } + if (!EltTy.isByteSized()) { // Not implemented. LLVM_DEBUG(dbgs() << "Can't handle non-byte element vectors yet\n"); return UnableToLegalize; @@ -6796,7 +6615,6 @@ LegalizerHelper::lowerExtractInsertVectorElt(MachineInstr &MI) { // if the index is out of bounds. Register EltPtr = getVectorElementPointer(StackTemp.getReg(0), VecTy, Idx); - int64_t IdxVal; if (mi_match(Idx, MRI, m_ICst(IdxVal))) { int64_t Offset = IdxVal * EltBytes; PtrInfo = PtrInfo.getWithOffset(Offset); @@ -6923,6 +6741,32 @@ LegalizerHelper::lowerExtract(MachineInstr &MI) { LLT DstTy = MRI.getType(Dst); LLT SrcTy = MRI.getType(Src); + // Extract sub-vector or one element + if (SrcTy.isVector()) { + unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits(); + unsigned DstSize = DstTy.getSizeInBits(); + + if ((Offset % SrcEltSize == 0) && (DstSize % SrcEltSize == 0) && + (Offset + DstSize <= SrcTy.getSizeInBits())) { + // Unmerge and allow access to each Src element for the artifact combiner. + auto Unmerge = MIRBuilder.buildUnmerge(SrcTy.getElementType(), Src); + + // Take element(s) we need to extract and copy it (merge them). 
+ SmallVector<Register, 8> SubVectorElts; + for (unsigned Idx = Offset / SrcEltSize; + Idx < (Offset + DstSize) / SrcEltSize; ++Idx) { + SubVectorElts.push_back(Unmerge.getReg(Idx)); + } + if (SubVectorElts.size() == 1) + MIRBuilder.buildCopy(Dst, SubVectorElts[0]); + else + MIRBuilder.buildMerge(Dst, SubVectorElts); + + MI.eraseFromParent(); + return Legalized; + } + } + if (DstTy.isScalar() && (SrcTy.isScalar() || (SrcTy.isVector() && DstTy == SrcTy.getElementType()))) { @@ -6956,6 +6800,45 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerInsert(MachineInstr &MI) { LLT DstTy = MRI.getType(Src); LLT InsertTy = MRI.getType(InsertSrc); + // Insert sub-vector or one element + if (DstTy.isVector() && !InsertTy.isPointer()) { + LLT EltTy = DstTy.getElementType(); + unsigned EltSize = EltTy.getSizeInBits(); + unsigned InsertSize = InsertTy.getSizeInBits(); + + if ((Offset % EltSize == 0) && (InsertSize % EltSize == 0) && + (Offset + InsertSize <= DstTy.getSizeInBits())) { + auto UnmergeSrc = MIRBuilder.buildUnmerge(EltTy, Src); + SmallVector<Register, 8> DstElts; + unsigned Idx = 0; + // Elements from Src before insert start Offset + for (; Idx < Offset / EltSize; ++Idx) { + DstElts.push_back(UnmergeSrc.getReg(Idx)); + } + + // Replace elements in Src with elements from InsertSrc + if (InsertTy.getSizeInBits() > EltSize) { + auto UnmergeInsertSrc = MIRBuilder.buildUnmerge(EltTy, InsertSrc); + for (unsigned i = 0; Idx < (Offset + InsertSize) / EltSize; + ++Idx, ++i) { + DstElts.push_back(UnmergeInsertSrc.getReg(i)); + } + } else { + DstElts.push_back(InsertSrc); + ++Idx; + } + + // Remaining elements from Src after insert + for (; Idx < DstTy.getNumElements(); ++Idx) { + DstElts.push_back(UnmergeSrc.getReg(Idx)); + } + + MIRBuilder.buildMerge(Dst, DstElts); + MI.eraseFromParent(); + return Legalized; + } + } + if (InsertTy.isVector() || (DstTy.isVector() && DstTy.getElementType() != InsertTy)) return UnableToLegalize; diff --git 
a/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp b/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp index 03dda806cb1e..de8dbd456901 100644 --- a/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp @@ -554,12 +554,11 @@ bool LoadStoreOpt::mergeBlockStores(MachineBasicBlock &MBB) { bool Changed = false; // Walk through the block bottom-up, looking for merging candidates. StoreMergeCandidate Candidate; - for (auto II = MBB.rbegin(), IE = MBB.rend(); II != IE; ++II) { - MachineInstr &MI = *II; + for (MachineInstr &MI : llvm::reverse(MBB)) { if (InstsToErase.contains(&MI)) continue; - if (auto StoreMI = dyn_cast<GStore>(&*II)) { + if (auto *StoreMI = dyn_cast<GStore>(&MI)) { // We have a G_STORE. Add it to the candidate if it writes to an adjacent // address. if (!addStoreToCandidate(*StoreMI, Candidate)) { diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp index fb5ed35c1f72..391251886fbb 100644 --- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp +++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp @@ -215,6 +215,48 @@ MachineInstrBuilder MachineIRBuilder::buildMaskLowPtrBits(const DstOp &Res, return buildPtrMask(Res, Op0, MaskReg); } +MachineInstrBuilder +MachineIRBuilder::buildPadVectorWithUndefElements(const DstOp &Res, + const SrcOp &Op0) { + LLT ResTy = Res.getLLTTy(*getMRI()); + LLT Op0Ty = Op0.getLLTTy(*getMRI()); + + assert((ResTy.isVector() && Op0Ty.isVector()) && "Non vector type"); + assert((ResTy.getElementType() == Op0Ty.getElementType()) && + "Different vector element types"); + assert((ResTy.getNumElements() > Op0Ty.getNumElements()) && + "Op0 has more elements"); + + auto Unmerge = buildUnmerge(Op0Ty.getElementType(), Op0); + SmallVector<Register, 8> Regs; + for (auto Op : Unmerge.getInstr()->defs()) + Regs.push_back(Op.getReg()); + Register Undef = buildUndef(Op0Ty.getElementType()).getReg(0); + unsigned NumberOfPadElts = ResTy.getNumElements() - 
Regs.size(); + for (unsigned i = 0; i < NumberOfPadElts; ++i) + Regs.push_back(Undef); + return buildMerge(Res, Regs); +} + +MachineInstrBuilder +MachineIRBuilder::buildDeleteTrailingVectorElements(const DstOp &Res, + const SrcOp &Op0) { + LLT ResTy = Res.getLLTTy(*getMRI()); + LLT Op0Ty = Op0.getLLTTy(*getMRI()); + + assert((ResTy.isVector() && Op0Ty.isVector()) && "Non vector type"); + assert((ResTy.getElementType() == Op0Ty.getElementType()) && + "Different vector element types"); + assert((ResTy.getNumElements() < Op0Ty.getNumElements()) && + "Op0 has fewer elements"); + + SmallVector<Register, 8> Regs; + auto Unmerge = buildUnmerge(Op0Ty.getElementType(), Op0); + for (unsigned i = 0; i < ResTy.getNumElements(); ++i) + Regs.push_back(Unmerge.getReg(i)); + return buildMerge(Res, Regs); +} + MachineInstrBuilder MachineIRBuilder::buildBr(MachineBasicBlock &Dest) { return buildInstr(TargetOpcode::G_BR).addMBB(&Dest); } @@ -613,10 +655,8 @@ MachineInstrBuilder MachineIRBuilder::buildUnmerge(ArrayRef<LLT> Res, MachineInstrBuilder MachineIRBuilder::buildUnmerge(LLT Res, const SrcOp &Op) { unsigned NumReg = Op.getLLTTy(*getMRI()).getSizeInBits() / Res.getSizeInBits(); - SmallVector<Register, 8> TmpVec; - for (unsigned I = 0; I != NumReg; ++I) - TmpVec.push_back(getMRI()->createGenericVirtualRegister(Res)); - return buildUnmerge(TmpVec, Op); + SmallVector<DstOp, 8> TmpVec(NumReg, Res); + return buildInstr(TargetOpcode::G_UNMERGE_VALUES, TmpVec, Op); } MachineInstrBuilder MachineIRBuilder::buildUnmerge(ArrayRef<Register> Res, diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index b0b84763e922..4981a537dc7c 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -923,6 +923,21 @@ LLT llvm::getLCMType(LLT OrigTy, LLT TargetTy) { return LLT::scalar(LCMSize); } +LLT llvm::getCoverTy(LLT OrigTy, LLT TargetTy) { + if (!OrigTy.isVector() || !TargetTy.isVector() || OrigTy == TargetTy || + 
(OrigTy.getScalarSizeInBits() != TargetTy.getScalarSizeInBits())) + return getLCMType(OrigTy, TargetTy); + + unsigned OrigTyNumElts = OrigTy.getNumElements(); + unsigned TargetTyNumElts = TargetTy.getNumElements(); + if (OrigTyNumElts % TargetTyNumElts == 0) + return OrigTy; + + unsigned NumElts = alignTo(OrigTyNumElts, TargetTyNumElts); + return LLT::scalarOrVector(ElementCount::getFixed(NumElts), + OrigTy.getElementType()); +} + LLT llvm::getGCDType(LLT OrigTy, LLT TargetTy) { const unsigned OrigSize = OrigTy.getSizeInBits(); const unsigned TargetSize = TargetTy.getSizeInBits(); @@ -1184,25 +1199,6 @@ bool llvm::shouldOptForSize(const MachineBasicBlock &MBB, llvm::shouldOptimizeForSize(MBB.getBasicBlock(), PSI, BFI); } -/// These artifacts generally don't have any debug users because they don't -/// directly originate from IR instructions, but instead usually from -/// legalization. Avoiding checking for debug users improves compile time. -/// Note that truncates or extends aren't included because they have IR -/// counterparts which can have debug users after translation. 
-static bool shouldSkipDbgValueFor(MachineInstr &MI) { - switch (MI.getOpcode()) { - case TargetOpcode::G_UNMERGE_VALUES: - case TargetOpcode::G_MERGE_VALUES: - case TargetOpcode::G_CONCAT_VECTORS: - case TargetOpcode::G_BUILD_VECTOR: - case TargetOpcode::G_EXTRACT: - case TargetOpcode::G_INSERT: - return true; - default: - return false; - } -} - void llvm::saveUsesAndErase(MachineInstr &MI, MachineRegisterInfo &MRI, LostDebugLocObserver *LocObserver, SmallInstListTy &DeadInstChain) { @@ -1212,10 +1208,7 @@ void llvm::saveUsesAndErase(MachineInstr &MI, MachineRegisterInfo &MRI, } LLVM_DEBUG(dbgs() << MI << "Is dead; erasing.\n"); DeadInstChain.remove(&MI); - if (shouldSkipDbgValueFor(MI)) - MI.eraseFromParent(); - else - MI.eraseFromParentAndMarkDBGValuesForRemoval(); + MI.eraseFromParent(); if (LocObserver) LocObserver->checkpoint(false); } diff --git a/llvm/lib/CodeGen/ImplicitNullChecks.cpp b/llvm/lib/CodeGen/ImplicitNullChecks.cpp index 0882ce366c9c..fc97938ccd3e 100644 --- a/llvm/lib/CodeGen/ImplicitNullChecks.cpp +++ b/llvm/lib/CodeGen/ImplicitNullChecks.cpp @@ -242,7 +242,7 @@ bool ImplicitNullChecks::canHandle(const MachineInstr *MI) { auto IsRegMask = [](const MachineOperand &MO) { return MO.isRegMask(); }; (void)IsRegMask; - assert(!llvm::any_of(MI->operands(), IsRegMask) && + assert(llvm::none_of(MI->operands(), IsRegMask) && "Calls were filtered out above!"); auto IsUnordered = [](MachineMemOperand *MMO) { return MMO->isUnordered(); }; diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp index fc5ac45752ca..c975013db8c8 100644 --- a/llvm/lib/CodeGen/InlineSpiller.cpp +++ b/llvm/lib/CodeGen/InlineSpiller.cpp @@ -686,9 +686,7 @@ void InlineSpiller::reMaterializeAll() { // Remove any values that were completely rematted. 
for (Register Reg : RegsToSpill) { LiveInterval &LI = LIS.getInterval(Reg); - for (LiveInterval::vni_iterator I = LI.vni_begin(), E = LI.vni_end(); - I != E; ++I) { - VNInfo *VNI = *I; + for (VNInfo *VNI : llvm::make_range(LI.vni_begin(), LI.vni_end())) { if (VNI->isUnused() || VNI->isPHIDef() || UsedValues.count(VNI)) continue; MachineInstr *MI = LIS.getInstructionFromIndex(VNI->def); diff --git a/llvm/lib/CodeGen/InterferenceCache.cpp b/llvm/lib/CodeGen/InterferenceCache.cpp index a56485cdbc67..3cab9e5734ee 100644 --- a/llvm/lib/CodeGen/InterferenceCache.cpp +++ b/llvm/lib/CodeGen/InterferenceCache.cpp @@ -56,8 +56,8 @@ void InterferenceCache::init(MachineFunction *mf, LIUArray = liuarray; TRI = tri; reinitPhysRegEntries(); - for (unsigned i = 0; i != CacheEntries; ++i) - Entries[i].clear(mf, indexes, lis); + for (Entry &E : Entries) + E.clear(mf, indexes, lis); } InterferenceCache::Entry *InterferenceCache::get(MCRegister PhysReg) { diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp index cf62b0e5d7e8..e97dcca201e8 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp @@ -1249,8 +1249,8 @@ bool InstrRefBasedLDV::transferDebugPHI(MachineInstr &MI) { std::array<unsigned, 4> CandidateSizes = {64, 32, 16, 8}; Optional<ValueIDNum> Result = None; Optional<LocIdx> SpillLoc = None; - for (unsigned int I = 0; I < CandidateSizes.size(); ++I) { - unsigned SpillID = MTracker->getLocID(SpillNo, {CandidateSizes[I], 0}); + for (unsigned CS : CandidateSizes) { + unsigned SpillID = MTracker->getLocID(SpillNo, {CS, 0}); SpillLoc = MTracker->getSpillMLoc(SpillID); ValueIDNum Val = MTracker->readMLoc(*SpillLoc); // If this value was defined in it's own position, then it was probably diff --git a/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp index a632d3d9ce76..b4dd41bbb810 
100644 --- a/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp @@ -492,10 +492,10 @@ private: static VarLoc CreateCopyLoc(const VarLoc &OldVL, const MachineLoc &OldML, Register NewReg) { VarLoc VL = OldVL; - for (size_t I = 0, E = VL.Locs.size(); I < E; ++I) - if (VL.Locs[I] == OldML) { - VL.Locs[I].Kind = MachineLocKind::RegisterKind; - VL.Locs[I].Value.RegNo = NewReg; + for (MachineLoc &ML : VL.Locs) + if (ML == OldML) { + ML.Kind = MachineLocKind::RegisterKind; + ML.Value.RegNo = NewReg; return VL; } llvm_unreachable("Should have found OldML in new VarLoc."); @@ -506,10 +506,10 @@ private: static VarLoc CreateSpillLoc(const VarLoc &OldVL, const MachineLoc &OldML, unsigned SpillBase, StackOffset SpillOffset) { VarLoc VL = OldVL; - for (int I = 0, E = VL.Locs.size(); I < E; ++I) - if (VL.Locs[I] == OldML) { - VL.Locs[I].Kind = MachineLocKind::SpillLocKind; - VL.Locs[I].Value.SpillLocation = {SpillBase, SpillOffset}; + for (MachineLoc &ML : VL.Locs) + if (ML == OldML) { + ML.Kind = MachineLocKind::SpillLocKind; + ML.Value.SpillLocation = {SpillBase, SpillOffset}; return VL; } llvm_unreachable("Should have found OldML in new VarLoc."); diff --git a/llvm/lib/CodeGen/LiveDebugVariables.cpp b/llvm/lib/CodeGen/LiveDebugVariables.cpp index 5f976bf43c5b..e6661e5135c3 100644 --- a/llvm/lib/CodeGen/LiveDebugVariables.cpp +++ b/llvm/lib/CodeGen/LiveDebugVariables.cpp @@ -822,9 +822,6 @@ bool LDVImpl::handleDebugValue(MachineInstr &MI, SlotIndex Idx) { // register that hasn't been defined yet. If we do not remove those here, then // the re-insertion of the DBG_VALUE instruction after register allocation // will be incorrect. - // TODO: If earlier passes are corrected to generate sane debug information - // (and if the machine verifier is improved to catch this), then these checks - // could be removed or replaced by asserts. 
bool Discard = false; for (const MachineOperand &Op : MI.debug_operands()) { if (Op.isReg() && Register::isVirtualRegister(Op.getReg())) { @@ -1341,8 +1338,8 @@ UserValue::splitLocation(unsigned OldLocNo, ArrayRef<Register> NewRegs, bool DidChange = false; LocMap::iterator LocMapI; LocMapI.setMap(locInts); - for (unsigned i = 0; i != NewRegs.size(); ++i) { - LiveInterval *LI = &LIS.getInterval(NewRegs[i]); + for (Register NewReg : NewRegs) { + LiveInterval *LI = &LIS.getInterval(NewReg); if (LI->empty()) continue; @@ -1500,8 +1497,8 @@ void LDVImpl::splitRegister(Register OldReg, ArrayRef<Register> NewRegs) { // Map all of the new virtual registers. UserValue *UV = lookupVirtReg(OldReg); - for (unsigned i = 0; i != NewRegs.size(); ++i) - mapVirtReg(NewRegs[i], UV); + for (Register NewReg : NewRegs) + mapVirtReg(NewReg, UV); } void LiveDebugVariables:: diff --git a/llvm/lib/CodeGen/LiveDebugVariables.h b/llvm/lib/CodeGen/LiveDebugVariables.h index 07dd3a83866f..9998ce9e8dad 100644 --- a/llvm/lib/CodeGen/LiveDebugVariables.h +++ b/llvm/lib/CodeGen/LiveDebugVariables.h @@ -56,6 +56,11 @@ private: bool runOnMachineFunction(MachineFunction &) override; void releaseMemory() override; void getAnalysisUsage(AnalysisUsage &) const override; + + MachineFunctionProperties getSetProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::TracksDebugUserValues); + } }; } // end namespace llvm diff --git a/llvm/lib/CodeGen/LiveRangeEdit.cpp b/llvm/lib/CodeGen/LiveRangeEdit.cpp index 6380c4bfd6e6..05768140cbdf 100644 --- a/llvm/lib/CodeGen/LiveRangeEdit.cpp +++ b/llvm/lib/CodeGen/LiveRangeEdit.cpp @@ -133,6 +133,22 @@ bool LiveRangeEdit::allUsesAvailableAt(const MachineInstr *OrigMI, if (OVNI != li.getVNInfoAt(UseIdx)) return false; + + // Check that subrange is live at UseIdx. 
+ if (MO.getSubReg()) { + const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); + LaneBitmask LM = TRI->getSubRegIndexLaneMask(MO.getSubReg()); + for (LiveInterval::SubRange &SR : li.subranges()) { + if ((SR.LaneMask & LM).none()) + continue; + if (!SR.liveAt(UseIdx)) + return false; + // Early exit if all used lanes are checked. No need to continue. + LM &= ~SR.LaneMask; + if (LM.none()) + break; + } + } } return true; } diff --git a/llvm/lib/CodeGen/LiveVariables.cpp b/llvm/lib/CodeGen/LiveVariables.cpp index e8744797707b..94bdfab5e5e0 100644 --- a/llvm/lib/CodeGen/LiveVariables.cpp +++ b/llvm/lib/CodeGen/LiveVariables.cpp @@ -141,8 +141,8 @@ void LiveVariables::HandleVirtRegUse(Register Reg, MachineBasicBlock *MBB, } #ifndef NDEBUG - for (unsigned i = 0, e = VRInfo.Kills.size(); i != e; ++i) - assert(VRInfo.Kills[i]->getParent() != MBB && "entry should be at end!"); + for (MachineInstr *Kill : VRInfo.Kills) + assert(Kill->getParent() != MBB && "entry should be at end!"); #endif // This situation can occur: @@ -534,8 +534,7 @@ void LiveVariables::runOnInstr(MachineInstr &MI, MachineBasicBlock *MBB = MI.getParent(); // Process all uses. - for (unsigned i = 0, e = UseRegs.size(); i != e; ++i) { - unsigned MOReg = UseRegs[i]; + for (unsigned MOReg : UseRegs) { if (Register::isVirtualRegister(MOReg)) HandleVirtRegUse(MOReg, MBB, MI); else if (!MRI->isReserved(MOReg)) @@ -543,12 +542,11 @@ void LiveVariables::runOnInstr(MachineInstr &MI, } // Process all masked registers. (Call clobbers). - for (unsigned i = 0, e = RegMasks.size(); i != e; ++i) - HandleRegMask(MI.getOperand(RegMasks[i])); + for (unsigned Mask : RegMasks) + HandleRegMask(MI.getOperand(Mask)); // Process all defs. 
- for (unsigned i = 0, e = DefRegs.size(); i != e; ++i) { - unsigned MOReg = DefRegs[i]; + for (unsigned MOReg : DefRegs) { if (Register::isVirtualRegister(MOReg)) HandleVirtRegDef(MOReg, MI); else if (!MRI->isReserved(MOReg)) diff --git a/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp b/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp index ee2387d1e8e6..37fd3e4853ac 100644 --- a/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp +++ b/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp @@ -210,7 +210,11 @@ void LocalStackSlotPass::calculateFrameObjectOffsets(MachineFunction &Fn) { StackObjSet SmallArrayObjs; StackObjSet AddrOfObjs; - AdjustStackOffset(MFI, StackProtectorFI, Offset, StackGrowsDown, MaxAlign); + // Only place the stack protector in the local stack area if the target + // allows it. + if (TFI.isStackIdSafeForLocalArea(MFI.getStackID(StackProtectorFI))) + AdjustStackOffset(MFI, StackProtectorFI, Offset, StackGrowsDown, + MaxAlign); // Assign large stack objects first. for (unsigned i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) { diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp index 6221b5929301..d0323eaf3d78 100644 --- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp @@ -350,18 +350,33 @@ void MIRParserImpl::computeFunctionProperties(MachineFunction &MF) { bool HasPHI = false; bool HasInlineAsm = false; + bool AllTiedOpsRewritten = true, HasTiedOps = false; for (const MachineBasicBlock &MBB : MF) { for (const MachineInstr &MI : MBB) { if (MI.isPHI()) HasPHI = true; if (MI.isInlineAsm()) HasInlineAsm = true; + for (unsigned I = 0; I < MI.getNumOperands(); ++I) { + const MachineOperand &MO = MI.getOperand(I); + if (!MO.isReg() || !MO.getReg()) + continue; + unsigned DefIdx; + if (MO.isUse() && MI.isRegTiedToDefOperand(I, &DefIdx)) { + HasTiedOps = true; + if (MO.getReg() != MI.getOperand(DefIdx).getReg()) + AllTiedOpsRewritten = false; + } + } } } if (!HasPHI) 
Properties.set(MachineFunctionProperties::Property::NoPHIs); MF.setHasInlineAsm(HasInlineAsm); + if (HasTiedOps && AllTiedOpsRewritten) + Properties.set(MachineFunctionProperties::Property::TiedOpsRewritten); + if (isSSA(MF)) Properties.set(MachineFunctionProperties::Property::IsSSA); else @@ -457,6 +472,9 @@ MIRParserImpl::initializeMachineFunction(const yaml::MachineFunction &YamlMF, if (YamlMF.FailsVerification) MF.getProperties().set( MachineFunctionProperties::Property::FailsVerification); + if (YamlMF.TracksDebugUserValues) + MF.getProperties().set( + MachineFunctionProperties::Property::TracksDebugUserValues); PerFunctionMIParsingState PFS(MF, SM, IRSlots, *Target); if (parseRegisterInfo(PFS, YamlMF)) diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp index f1369396e37f..dc72f83ad0e4 100644 --- a/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/llvm/lib/CodeGen/MIRPrinter.cpp @@ -219,6 +219,8 @@ void MIRPrinter::print(const MachineFunction &MF) { MachineFunctionProperties::Property::FailedISel); YamlMF.FailsVerification = MF.getProperties().hasProperty( MachineFunctionProperties::Property::FailsVerification); + YamlMF.TracksDebugUserValues = MF.getProperties().hasProperty( + MachineFunctionProperties::Property::TracksDebugUserValues); convert(YamlMF, MF.getRegInfo(), MF.getSubtarget().getRegisterInfo()); MachineModuleSlotTracker MST(&MF); diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp index 23c511aaa056..8c9d00d08c6a 100644 --- a/llvm/lib/CodeGen/MachineBasicBlock.cpp +++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp @@ -193,7 +193,7 @@ void ilist_traits<MachineInstr>::transferNodesFromList(ilist_traits &FromList, void ilist_traits<MachineInstr>::deleteNode(MachineInstr *MI) { assert(!MI->getParent() && "MI is still in a block!"); - Parent->getParent()->DeleteMachineInstr(MI); + Parent->getParent()->deleteMachineInstr(MI); } MachineBasicBlock::iterator MachineBasicBlock::getFirstNonPHI() { @@ -1038,16 
+1038,15 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge( // Collect a list of virtual registers killed by the terminators. SmallVector<Register, 4> KilledRegs; if (LV) - for (instr_iterator I = getFirstInstrTerminator(), E = instr_end(); - I != E; ++I) { - MachineInstr *MI = &*I; - for (MachineOperand &MO : MI->operands()) { + for (MachineInstr &MI : + llvm::make_range(getFirstInstrTerminator(), instr_end())) { + for (MachineOperand &MO : MI.operands()) { if (!MO.isReg() || MO.getReg() == 0 || !MO.isUse() || !MO.isKill() || MO.isUndef()) continue; Register Reg = MO.getReg(); if (Register::isPhysicalRegister(Reg) || - LV->getVarInfo(Reg).removeKill(*MI)) { + LV->getVarInfo(Reg).removeKill(MI)) { KilledRegs.push_back(Reg); LLVM_DEBUG(dbgs() << "Removing terminator kill: " << MI); MO.setIsKill(false); @@ -1057,11 +1056,9 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge( SmallVector<Register, 4> UsedRegs; if (LIS) { - for (instr_iterator I = getFirstInstrTerminator(), E = instr_end(); - I != E; ++I) { - MachineInstr *MI = &*I; - - for (const MachineOperand &MO : MI->operands()) { + for (MachineInstr &MI : + llvm::make_range(getFirstInstrTerminator(), instr_end())) { + for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg() || MO.getReg() == 0) continue; @@ -1078,9 +1075,9 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge( // SlotIndexes. 
SmallVector<MachineInstr*, 4> Terminators; if (Indexes) { - for (instr_iterator I = getFirstInstrTerminator(), E = instr_end(); - I != E; ++I) - Terminators.push_back(&*I); + for (MachineInstr &MI : + llvm::make_range(getFirstInstrTerminator(), instr_end())) + Terminators.push_back(&MI); } // Since we replaced all uses of Succ with NMBB, that should also be treated @@ -1091,9 +1088,9 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge( if (Indexes) { SmallVector<MachineInstr*, 4> NewTerminators; - for (instr_iterator I = getFirstInstrTerminator(), E = instr_end(); - I != E; ++I) - NewTerminators.push_back(&*I); + for (MachineInstr &MI : + llvm::make_range(getFirstInstrTerminator(), instr_end())) + NewTerminators.push_back(&MI); for (MachineInstr *Terminator : Terminators) { if (!is_contained(NewTerminators, Terminator)) diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp index 8a1b4031642d..692587cd58fa 100644 --- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -61,6 +61,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Utils/CodeLayout.h" #include <algorithm> #include <cassert> #include <cstdint> @@ -193,6 +194,11 @@ static cl::opt<unsigned> TriangleChainCount( cl::init(2), cl::Hidden); +static cl::opt<bool> EnableExtTspBlockPlacement( + "enable-ext-tsp-block-placement", cl::Hidden, cl::init(false), + cl::desc("Enable machine block placement based on the ext-tsp model, " + "optimizing I-cache utilization.")); + namespace llvm { extern cl::opt<unsigned> StaticLikelyProb; extern cl::opt<unsigned> ProfileLikelyProb; @@ -557,6 +563,15 @@ class MachineBlockPlacement : public MachineFunctionPass { /// but a local analysis would not find them. void precomputeTriangleChains(); + /// Apply a post-processing step optimizing block placement. 
+ void applyExtTsp(); + + /// Modify the existing block placement in the function and adjust all jumps. + void assignBlockOrder(const std::vector<const MachineBasicBlock *> &NewOrder); + + /// Create a single CFG chain from the current block order. + void createCFGChainExtTsp(); + public: static char ID; // Pass identification, replacement for typeid @@ -3387,6 +3402,15 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { } } + // Apply a post-processing optimizing block placement. + if (MF.size() >= 3 && EnableExtTspBlockPlacement) { + // Find a new placement and modify the layout of the blocks in the function. + applyExtTsp(); + + // Re-create CFG chain so that we can optimizeBranches and alignBlocks. + createCFGChainExtTsp(); + } + optimizeBranches(); alignBlocks(); @@ -3413,12 +3437,147 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { MBFI->view("MBP." + MF.getName(), false); } - // We always return true as we have no way to track whether the final order // differs from the original order. return true; } +void MachineBlockPlacement::applyExtTsp() { + // Prepare data; blocks are indexed by their index in the current ordering. + DenseMap<const MachineBasicBlock *, uint64_t> BlockIndex; + BlockIndex.reserve(F->size()); + std::vector<const MachineBasicBlock *> CurrentBlockOrder; + CurrentBlockOrder.reserve(F->size()); + size_t NumBlocks = 0; + for (const MachineBasicBlock &MBB : *F) { + BlockIndex[&MBB] = NumBlocks++; + CurrentBlockOrder.push_back(&MBB); + } + + auto BlockSizes = std::vector<uint64_t>(F->size()); + auto BlockCounts = std::vector<uint64_t>(F->size()); + DenseMap<std::pair<uint64_t, uint64_t>, uint64_t> JumpCounts; + for (MachineBasicBlock &MBB : *F) { + // Getting the block frequency. 
+ BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB); + BlockCounts[BlockIndex[&MBB]] = BlockFreq.getFrequency(); + // Getting the block size: + // - approximate the size of an instruction by 4 bytes, and + // - ignore debug instructions. + // Note: getting the exact size of each block is target-dependent and can be + // done by extending the interface of MCCodeEmitter. Experimentally we do + // not see a perf improvement with the exact block sizes. + auto NonDbgInsts = + instructionsWithoutDebug(MBB.instr_begin(), MBB.instr_end()); + int NumInsts = std::distance(NonDbgInsts.begin(), NonDbgInsts.end()); + BlockSizes[BlockIndex[&MBB]] = 4 * NumInsts; + // Getting jump frequencies. + for (MachineBasicBlock *Succ : MBB.successors()) { + auto EP = MBPI->getEdgeProbability(&MBB, Succ); + BlockFrequency EdgeFreq = BlockFreq * EP; + auto Edge = std::make_pair(BlockIndex[&MBB], BlockIndex[Succ]); + JumpCounts[Edge] = EdgeFreq.getFrequency(); + } + } + + LLVM_DEBUG(dbgs() << "Applying ext-tsp layout for |V| = " << F->size() + << " with profile = " << F->getFunction().hasProfileData() + << " (" << F->getName().str() << ")" + << "\n"); + LLVM_DEBUG( + dbgs() << format(" original layout score: %0.2f\n", + calcExtTspScore(BlockSizes, BlockCounts, JumpCounts))); + + // Run the layout algorithm. + auto NewOrder = applyExtTspLayout(BlockSizes, BlockCounts, JumpCounts); + std::vector<const MachineBasicBlock *> NewBlockOrder; + NewBlockOrder.reserve(F->size()); + for (uint64_t Node : NewOrder) { + NewBlockOrder.push_back(CurrentBlockOrder[Node]); + } + LLVM_DEBUG(dbgs() << format(" optimized layout score: %0.2f\n", + calcExtTspScore(NewOrder, BlockSizes, BlockCounts, + JumpCounts))); + + // Assign new block order. 
+ assignBlockOrder(NewBlockOrder); +} + +void MachineBlockPlacement::assignBlockOrder( + const std::vector<const MachineBasicBlock *> &NewBlockOrder) { + assert(F->size() == NewBlockOrder.size() && "Incorrect size of block order"); + F->RenumberBlocks(); + + bool HasChanges = false; + for (size_t I = 0; I < NewBlockOrder.size(); I++) { + if (NewBlockOrder[I] != F->getBlockNumbered(I)) { + HasChanges = true; + break; + } + } + // Stop early if the new block order is identical to the existing one. + if (!HasChanges) + return; + + SmallVector<MachineBasicBlock *, 4> PrevFallThroughs(F->getNumBlockIDs()); + for (auto &MBB : *F) { + PrevFallThroughs[MBB.getNumber()] = MBB.getFallThrough(); + } + + // Sort basic blocks in the function according to the computed order. + DenseMap<const MachineBasicBlock *, size_t> NewIndex; + for (const MachineBasicBlock *MBB : NewBlockOrder) { + NewIndex[MBB] = NewIndex.size(); + } + F->sort([&](MachineBasicBlock &L, MachineBasicBlock &R) { + return NewIndex[&L] < NewIndex[&R]; + }); + + // Update basic block branches by inserting explicit fallthrough branches + // when required and re-optimize branches when possible. + const TargetInstrInfo *TII = F->getSubtarget().getInstrInfo(); + SmallVector<MachineOperand, 4> Cond; + for (auto &MBB : *F) { + MachineFunction::iterator NextMBB = std::next(MBB.getIterator()); + MachineFunction::iterator EndIt = MBB.getParent()->end(); + auto *FTMBB = PrevFallThroughs[MBB.getNumber()]; + // If this block had a fallthrough before we need an explicit unconditional + // branch to that block if the fallthrough block is not adjacent to the + // block in the new order. + if (FTMBB && (NextMBB == EndIt || &*NextMBB != FTMBB)) { + TII->insertUnconditionalBranch(MBB, FTMBB, MBB.findBranchDebugLoc()); + } + + // It might be possible to optimize branches by flipping the condition. 
+ Cond.clear(); + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; + if (TII->analyzeBranch(MBB, TBB, FBB, Cond)) + continue; + MBB.updateTerminator(FTMBB); + } + +#ifndef NDEBUG + // Make sure we correctly constructed all branches. + F->verify(this, "After optimized block reordering"); +#endif +} + +void MachineBlockPlacement::createCFGChainExtTsp() { + BlockToChain.clear(); + ComputedEdges.clear(); + ChainAllocator.DestroyAll(); + + MachineBasicBlock *HeadBB = &F->front(); + BlockChain *FunctionChain = + new (ChainAllocator.Allocate()) BlockChain(BlockToChain, HeadBB); + + for (MachineBasicBlock &MBB : *F) { + if (HeadBB == &MBB) + continue; // Ignore head of the chain + FunctionChain->merge(&MBB, nullptr); + } +} + namespace { /// A pass to compute block placement statistics. diff --git a/llvm/lib/CodeGen/MachineCombiner.cpp b/llvm/lib/CodeGen/MachineCombiner.cpp index e2b6cfe55c16..72ab9ee4f388 100644 --- a/llvm/lib/CodeGen/MachineCombiner.cpp +++ b/llvm/lib/CodeGen/MachineCombiner.cpp @@ -485,7 +485,7 @@ static void insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI, MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr); for (auto *InstrPtr : DelInstrs) { - InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval(); + InstrPtr->eraseFromParent(); // Erase all LiveRegs defined by the removed instruction for (auto I = RegUnits.begin(); I != RegUnits.end(); ) { if (I->MI == InstrPtr) @@ -693,7 +693,7 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) { // use for them. 
MachineFunction *MF = MBB->getParent(); for (auto *InstrPtr : InsInstrs) - MF->DeleteMachineInstr(InstrPtr); + MF->deleteMachineInstr(InstrPtr); } InstrIdxForVirtReg.clear(); } diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp index 7c83bacd80d9..57fbe4112e47 100644 --- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -847,31 +847,27 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock( LLVM_DEBUG(dbgs() << "MCP: BackwardCopyPropagateBlock " << MBB.getName() << "\n"); - for (MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend(); - I != E;) { - MachineInstr *MI = &*I; - ++I; - + for (MachineInstr &MI : llvm::make_early_inc_range(llvm::reverse(MBB))) { // Ignore non-trivial COPYs. - if (MI->isCopy() && MI->getNumOperands() == 2 && - !TRI->regsOverlap(MI->getOperand(0).getReg(), - MI->getOperand(1).getReg())) { + if (MI.isCopy() && MI.getNumOperands() == 2 && + !TRI->regsOverlap(MI.getOperand(0).getReg(), + MI.getOperand(1).getReg())) { - MCRegister Def = MI->getOperand(0).getReg().asMCReg(); - MCRegister Src = MI->getOperand(1).getReg().asMCReg(); + MCRegister Def = MI.getOperand(0).getReg().asMCReg(); + MCRegister Src = MI.getOperand(1).getReg().asMCReg(); // Unlike forward cp, we don't invoke propagateDefs here, // just let forward cp do COPY-to-COPY propagation. - if (isBackwardPropagatableCopy(*MI, *MRI)) { + if (isBackwardPropagatableCopy(MI, *MRI)) { Tracker.invalidateRegister(Src, *TRI); Tracker.invalidateRegister(Def, *TRI); - Tracker.trackCopy(MI, *TRI); + Tracker.trackCopy(&MI, *TRI); continue; } } // Invalidate any earlyclobber regs first. 
- for (const MachineOperand &MO : MI->operands()) + for (const MachineOperand &MO : MI.operands()) if (MO.isReg() && MO.isEarlyClobber()) { MCRegister Reg = MO.getReg().asMCReg(); if (!Reg) @@ -879,8 +875,8 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock( Tracker.invalidateRegister(Reg, *TRI); } - propagateDefs(*MI); - for (const MachineOperand &MO : MI->operands()) { + propagateDefs(MI); + for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg()) continue; @@ -898,7 +894,7 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock( for (MCRegUnitIterator RUI(MO.getReg().asMCReg(), TRI); RUI.isValid(); ++RUI) { if (auto *Copy = Tracker.findCopyDefViaUnit(*RUI, *TRI)) { - CopyDbgUsers[Copy].insert(MI); + CopyDbgUsers[Copy].insert(&MI); } } } else { diff --git a/llvm/lib/CodeGen/MachineCycleAnalysis.cpp b/llvm/lib/CodeGen/MachineCycleAnalysis.cpp new file mode 100644 index 000000000000..42a5e2b7af01 --- /dev/null +++ b/llvm/lib/CodeGen/MachineCycleAnalysis.cpp @@ -0,0 +1,113 @@ +//===- MachineCycleAnalysis.cpp - Compute CycleInfo for Machine IR --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/MachineCycleAnalysis.h" +#include "llvm/ADT/GenericCycleImpl.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineSSAContext.h" +#include "llvm/InitializePasses.h" + +using namespace llvm; + +template class llvm::GenericCycleInfo<llvm::MachineSSAContext>; +template class llvm::GenericCycle<llvm::MachineSSAContext>; + +namespace { + +/// Legacy analysis pass which computes a \ref MachineCycleInfo. 
+class MachineCycleInfoWrapperPass : public MachineFunctionPass { + MachineFunction *F = nullptr; + MachineCycleInfo CI; + +public: + static char ID; + + MachineCycleInfoWrapperPass(); + + MachineCycleInfo &getCycleInfo() { return CI; } + const MachineCycleInfo &getCycleInfo() const { return CI; } + + bool runOnMachineFunction(MachineFunction &F) override; + void getAnalysisUsage(AnalysisUsage &AU) const override; + void releaseMemory() override; + void print(raw_ostream &OS, const Module *M = nullptr) const override; + + // TODO: verify analysis +}; + +class MachineCycleInfoPrinterPass : public MachineFunctionPass { +public: + static char ID; + + MachineCycleInfoPrinterPass(); + + bool runOnMachineFunction(MachineFunction &F) override; + void getAnalysisUsage(AnalysisUsage &AU) const override; +}; + +} // namespace + +char MachineCycleInfoWrapperPass::ID = 0; + +MachineCycleInfoWrapperPass::MachineCycleInfoWrapperPass() + : MachineFunctionPass(ID) { + initializeMachineCycleInfoWrapperPassPass(*PassRegistry::getPassRegistry()); +} + +INITIALIZE_PASS_BEGIN(MachineCycleInfoWrapperPass, "machine-cycles", + "Machine Cycle Info Analysis", true, true) +INITIALIZE_PASS_END(MachineCycleInfoWrapperPass, "machine-cycles", + "Machine Cycle Info Analysis", true, true) + +void MachineCycleInfoWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +bool MachineCycleInfoWrapperPass::runOnMachineFunction(MachineFunction &Func) { + CI.clear(); + + F = &Func; + CI.compute(Func); + return false; +} + +void MachineCycleInfoWrapperPass::print(raw_ostream &OS, const Module *) const { + OS << "MachineCycleInfo for function: " << F->getName() << "\n"; + CI.print(OS); +} + +void MachineCycleInfoWrapperPass::releaseMemory() { + CI.clear(); + F = nullptr; +} + +char MachineCycleInfoPrinterPass::ID = 0; + +MachineCycleInfoPrinterPass::MachineCycleInfoPrinterPass() + : MachineFunctionPass(ID) { + 
initializeMachineCycleInfoPrinterPassPass(*PassRegistry::getPassRegistry()); +} + +INITIALIZE_PASS_BEGIN(MachineCycleInfoPrinterPass, "print-machine-cycles", + "Print Machine Cycle Info Analysis", true, true) +INITIALIZE_PASS_DEPENDENCY(MachineCycleInfoWrapperPass) +INITIALIZE_PASS_END(MachineCycleInfoPrinterPass, "print-machine-cycles", + "Print Machine Cycle Info Analysis", true, true) + +void MachineCycleInfoPrinterPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequired<MachineCycleInfoWrapperPass>(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +bool MachineCycleInfoPrinterPass::runOnMachineFunction(MachineFunction &F) { + auto &CI = getAnalysis<MachineCycleInfoWrapperPass>(); + CI.print(errs()); + return false; +} diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp index 310c2721c3bd..81ed3d0e93ff 100644 --- a/llvm/lib/CodeGen/MachineFunction.cpp +++ b/llvm/lib/CodeGen/MachineFunction.cpp @@ -89,6 +89,7 @@ static cl::opt<unsigned> AlignAllFunctions( static const char *getPropertyName(MachineFunctionProperties::Property Prop) { using P = MachineFunctionProperties::Property; + // clang-format off switch(Prop) { case P::FailedISel: return "FailedISel"; case P::IsSSA: return "IsSSA"; @@ -100,7 +101,9 @@ static const char *getPropertyName(MachineFunctionProperties::Property Prop) { case P::TracksLiveness: return "TracksLiveness"; case P::TiedOpsRewritten: return "TiedOpsRewritten"; case P::FailsVerification: return "FailsVerification"; + case P::TracksDebugUserValues: return "TracksDebugUserValues"; } + // clang-format on llvm_unreachable("Invalid machine function property"); } @@ -125,7 +128,7 @@ void MachineFunctionProperties::print(raw_ostream &OS) const { MachineFunctionInfo::~MachineFunctionInfo() = default; void ilist_alloc_traits<MachineBasicBlock>::deleteNode(MachineBasicBlock *MBB) { - MBB->getParent()->DeleteMachineBasicBlock(MBB); + 
MBB->getParent()->deleteMachineBasicBlock(MBB); } static inline unsigned getFnStackAlignment(const TargetSubtargetInfo *STI, @@ -347,10 +350,10 @@ void MachineFunction::assignBeginEndSections() { /// Allocate a new MachineInstr. Use this instead of `new MachineInstr'. MachineInstr *MachineFunction::CreateMachineInstr(const MCInstrDesc &MCID, - const DebugLoc &DL, + DebugLoc DL, bool NoImplicit) { return new (InstructionRecycler.Allocate<MachineInstr>(Allocator)) - MachineInstr(*this, MCID, DL, NoImplicit); + MachineInstr(*this, MCID, std::move(DL), NoImplicit); } /// Create a new MachineInstr which is a copy of the 'Orig' instruction, @@ -361,8 +364,9 @@ MachineFunction::CloneMachineInstr(const MachineInstr *Orig) { MachineInstr(*this, *Orig); } -MachineInstr &MachineFunction::CloneMachineInstrBundle(MachineBasicBlock &MBB, - MachineBasicBlock::iterator InsertBefore, const MachineInstr &Orig) { +MachineInstr &MachineFunction::cloneMachineInstrBundle( + MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, + const MachineInstr &Orig) { MachineInstr *FirstClone = nullptr; MachineBasicBlock::const_instr_iterator I = Orig.getIterator(); while (true) { @@ -390,8 +394,7 @@ MachineInstr &MachineFunction::CloneMachineInstrBundle(MachineBasicBlock &MBB, /// /// This function also serves as the MachineInstr destructor - the real /// ~MachineInstr() destructor must be empty. -void -MachineFunction::DeleteMachineInstr(MachineInstr *MI) { +void MachineFunction::deleteMachineInstr(MachineInstr *MI) { // Verify that a call site info is at valid state. This assertion should // be triggered during the implementation of support for the // call site info of a new architecture. If the assertion is triggered, @@ -418,8 +421,7 @@ MachineFunction::CreateMachineBasicBlock(const BasicBlock *bb) { } /// Delete the given MachineBasicBlock. 
-void -MachineFunction::DeleteMachineBasicBlock(MachineBasicBlock *MBB) { +void MachineFunction::deleteMachineBasicBlock(MachineBasicBlock *MBB) { assert(MBB->getParent() == this && "MBB parent mismatch!"); // Clean up any references to MBB in jump tables before deleting it. if (JumpTableInfo) @@ -769,8 +771,8 @@ MCSymbol *MachineFunction::addLandingPad(MachineBasicBlock *LandingPad) { void MachineFunction::addCatchTypeInfo(MachineBasicBlock *LandingPad, ArrayRef<const GlobalValue *> TyInfo) { LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad); - for (unsigned N = TyInfo.size(); N; --N) - LP.TypeIds.push_back(getTypeIDFor(TyInfo[N - 1])); + for (const GlobalValue *GV : llvm::reverse(TyInfo)) + LP.TypeIds.push_back(getTypeIDFor(GV)); } void MachineFunction::addFilterTypeInfo(MachineBasicBlock *LandingPad, @@ -1404,10 +1406,10 @@ MachineConstantPool::~MachineConstantPool() { // A constant may be a member of both Constants and MachineCPVsSharingEntries, // so keep track of which we've deleted to avoid double deletions. DenseSet<MachineConstantPoolValue*> Deleted; - for (unsigned i = 0, e = Constants.size(); i != e; ++i) - if (Constants[i].isMachineConstantPoolEntry()) { - Deleted.insert(Constants[i].Val.MachineCPVal); - delete Constants[i].Val.MachineCPVal; + for (const MachineConstantPoolEntry &C : Constants) + if (C.isMachineConstantPoolEntry()) { + Deleted.insert(C.Val.MachineCPVal); + delete C.Val.MachineCPVal; } for (MachineConstantPoolValue *CPV : MachineCPVsSharingEntries) { if (Deleted.count(CPV) == 0) diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp index aaa80432d2f2..85b266afceef 100644 --- a/llvm/lib/CodeGen/MachineInstr.cpp +++ b/llvm/lib/CodeGen/MachineInstr.cpp @@ -115,10 +115,10 @@ void MachineInstr::addImplicitDefUseOperands(MachineFunction &MF) { /// MachineInstr ctor - This constructor creates a MachineInstr and adds the /// implicit operands. 
It reserves space for the number of operands specified by /// the MCInstrDesc. -MachineInstr::MachineInstr(MachineFunction &MF, const MCInstrDesc &tid, - DebugLoc dl, bool NoImp) - : MCID(&tid), debugLoc(std::move(dl)), DebugInstrNum(0) { - assert(debugLoc.hasTrivialDestructor() && "Expected trivial destructor"); +MachineInstr::MachineInstr(MachineFunction &MF, const MCInstrDesc &TID, + DebugLoc DL, bool NoImp) + : MCID(&TID), DbgLoc(std::move(DL)), DebugInstrNum(0) { + assert(DbgLoc.hasTrivialDestructor() && "Expected trivial destructor"); // Reserve space for the expected number of operands. if (unsigned NumOps = MCID->getNumOperands() + @@ -135,9 +135,9 @@ MachineInstr::MachineInstr(MachineFunction &MF, const MCInstrDesc &tid, /// Does not copy the number from debug instruction numbering, to preserve /// uniqueness. MachineInstr::MachineInstr(MachineFunction &MF, const MachineInstr &MI) - : MCID(&MI.getDesc()), Info(MI.Info), debugLoc(MI.getDebugLoc()), + : MCID(&MI.getDesc()), Info(MI.Info), DbgLoc(MI.getDebugLoc()), DebugInstrNum(0) { - assert(debugLoc.hasTrivialDestructor() && "Expected trivial destructor"); + assert(DbgLoc.hasTrivialDestructor() && "Expected trivial destructor"); CapOperands = OperandCapacity::get(MI.getNumOperands()); Operands = MF.allocateOperandArray(CapOperands); @@ -682,26 +682,6 @@ void MachineInstr::eraseFromParent() { getParent()->erase(this); } -void MachineInstr::eraseFromParentAndMarkDBGValuesForRemoval() { - assert(getParent() && "Not embedded in a basic block!"); - MachineBasicBlock *MBB = getParent(); - MachineFunction *MF = MBB->getParent(); - assert(MF && "Not embedded in a function!"); - - MachineInstr *MI = (MachineInstr *)this; - MachineRegisterInfo &MRI = MF->getRegInfo(); - - for (const MachineOperand &MO : MI->operands()) { - if (!MO.isReg() || !MO.isDef()) - continue; - Register Reg = MO.getReg(); - if (!Reg.isVirtual()) - continue; - MRI.markUsesInDebugValueAsUndef(Reg); - } - MI->eraseFromParent(); -} - void 
MachineInstr::eraseFromBundle() { assert(getParent() && "Not embedded in a basic block!"); getParent()->erase_instr(this); diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp index 8d6459a627fa..762395542b40 100644 --- a/llvm/lib/CodeGen/MachinePipeliner.cpp +++ b/llvm/lib/CodeGen/MachinePipeliner.cpp @@ -649,7 +649,7 @@ void SwingSchedulerDAG::schedule() { /// Clean up after the software pipeliner runs. void SwingSchedulerDAG::finishBlock() { for (auto &KV : NewMIs) - MF.DeleteMachineInstr(KV.second); + MF.deleteMachineInstr(KV.second); NewMIs.clear(); // Call the superclass. @@ -1101,17 +1101,15 @@ unsigned SwingSchedulerDAG::calculateResMII() { // Sort the instructions by the number of available choices for scheduling, // least to most. Use the number of critical resources as the tie breaker. FuncUnitSorter FUS = FuncUnitSorter(MF.getSubtarget()); - for (MachineBasicBlock::iterator I = MBB->getFirstNonPHI(), - E = MBB->getFirstTerminator(); - I != E; ++I) - FUS.calcCriticalResources(*I); + for (MachineInstr &MI : + llvm::make_range(MBB->getFirstNonPHI(), MBB->getFirstTerminator())) + FUS.calcCriticalResources(MI); PriorityQueue<MachineInstr *, std::vector<MachineInstr *>, FuncUnitSorter> FuncUnitOrder(FUS); - for (MachineBasicBlock::iterator I = MBB->getFirstNonPHI(), - E = MBB->getFirstTerminator(); - I != E; ++I) - FuncUnitOrder.push(&*I); + for (MachineInstr &MI : + llvm::make_range(MBB->getFirstNonPHI(), MBB->getFirstTerminator())) + FuncUnitOrder.push(&MI); while (!FuncUnitOrder.empty()) { MachineInstr *MI = FuncUnitOrder.top(); @@ -1192,14 +1190,10 @@ unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) { /// but we do this to find the circuits, and then change them back. 
static void swapAntiDependences(std::vector<SUnit> &SUnits) { SmallVector<std::pair<SUnit *, SDep>, 8> DepsAdded; - for (unsigned i = 0, e = SUnits.size(); i != e; ++i) { - SUnit *SU = &SUnits[i]; - for (SUnit::pred_iterator IP = SU->Preds.begin(), EP = SU->Preds.end(); - IP != EP; ++IP) { - if (IP->getKind() != SDep::Anti) - continue; - DepsAdded.push_back(std::make_pair(SU, *IP)); - } + for (SUnit &SU : SUnits) { + for (SDep &Pred : SU.Preds) + if (Pred.getKind() == SDep::Anti) + DepsAdded.push_back(std::make_pair(&SU, Pred)); } for (std::pair<SUnit *, SDep> &P : DepsAdded) { // Remove this anti dependency and add one in the reverse direction. @@ -1471,27 +1465,23 @@ void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) { } // Compute ALAP, ZeroLatencyHeight, and MOV. - for (ScheduleDAGTopologicalSort::const_reverse_iterator I = Topo.rbegin(), - E = Topo.rend(); - I != E; ++I) { + for (int I : llvm::reverse(Topo)) { int alap = maxASAP; int zeroLatencyHeight = 0; - SUnit *SU = &SUnits[*I]; - for (SUnit::const_succ_iterator IS = SU->Succs.begin(), - ES = SU->Succs.end(); - IS != ES; ++IS) { - SUnit *succ = IS->getSUnit(); - if (IS->getLatency() == 0) + SUnit *SU = &SUnits[I]; + for (const SDep &S : SU->Succs) { + SUnit *succ = S.getSUnit(); + if (S.getLatency() == 0) zeroLatencyHeight = std::max(zeroLatencyHeight, getZeroLatencyHeight(succ) + 1); - if (ignoreDependence(*IS, true)) + if (ignoreDependence(S, true)) continue; - alap = std::min(alap, (int)(getALAP(succ) - IS->getLatency() + - getDistance(SU, succ, *IS) * MII)); + alap = std::min(alap, (int)(getALAP(succ) - S.getLatency() + + getDistance(SU, succ, S) * MII)); } - ScheduleInfo[*I].ALAP = alap; - ScheduleInfo[*I].ZeroLatencyHeight = zeroLatencyHeight; + ScheduleInfo[I].ALAP = alap; + ScheduleInfo[I].ZeroLatencyHeight = zeroLatencyHeight; } // After computing the node functions, compute the summary for each node set. 
@@ -1548,9 +1538,8 @@ static bool succ_L(SetVector<SUnit *> &NodeOrder, SmallSetVector<SUnit *, 8> &Succs, const NodeSet *S = nullptr) { Succs.clear(); - for (SetVector<SUnit *>::iterator I = NodeOrder.begin(), E = NodeOrder.end(); - I != E; ++I) { - for (SDep &Succ : (*I)->Succs) { + for (const SUnit *SU : NodeOrder) { + for (const SDep &Succ : SU->Succs) { if (S && S->count(Succ.getSUnit()) == 0) continue; if (ignoreDependence(Succ, false)) @@ -1558,7 +1547,7 @@ static bool succ_L(SetVector<SUnit *> &NodeOrder, if (NodeOrder.count(Succ.getSUnit()) == 0) Succs.insert(Succ.getSUnit()); } - for (SDep &Pred : (*I)->Preds) { + for (const SDep &Pred : SU->Preds) { if (Pred.getKind() != SDep::Anti) continue; if (S && S->count(Pred.getSUnit()) == 0) @@ -2202,7 +2191,7 @@ bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI, MachineInstr *NewMI = MF.CloneMachineInstr(MI); NewMI->getOperand(OffsetPosLd).setImm(LoadOffset + StoreOffset); bool Disjoint = TII->areMemAccessesTriviallyDisjoint(*NewMI, *PrevDef); - MF.DeleteMachineInstr(NewMI); + MF.deleteMachineInstr(NewMI); if (!Disjoint) return false; @@ -2885,10 +2874,8 @@ void SMSchedule::finalizeSchedule(SwingSchedulerDAG *SSD) { ++stage) { std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[cycle + (stage * InitiationInterval)]; - for (std::deque<SUnit *>::reverse_iterator I = cycleInstrs.rbegin(), - E = cycleInstrs.rend(); - I != E; ++I) - ScheduledInstrs[cycle].push_front(*I); + for (SUnit *SU : llvm::reverse(cycleInstrs)) + ScheduledInstrs[cycle].push_front(SU); } } @@ -2899,10 +2886,8 @@ void SMSchedule::finalizeSchedule(SwingSchedulerDAG *SSD) { // Change the registers in instruction as specified in the InstrChanges // map. We need to use the new registers to create the correct order. 
- for (int i = 0, e = SSD->SUnits.size(); i != e; ++i) { - SUnit *SU = &SSD->SUnits[i]; - SSD->applyInstrChange(SU->getInstr(), *this); - } + for (const SUnit &SU : SSD->SUnits) + SSD->applyInstrChange(SU.getInstr(), *this); // Reorder the instructions in each cycle to fix and improve the // generated code. diff --git a/llvm/lib/CodeGen/MachineSSAContext.cpp b/llvm/lib/CodeGen/MachineSSAContext.cpp new file mode 100644 index 000000000000..8db893535daf --- /dev/null +++ b/llvm/lib/CodeGen/MachineSSAContext.cpp @@ -0,0 +1,52 @@ +//===- MachineSSAContext.cpp ------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file defines a specialization of the GenericSSAContext<X> +/// template class for Machine IR. 
+/// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/MachineSSAContext.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +MachineBasicBlock *MachineSSAContext::getEntryBlock(MachineFunction &F) { + return &F.front(); +} + +void MachineSSAContext::setFunction(MachineFunction &Fn) { + MF = &Fn; + RegInfo = &MF->getRegInfo(); +} + +Printable MachineSSAContext::print(MachineBasicBlock *Block) const { + return Printable([Block](raw_ostream &Out) { Block->printName(Out); }); +} + +Printable MachineSSAContext::print(MachineInstr *I) const { + return Printable([I](raw_ostream &Out) { I->print(Out); }); +} + +Printable MachineSSAContext::print(Register Value) const { + auto *MRI = RegInfo; + return Printable([MRI, Value](raw_ostream &Out) { + Out << printReg(Value, MRI->getTargetRegisterInfo(), 0, MRI); + + if (Value) { + // Try to print the definition. + if (auto *Instr = MRI->getUniqueVRegDef(Value)) { + Out << ": "; + Instr->print(Out); + } + } + }); +} diff --git a/llvm/lib/CodeGen/MachineSSAUpdater.cpp b/llvm/lib/CodeGen/MachineSSAUpdater.cpp index 930677e4fd7d..48076663ddf5 100644 --- a/llvm/lib/CodeGen/MachineSSAUpdater.cpp +++ b/llvm/lib/CodeGen/MachineSSAUpdater.cpp @@ -126,7 +126,9 @@ MachineInstrBuilder InsertNewDef(unsigned Opcode, } /// GetValueInMiddleOfBlock - Construct SSA form, materializing a value that -/// is live in the middle of the specified block. +/// is live in the middle of the specified block. If ExistingValueOnly is +/// true then this will only return an existing value or $noreg; otherwise new +/// instructions may be inserted to materialize a value. 
/// /// GetValueInMiddleOfBlock is the same as GetValueAtEndOfBlock except in one /// important case: if there is a definition of the rewritten value after the @@ -143,14 +145,18 @@ MachineInstrBuilder InsertNewDef(unsigned Opcode, /// their respective blocks. However, the use of X happens in the *middle* of /// a block. Because of this, we need to insert a new PHI node in SomeBB to /// merge the appropriate values, and this value isn't live out of the block. -Register MachineSSAUpdater::GetValueInMiddleOfBlock(MachineBasicBlock *BB) { +Register MachineSSAUpdater::GetValueInMiddleOfBlock(MachineBasicBlock *BB, + bool ExistingValueOnly) { // If there is no definition of the renamed variable in this block, just use // GetValueAtEndOfBlock to do our work. if (!HasValueForBlock(BB)) - return GetValueAtEndOfBlockInternal(BB); + return GetValueAtEndOfBlockInternal(BB, ExistingValueOnly); // If there are no predecessors, just return undef. if (BB->pred_empty()) { + // If we cannot insert new instructions, just return $noreg. + if (ExistingValueOnly) + return Register(); // Insert an implicit_def to represent an undef value. MachineInstr *NewDef = InsertNewDef(TargetOpcode::IMPLICIT_DEF, BB, BB->getFirstTerminator(), @@ -165,7 +171,7 @@ Register MachineSSAUpdater::GetValueInMiddleOfBlock(MachineBasicBlock *BB) { bool isFirstPred = true; for (MachineBasicBlock *PredBB : BB->predecessors()) { - Register PredVal = GetValueAtEndOfBlockInternal(PredBB); + Register PredVal = GetValueAtEndOfBlockInternal(PredBB, ExistingValueOnly); PredValues.push_back(std::make_pair(PredBB, PredVal)); // Compute SingularValue. @@ -185,6 +191,10 @@ Register MachineSSAUpdater::GetValueInMiddleOfBlock(MachineBasicBlock *BB) { if (DupPHI) return DupPHI; + // If we cannot create new instructions, return $noreg now. + if (ExistingValueOnly) + return Register(); + // Otherwise, we do need a PHI: insert one now. MachineBasicBlock::iterator Loc = BB->empty() ? 
BB->end() : BB->begin(); MachineInstrBuilder InsertedPHI = InsertNewDef(TargetOpcode::PHI, BB, @@ -350,10 +360,13 @@ public: /// for the specified BB and if so, return it. If not, construct SSA form by /// first calculating the required placement of PHIs and then inserting new /// PHIs where needed. -Register MachineSSAUpdater::GetValueAtEndOfBlockInternal(MachineBasicBlock *BB){ +Register +MachineSSAUpdater::GetValueAtEndOfBlockInternal(MachineBasicBlock *BB, + bool ExistingValueOnly) { AvailableValsTy &AvailableVals = getAvailableVals(AV); - if (Register V = AvailableVals[BB]) - return V; + Register ExistingVal = AvailableVals.lookup(BB); + if (ExistingVal || ExistingValueOnly) + return ExistingVal; SSAUpdaterImpl<MachineSSAUpdater> Impl(this, &AvailableVals, InsertedPHIs); return Impl.GetValue(BB); diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp index 47d40f0823c8..b043d4c1b0c1 100644 --- a/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/llvm/lib/CodeGen/MachineScheduler.cpp @@ -90,12 +90,17 @@ cl::opt<bool> VerifyScheduling( "verify-misched", cl::Hidden, cl::desc("Verify machine instrs before and after machine scheduling")); +#ifndef NDEBUG +cl::opt<bool> ViewMISchedDAGs( + "view-misched-dags", cl::Hidden, + cl::desc("Pop up a window to show MISched dags after they are processed")); +#else +const bool ViewMISchedDAGs = false; +#endif // NDEBUG + } // end namespace llvm #ifndef NDEBUG -static cl::opt<bool> ViewMISchedDAGs("view-misched-dags", cl::Hidden, - cl::desc("Pop up a window to show MISched dags after they are processed")); - /// In some situations a few uninteresting nodes depend on nearly all other /// nodes in the graph, provide a cutoff to hide them. 
static cl::opt<unsigned> ViewMISchedCutoff("view-misched-cutoff", cl::Hidden, @@ -111,7 +116,6 @@ static cl::opt<unsigned> SchedOnlyBlock("misched-only-block", cl::Hidden, static cl::opt<bool> PrintDAGs("misched-print-dags", cl::Hidden, cl::desc("Print schedule DAGs")); #else -static const bool ViewMISchedDAGs = false; static const bool PrintDAGs = false; #endif // NDEBUG @@ -561,11 +565,10 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler, MBBRegionsVector MBBRegions; getSchedRegions(&*MBB, MBBRegions, Scheduler.doMBBSchedRegionsTopDown()); - for (MBBRegionsVector::iterator R = MBBRegions.begin(); - R != MBBRegions.end(); ++R) { - MachineBasicBlock::iterator I = R->RegionBegin; - MachineBasicBlock::iterator RegionEnd = R->RegionEnd; - unsigned NumRegionInstrs = R->NumRegionInstrs; + for (const SchedRegion &R : MBBRegions) { + MachineBasicBlock::iterator I = R.RegionBegin; + MachineBasicBlock::iterator RegionEnd = R.RegionEnd; + unsigned NumRegionInstrs = R.NumRegionInstrs; // Notify the scheduler of the region, even if we may skip scheduling // it. Perhaps it still needs to be bundled. 
diff --git a/llvm/lib/CodeGen/MachineTraceMetrics.cpp b/llvm/lib/CodeGen/MachineTraceMetrics.cpp index 8df23b781ffd..0a5ff276fedc 100644 --- a/llvm/lib/CodeGen/MachineTraceMetrics.cpp +++ b/llvm/lib/CodeGen/MachineTraceMetrics.cpp @@ -80,9 +80,9 @@ bool MachineTraceMetrics::runOnMachineFunction(MachineFunction &Func) { void MachineTraceMetrics::releaseMemory() { MF = nullptr; BlockInfo.clear(); - for (unsigned i = 0; i != TS_NumStrategies; ++i) { - delete Ensembles[i]; - Ensembles[i] = nullptr; + for (Ensemble *&E : Ensembles) { + delete E; + E = nullptr; } } @@ -398,9 +398,9 @@ void MachineTraceMetrics::invalidate(const MachineBasicBlock *MBB) { LLVM_DEBUG(dbgs() << "Invalidate traces through " << printMBBReference(*MBB) << '\n'); BlockInfo[MBB->getNumber()].invalidate(); - for (unsigned i = 0; i != TS_NumStrategies; ++i) - if (Ensembles[i]) - Ensembles[i]->invalidate(MBB); + for (Ensemble *E : Ensembles) + if (E) + E->invalidate(MBB); } void MachineTraceMetrics::verifyAnalysis() const { @@ -408,9 +408,9 @@ void MachineTraceMetrics::verifyAnalysis() const { return; #ifndef NDEBUG assert(BlockInfo.size() == MF->getNumBlockIDs() && "Outdated BlockInfo size"); - for (unsigned i = 0; i != TS_NumStrategies; ++i) - if (Ensembles[i]) - Ensembles[i]->verify(); + for (Ensemble *E : Ensembles) + if (E) + E->verify(); #endif } @@ -984,8 +984,7 @@ addLiveIns(const MachineInstr *DefMI, unsigned DefOp, const MachineBasicBlock *DefMBB = DefMI->getParent(); // Reg is live-in to all blocks in Trace that follow DefMBB. 
- for (unsigned i = Trace.size(); i; --i) { - const MachineBasicBlock *MBB = Trace[i-1]; + for (const MachineBasicBlock *MBB : llvm::reverse(Trace)) { if (MBB == DefMBB) return; TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()]; @@ -1204,8 +1203,8 @@ unsigned MachineTraceMetrics::Trace::getResourceDepth(bool Bottom) const { for (unsigned K = 0; K != PRDepths.size(); ++K) PRMax = std::max(PRMax, PRDepths[K] + PRCycles[K]); } else { - for (unsigned K = 0; K != PRDepths.size(); ++K) - PRMax = std::max(PRMax, PRDepths[K]); + for (unsigned PRD : PRDepths) + PRMax = std::max(PRMax, PRD); } // Convert to cycle count. PRMax = TE.MTM.getCycles(PRMax); diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index 32078db76cf3..005d4ad1a328 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -101,6 +101,7 @@ namespace { // Avoid querying the MachineFunctionProperties for each operand. bool isFunctionRegBankSelected; bool isFunctionSelected; + bool isFunctionTracksDebugUserValues; using RegVector = SmallVector<Register, 16>; using RegMaskVector = SmallVector<const uint32_t *, 4>; @@ -384,6 +385,8 @@ unsigned MachineVerifier::verify(const MachineFunction &MF) { MachineFunctionProperties::Property::RegBankSelected); isFunctionSelected = MF.getProperties().hasProperty( MachineFunctionProperties::Property::Selected); + isFunctionTracksDebugUserValues = MF.getProperties().hasProperty( + MachineFunctionProperties::Property::TracksDebugUserValues); LiveVars = nullptr; LiveInts = nullptr; @@ -1605,12 +1608,16 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) { } break; } + case TargetOpcode::G_SHL: + case TargetOpcode::G_LSHR: + case TargetOpcode::G_ASHR: case TargetOpcode::G_ROTR: case TargetOpcode::G_ROTL: { LLT Src1Ty = MRI->getType(MI->getOperand(1).getReg()); LLT Src2Ty = MRI->getType(MI->getOperand(2).getReg()); if (Src1Ty.isVector() != Src2Ty.isVector()) { - 
report("Rotate requires operands to be either all scalars or all vectors", + report("Shifts and rotates require operands to be either all scalars or " + "all vectors", MI); break; } @@ -1980,41 +1987,50 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { if (MO->isUndef()) report("Generic virtual register use cannot be undef", MO, MONum); - // If we're post-Select, we can't have gvregs anymore. - if (isFunctionSelected) { - report("Generic virtual register invalid in a Selected function", - MO, MONum); - return; - } + // Debug value instruction is permitted to use undefined vregs. + // This is a performance measure to skip the overhead of immediately + // pruning unused debug operands. The final undef substitution occurs + // when debug values are allocated in LDVImpl::handleDebugValue, so + // these verifications always apply after this pass. + if (isFunctionTracksDebugUserValues || !MO->isUse() || + !MI->isDebugValue() || !MRI->def_empty(Reg)) { + // If we're post-Select, we can't have gvregs anymore. + if (isFunctionSelected) { + report("Generic virtual register invalid in a Selected function", + MO, MONum); + return; + } - // The gvreg must have a type and it must not have a SubIdx. - LLT Ty = MRI->getType(Reg); - if (!Ty.isValid()) { - report("Generic virtual register must have a valid type", MO, - MONum); - return; - } + // The gvreg must have a type and it must not have a SubIdx. + LLT Ty = MRI->getType(Reg); + if (!Ty.isValid()) { + report("Generic virtual register must have a valid type", MO, + MONum); + return; + } - const RegisterBank *RegBank = MRI->getRegBankOrNull(Reg); + const RegisterBank *RegBank = MRI->getRegBankOrNull(Reg); - // If we're post-RegBankSelect, the gvreg must have a bank. - if (!RegBank && isFunctionRegBankSelected) { - report("Generic virtual register must have a bank in a " - "RegBankSelected function", - MO, MONum); - return; - } + // If we're post-RegBankSelect, the gvreg must have a bank. 
+ if (!RegBank && isFunctionRegBankSelected) { + report("Generic virtual register must have a bank in a " + "RegBankSelected function", + MO, MONum); + return; + } - // Make sure the register fits into its register bank if any. - if (RegBank && Ty.isValid() && - RegBank->getSize() < Ty.getSizeInBits()) { - report("Register bank is too small for virtual register", MO, - MONum); - errs() << "Register bank " << RegBank->getName() << " too small(" - << RegBank->getSize() << ") to fit " << Ty.getSizeInBits() - << "-bits\n"; - return; + // Make sure the register fits into its register bank if any. + if (RegBank && Ty.isValid() && + RegBank->getSize() < Ty.getSizeInBits()) { + report("Register bank is too small for virtual register", MO, + MONum); + errs() << "Register bank " << RegBank->getName() << " too small(" + << RegBank->getSize() << ") to fit " << Ty.getSizeInBits() + << "-bits\n"; + return; + } } + if (SubIdx) { report("Generic virtual register does not allow subregister index", MO, MONum); @@ -2217,8 +2233,8 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) { if (LiveInts && Reg.isVirtual()) { if (LiveInts->hasInterval(Reg)) { LI = &LiveInts->getInterval(Reg); - if (SubRegIdx != 0 && !LI->empty() && !LI->hasSubRanges() && - MRI->shouldTrackSubRegLiveness(Reg)) + if (SubRegIdx != 0 && (MO->isDef() || !MO->isUndef()) && !LI->empty() && + !LI->hasSubRanges() && MRI->shouldTrackSubRegLiveness(Reg)) report("Live interval for subreg operand has no subranges", MO, MONum); } else { report("Virtual register has no live interval", MO, MONum); diff --git a/llvm/lib/CodeGen/PHIElimination.cpp b/llvm/lib/CodeGen/PHIElimination.cpp index 77a6c37e1362..7693ab417de9 100644 --- a/llvm/lib/CodeGen/PHIElimination.cpp +++ b/llvm/lib/CodeGen/PHIElimination.cpp @@ -213,7 +213,7 @@ bool PHIElimination::runOnMachineFunction(MachineFunction &MF) { for (auto &I : LoweredPHIs) { if (LIS) LIS->RemoveMachineInstrFromMaps(*I.first); - 
MF.DeleteMachineInstr(I.first); + MF.deleteMachineInstr(I.first); } // TODO: we should use the incremental DomTree updater here. @@ -626,7 +626,7 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB, if (reusedIncoming || !IncomingReg) { if (LIS) LIS->RemoveMachineInstrFromMaps(*MPhi); - MF.DeleteMachineInstr(MPhi); + MF.deleteMachineInstr(MPhi); } } diff --git a/llvm/lib/CodeGen/PostRASchedulerList.cpp b/llvm/lib/CodeGen/PostRASchedulerList.cpp index b85f00a61eac..d7cd0a583cee 100644 --- a/llvm/lib/CodeGen/PostRASchedulerList.cpp +++ b/llvm/lib/CodeGen/PostRASchedulerList.cpp @@ -252,8 +252,8 @@ void SchedulePostRATDList::exitRegion() { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// dumpSchedule - dump the scheduled Sequence. LLVM_DUMP_METHOD void SchedulePostRATDList::dumpSchedule() const { - for (unsigned i = 0, e = Sequence.size(); i != e; i++) { - if (SUnit *SU = Sequence[i]) + for (const SUnit *SU : Sequence) { + if (SU) dumpNode(*SU); else dbgs() << "**** NOOP ****\n"; @@ -531,11 +531,11 @@ void SchedulePostRATDList::ListScheduleTopDown() { ReleaseSuccessors(&EntrySU); // Add all leaves to Available queue. - for (unsigned i = 0, e = SUnits.size(); i != e; ++i) { + for (SUnit &SUnit : SUnits) { // It is available if it has no predecessors. 
- if (!SUnits[i].NumPredsLeft && !SUnits[i].isAvailable) { - AvailableQueue.push(&SUnits[i]); - SUnits[i].isAvailable = true; + if (!SUnit.NumPredsLeft && !SUnit.isAvailable) { + AvailableQueue.push(&SUnit); + SUnit.isAvailable = true; } } @@ -657,10 +657,7 @@ void SchedulePostRATDList::ListScheduleTopDown() { #ifndef NDEBUG unsigned ScheduledNodes = VerifyScheduledDAG(/*isBottomUp=*/false); - unsigned Noops = 0; - for (unsigned i = 0, e = Sequence.size(); i != e; ++i) - if (!Sequence[i]) - ++Noops; + unsigned Noops = llvm::count(Sequence, nullptr); assert(Sequence.size() - Noops == ScheduledNodes && "The number of nodes scheduled doesn't match the expected number!"); #endif // NDEBUG diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp index 29a88480fd9f..8d8a6126dad0 100644 --- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp +++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp @@ -953,12 +953,22 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &MF) { // LocalStackSlotPass didn't already allocate a slot for it. // If we are told to use the LocalStackAllocationBlock, the stack protector // is expected to be already pre-allocated. - if (!MFI.getUseLocalStackAllocationBlock()) + if (MFI.getStackID(StackProtectorFI) != TargetStackID::Default) { + // If the stack protector isn't on the default stack then it's up to the + // target to set the stack offset. 
+ assert(MFI.getObjectOffset(StackProtectorFI) != 0 && + "Offset of stack protector on non-default stack expected to be " + "already set."); + assert(!MFI.isObjectPreAllocated(MFI.getStackProtectorIndex()) && + "Stack protector on non-default stack expected to not be " + "pre-allocated by LocalStackSlotPass."); + } else if (!MFI.getUseLocalStackAllocationBlock()) { AdjustStackOffset(MFI, StackProtectorFI, StackGrowsDown, Offset, MaxAlign, Skew); - else if (!MFI.isObjectPreAllocated(MFI.getStackProtectorIndex())) + } else if (!MFI.isObjectPreAllocated(MFI.getStackProtectorIndex())) { llvm_unreachable( "Stack protector not pre-allocated by LocalStackSlotPass."); + } // Assign large stack objects first. for (unsigned i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) { diff --git a/llvm/lib/CodeGen/RDFGraph.cpp b/llvm/lib/CodeGen/RDFGraph.cpp index f605068e076d..882f8e91bf1d 100644 --- a/llvm/lib/CodeGen/RDFGraph.cpp +++ b/llvm/lib/CodeGen/RDFGraph.cpp @@ -1500,8 +1500,8 @@ void DataFlowGraph::buildPhis(BlockRefsMap &PhiM, RegisterSet &AllRefs, // Erase from MaxRefs all elements in the closure. auto Begin = MaxRefs.begin(); - for (unsigned i = ClosureIdx.size(); i != 0; --i) - MaxRefs.erase(Begin + ClosureIdx[i-1]); + for (unsigned Idx : llvm::reverse(ClosureIdx)) + MaxRefs.erase(Begin + Idx); } } diff --git a/llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp b/llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp new file mode 100644 index 000000000000..9f1012c95964 --- /dev/null +++ b/llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp @@ -0,0 +1,121 @@ +//===- RegAllocEvictionAdvisor.cpp - eviction advisor ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Implementation of the default eviction advisor and of the Analysis pass. +// +//===----------------------------------------------------------------------===// + +#include "RegAllocEvictionAdvisor.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/CodeGen/VirtRegMap.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/PassRegistry.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +static cl::opt<RegAllocEvictionAdvisorAnalysis::AdvisorMode> Mode( + "regalloc-enable-advisor", cl::Hidden, + cl::init(RegAllocEvictionAdvisorAnalysis::AdvisorMode::Default), + cl::desc("Enable regalloc advisor mode"), + cl::values( + clEnumValN(RegAllocEvictionAdvisorAnalysis::AdvisorMode::Default, + "default", "Default"), + clEnumValN(RegAllocEvictionAdvisorAnalysis::AdvisorMode::Release, + "release", "precompiled"), + clEnumValN(RegAllocEvictionAdvisorAnalysis::AdvisorMode::Development, + "development", "for training"))); + +static cl::opt<bool> EnableLocalReassignment( + "enable-local-reassign", cl::Hidden, + cl::desc("Local reassignment can yield better allocation decisions, but " + "may be compile time intensive"), + cl::init(false)); + +#define DEBUG_TYPE "regalloc" + +char RegAllocEvictionAdvisorAnalysis::ID = 0; +INITIALIZE_PASS(RegAllocEvictionAdvisorAnalysis, "regalloc-evict", + "Regalloc eviction policy", false, true) + +namespace { +class DefaultEvictionAdvisorAnalysis final + : public RegAllocEvictionAdvisorAnalysis { +public: + DefaultEvictionAdvisorAnalysis(bool NotAsRequested) + : RegAllocEvictionAdvisorAnalysis(AdvisorMode::Default), + NotAsRequested(NotAsRequested) {} + + // support for isa<> and dyn_cast. 
+ static bool classof(const RegAllocEvictionAdvisorAnalysis *R) { + return R->getAdvisorMode() == AdvisorMode::Default; + } + +private: + std::unique_ptr<RegAllocEvictionAdvisor> + getAdvisor(const MachineFunction &MF, LiveRegMatrix *Matrix, + LiveIntervals *LIS, VirtRegMap *VRM, + const RegisterClassInfo &RegClassInfo, + ExtraRegInfo *ExtraInfo) override { + return std::make_unique<DefaultEvictionAdvisor>(MF, Matrix, LIS, VRM, + RegClassInfo, ExtraInfo); + } + bool doInitialization(Module &M) override { + if (NotAsRequested) + M.getContext().emitError("Requested regalloc eviction advisor analysis " + "could be created. Using default"); + return RegAllocEvictionAdvisorAnalysis::doInitialization(M); + } + const bool NotAsRequested; +}; +} // namespace + +template <> Pass *llvm::callDefaultCtor<RegAllocEvictionAdvisorAnalysis>() { + Pass *Ret = nullptr; + switch (Mode) { + case RegAllocEvictionAdvisorAnalysis::AdvisorMode::Default: + Ret = new DefaultEvictionAdvisorAnalysis(/*NotAsRequested*/ false); + break; + case RegAllocEvictionAdvisorAnalysis::AdvisorMode::Development: + // TODO(mtrofin): add implementation + break; + case RegAllocEvictionAdvisorAnalysis::AdvisorMode::Release: + // TODO(mtrofin): add implementation + break; + } + if (Ret) + return Ret; + return new DefaultEvictionAdvisorAnalysis(/*NotAsRequested*/ true); +} + +StringRef RegAllocEvictionAdvisorAnalysis::getPassName() const { + switch (getAdvisorMode()) { + case AdvisorMode::Default: + return "Default Regalloc Eviction Advisor"; + case AdvisorMode::Release: + return "Release mode Regalloc Eviction Advisor"; + case AdvisorMode::Development: + return "Development mode Regalloc Eviction Advisor"; + } + llvm_unreachable("Unknown advisor kind"); +} + +RegAllocEvictionAdvisor::RegAllocEvictionAdvisor( + const MachineFunction &MF, LiveRegMatrix *Matrix, LiveIntervals *LIS, + VirtRegMap *VRM, const RegisterClassInfo &RegClassInfo, + ExtraRegInfo *ExtraInfo) + : MF(MF), Matrix(Matrix), LIS(LIS), VRM(VRM), 
MRI(&VRM->getRegInfo()), + TRI(MF.getSubtarget().getRegisterInfo()), RegClassInfo(RegClassInfo), + RegCosts(TRI->getRegisterCosts(MF)), ExtraInfo(ExtraInfo), + EnableLocalReassign(EnableLocalReassignment || + MF.getSubtarget().enableRALocalReassignment( + MF.getTarget().getOptLevel())) {} diff --git a/llvm/lib/CodeGen/RegAllocEvictionAdvisor.h b/llvm/lib/CodeGen/RegAllocEvictionAdvisor.h index 85fd3207888b..debb75ed5020 100644 --- a/llvm/lib/CodeGen/RegAllocEvictionAdvisor.h +++ b/llvm/lib/CodeGen/RegAllocEvictionAdvisor.h @@ -18,6 +18,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Register.h" #include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/Config/llvm-config.h" #include "llvm/Pass.h" namespace llvm { @@ -85,6 +86,215 @@ struct EvictionCost { std::tie(O.BrokenHints, O.MaxWeight); } }; + +/// Track allocation stage and eviction loop prevention during allocation. +// TODO(mtrofin): Consider exposing RAGreedy in a header instead, and folding +// this back into it. +class ExtraRegInfo final { + // RegInfo - Keep additional information about each live range. + struct RegInfo { + LiveRangeStage Stage = RS_New; + + // Cascade - Eviction loop prevention. See + // canEvictInterferenceBasedOnCost(). + unsigned Cascade = 0; + + RegInfo() = default; + }; + + IndexedMap<RegInfo, VirtReg2IndexFunctor> Info; + unsigned NextCascade = 1; + +public: + ExtraRegInfo() = default; + ExtraRegInfo(const ExtraRegInfo &) = delete; + + LiveRangeStage getStage(Register Reg) const { return Info[Reg].Stage; } + + LiveRangeStage getStage(const LiveInterval &VirtReg) const { + return getStage(VirtReg.reg()); + } + + void setStage(Register Reg, LiveRangeStage Stage) { + Info.grow(Reg.id()); + Info[Reg].Stage = Stage; + } + + void setStage(const LiveInterval &VirtReg, LiveRangeStage Stage) { + setStage(VirtReg.reg(), Stage); + } + + /// Return the current stage of the register, if present, otherwise initialize + /// it and return that. 
+ LiveRangeStage getOrInitStage(Register Reg) { + Info.grow(Reg.id()); + return getStage(Reg); + } + + unsigned getCascade(Register Reg) const { return Info[Reg].Cascade; } + + void setCascade(Register Reg, unsigned Cascade) { + Info.grow(Reg.id()); + Info[Reg].Cascade = Cascade; + } + + unsigned getOrAssignNewCascade(Register Reg) { + unsigned Cascade = getCascade(Reg); + if (!Cascade) { + Cascade = NextCascade++; + setCascade(Reg, Cascade); + } + return Cascade; + } + + unsigned getCascadeOrCurrentNext(Register Reg) const { + unsigned Cascade = getCascade(Reg); + if (!Cascade) + Cascade = NextCascade; + return Cascade; + } + + template <typename Iterator> + void setStage(Iterator Begin, Iterator End, LiveRangeStage NewStage) { + for (; Begin != End; ++Begin) { + Register Reg = *Begin; + Info.grow(Reg.id()); + if (Info[Reg].Stage == RS_New) + Info[Reg].Stage = NewStage; + } + } + void LRE_DidCloneVirtReg(Register New, Register Old); +}; + +/// Interface to the eviction advisor, which is responsible for making a +/// decision as to which live ranges should be evicted (if any). +class RegAllocEvictionAdvisor { +public: + RegAllocEvictionAdvisor(const RegAllocEvictionAdvisor &) = delete; + RegAllocEvictionAdvisor(RegAllocEvictionAdvisor &&) = delete; + virtual ~RegAllocEvictionAdvisor() = default; + + /// Find a physical register that can be freed by evicting the FixedRegisters, + /// or return NoRegister. The eviction decision is assumed to be correct (i.e. + /// no fixed live ranges are evicted) and profitable. + virtual MCRegister + tryFindEvictionCandidate(LiveInterval &VirtReg, const AllocationOrder &Order, + uint8_t CostPerUseLimit, + const SmallVirtRegSet &FixedRegisters) const = 0; + + /// Find out if we can evict the live ranges occupying the given PhysReg, + /// which is a hint (preferred register) for VirtReg. 
+ virtual bool + canEvictHintInterference(LiveInterval &VirtReg, MCRegister PhysReg, + const SmallVirtRegSet &FixedRegisters) const = 0; + + /// Returns true if the given \p PhysReg is a callee saved register and has + /// not been used for allocation yet. + bool isUnusedCalleeSavedReg(MCRegister PhysReg) const; + +protected: + RegAllocEvictionAdvisor(const MachineFunction &MF, LiveRegMatrix *Matrix, + LiveIntervals *LIS, VirtRegMap *VRM, + const RegisterClassInfo &RegClassInfo, + ExtraRegInfo *ExtraInfo); + + Register canReassign(LiveInterval &VirtReg, Register PrevReg) const; + + const MachineFunction &MF; + LiveRegMatrix *const Matrix; + LiveIntervals *const LIS; + VirtRegMap *const VRM; + MachineRegisterInfo *const MRI; + const TargetRegisterInfo *const TRI; + const RegisterClassInfo &RegClassInfo; + const ArrayRef<uint8_t> RegCosts; + ExtraRegInfo *const ExtraInfo; + + /// Run or not the local reassignment heuristic. This information is + /// obtained from the TargetSubtargetInfo. + const bool EnableLocalReassign; + +private: + unsigned NextCascade = 1; +}; + +/// ImmutableAnalysis abstraction for fetching the Eviction Advisor. We model it +/// as an analysis to decouple the user from the implementation insofar as +/// dependencies on other analyses goes. The motivation for it being an +/// immutable pass is twofold: +/// - in the ML implementation case, the evaluator is stateless but (especially +/// in the development mode) expensive to set up. With an immutable pass, we set +/// it up once. +/// - in the 'development' mode ML case, we want to capture the training log +/// during allocation (this is a log of features encountered and decisions +/// made), and then measure a score, potentially a few steps after allocation +/// completes. So we need the properties of an immutable pass to keep the logger +/// state around until we can make that measurement. 
+/// +/// Because we need to offer additional services in 'development' mode, the +/// implementations of this analysis need to implement RTTI support. +class RegAllocEvictionAdvisorAnalysis : public ImmutablePass { +public: + enum class AdvisorMode : int { Default, Release, Development }; + + RegAllocEvictionAdvisorAnalysis(AdvisorMode Mode) + : ImmutablePass(ID), Mode(Mode){}; + static char ID; + + /// Get an advisor for the given context (i.e. machine function, etc) + virtual std::unique_ptr<RegAllocEvictionAdvisor> + getAdvisor(const MachineFunction &MF, LiveRegMatrix *Matrix, + LiveIntervals *LIS, VirtRegMap *VRM, + const RegisterClassInfo &RegClassInfo, + ExtraRegInfo *ExtraInfo) = 0; + AdvisorMode getAdvisorMode() const { return Mode; } + +private: + // This analysis preserves everything, and subclasses may have additional + // requirements. + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } + + StringRef getPassName() const override; + const AdvisorMode Mode; +}; + +/// Specialization for the API used by the analysis infrastructure to create +/// an instance of the eviction advisor. +template <> Pass *callDefaultCtor<RegAllocEvictionAdvisorAnalysis>(); + +// TODO(mtrofin): implement these. 
+#ifdef LLVM_HAVE_TF_AOT +RegAllocEvictionAdvisorAnalysis *createReleaseModeAdvisor(); +#endif + +#ifdef LLVM_HAVE_TF_API +RegAllocEvictionAdvisorAnalysis *createDevelopmentModeAdvisor(); +#endif + +// TODO: move to RegAllocEvictionAdvisor.cpp when we move implementation +// out of RegAllocGreedy.cpp +class DefaultEvictionAdvisor : public RegAllocEvictionAdvisor { +public: + DefaultEvictionAdvisor(const MachineFunction &MF, LiveRegMatrix *Matrix, + LiveIntervals *LIS, VirtRegMap *VRM, + const RegisterClassInfo &RegClassInfo, + ExtraRegInfo *ExtraInfo) + : RegAllocEvictionAdvisor(MF, Matrix, LIS, VRM, RegClassInfo, ExtraInfo) { + } + +private: + MCRegister tryFindEvictionCandidate(LiveInterval &, const AllocationOrder &, + uint8_t, + const SmallVirtRegSet &) const override; + bool canEvictHintInterference(LiveInterval &, MCRegister, + const SmallVirtRegSet &) const override; + bool canEvictInterferenceBasedOnCost(LiveInterval &, MCRegister, bool, + EvictionCost &, + const SmallVirtRegSet &) const; + bool shouldEvict(LiveInterval &A, bool, LiveInterval &B, bool) const; +}; } // namespace llvm #endif // LLVM_CODEGEN_REGALLOCEVICTIONADVISOR_H diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp index 50411c177007..ce3cf31dbd6b 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -112,12 +112,6 @@ static cl::opt<bool> ExhaustiveSearch( "and interference cutoffs of last chance recoloring"), cl::Hidden); -static cl::opt<bool> EnableLocalReassignment( - "enable-local-reassign", cl::Hidden, - cl::desc("Local reassignment can yield better allocation decisions, but " - "may be compile time intensive"), - cl::init(false)); - static cl::opt<bool> EnableDeferredSpilling( "enable-deferred-spilling", cl::Hidden, cl::desc("Instead of spilling a variable right away, defer the actual " @@ -172,8 +166,9 @@ class RAGreedy : public MachineFunctionPass, // state std::unique_ptr<Spiller> SpillerInstance; PQueue 
Queue; - unsigned NextCascade; std::unique_ptr<VirtRegAuxInfo> VRAI; + Optional<ExtraRegInfo> ExtraInfo; + std::unique_ptr<RegAllocEvictionAdvisor> EvictAdvisor; // Enum CutOffStage to keep a track whether the register allocation failed // because of the cutoffs encountered in last chance recoloring. @@ -195,76 +190,6 @@ class RAGreedy : public MachineFunctionPass, static const char *const StageName[]; #endif - // RegInfo - Keep additional information about each live range. - struct RegInfo { - LiveRangeStage Stage = RS_New; - - // Cascade - Eviction loop prevention. See - // canEvictInterferenceBasedOnCost(). - unsigned Cascade = 0; - - RegInfo() = default; - }; - - IndexedMap<RegInfo, VirtReg2IndexFunctor> ExtraRegInfo; - - LiveRangeStage getStage(Register Reg) const { - return ExtraRegInfo[Reg].Stage; - } - - LiveRangeStage getStage(const LiveInterval &VirtReg) const { - return getStage(VirtReg.reg()); - } - - void setStage(Register Reg, LiveRangeStage Stage) { - ExtraRegInfo.resize(MRI->getNumVirtRegs()); - ExtraRegInfo[Reg].Stage = Stage; - } - - void setStage(const LiveInterval &VirtReg, LiveRangeStage Stage) { - setStage(VirtReg.reg(), Stage); - } - - /// Return the current stage of the register, if present, otherwise initialize - /// it and return that. 
- LiveRangeStage getOrInitStage(Register Reg) { - ExtraRegInfo.grow(Reg); - return getStage(Reg); - } - - unsigned getCascade(Register Reg) const { return ExtraRegInfo[Reg].Cascade; } - - void setCascade(Register Reg, unsigned Cascade) { - ExtraRegInfo.resize(MRI->getNumVirtRegs()); - ExtraRegInfo[Reg].Cascade = Cascade; - } - - unsigned getOrAssignNewCascade(Register Reg) { - unsigned Cascade = getCascade(Reg); - if (!Cascade) { - Cascade = NextCascade++; - setCascade(Reg, Cascade); - } - return Cascade; - } - - unsigned getCascadeOrCurrentNext(Register Reg) const { - unsigned Cascade = getCascade(Reg); - if (!Cascade) - Cascade = NextCascade; - return Cascade; - } - - template<typename Iterator> - void setStage(Iterator Begin, Iterator End, LiveRangeStage NewStage) { - ExtraRegInfo.resize(MRI->getNumVirtRegs()); - for (;Begin != End; ++Begin) { - Register Reg = *Begin; - if (ExtraRegInfo[Reg].Stage == RS_New) - ExtraRegInfo[Reg].Stage = NewStage; - } - } - /// EvictionTrack - Keeps track of past evictions in order to optimize region /// split decision. class EvictionTrack { @@ -375,10 +300,6 @@ class RAGreedy : public MachineFunctionPass, /// Callee-save register cost, calculated once per machine function. BlockFrequency CSRCost; - /// Run or not the local reassignment heuristic. This information is - /// obtained from the TargetSubtargetInfo. - bool EnableLocalReassign; - /// Enable or not the consideration of the cost of local intervals created /// by a split candidate when choosing the best split candidate. 
bool EnableAdvancedRASplitCost; @@ -447,13 +368,6 @@ private: bool calcCompactRegion(GlobalSplitCandidate&); void splitAroundRegion(LiveRangeEdit&, ArrayRef<unsigned>); void calcGapWeights(MCRegister, SmallVectorImpl<float> &); - Register canReassign(LiveInterval &VirtReg, Register PrevReg) const; - bool shouldEvict(LiveInterval &A, bool, LiveInterval &B, bool) const; - bool canEvictInterferenceBasedOnCost(LiveInterval &, MCRegister, bool, - EvictionCost &, - const SmallVirtRegSet &) const; - bool canEvictHintInterference(LiveInterval &, MCRegister, - const SmallVirtRegSet &) const; bool canEvictInterferenceInRange(const LiveInterval &VirtReg, MCRegister PhysReg, SlotIndex Start, SlotIndex End, EvictionCost &MaxCost) const; @@ -529,8 +443,6 @@ private: BlockFrequency getBrokenHintFreq(const HintsInfo &, MCRegister); void collectHintInfo(Register, HintsInfo &); - bool isUnusedCalleeSavedReg(MCRegister PhysReg) const; - /// Greedy RA statistic to remark. struct RAGreedyStats { unsigned Reloads = 0; @@ -597,6 +509,7 @@ INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix) INITIALIZE_PASS_DEPENDENCY(EdgeBundles) INITIALIZE_PASS_DEPENDENCY(SpillPlacement) INITIALIZE_PASS_DEPENDENCY(MachineOptimizationRemarkEmitterPass) +INITIALIZE_PASS_DEPENDENCY(RegAllocEvictionAdvisorAnalysis) INITIALIZE_PASS_END(RAGreedy, "greedy", "Greedy Register Allocator", false, false) @@ -663,6 +576,7 @@ void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<EdgeBundles>(); AU.addRequired<SpillPlacement>(); AU.addRequired<MachineOptimizationRemarkEmitterPass>(); + AU.addRequired<RegAllocEvictionAdvisorAnalysis>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -696,22 +610,25 @@ void RAGreedy::LRE_WillShrinkVirtReg(Register VirtReg) { } void RAGreedy::LRE_DidCloneVirtReg(Register New, Register Old) { + ExtraInfo->LRE_DidCloneVirtReg(New, Old); +} + +void ExtraRegInfo::LRE_DidCloneVirtReg(Register New, Register Old) { // Cloning a register we haven't even heard about yet? Just ignore it. 
- if (!ExtraRegInfo.inBounds(Old)) + if (!Info.inBounds(Old)) return; // LRE may clone a virtual register because dead code elimination causes it to // be split into connected components. The new components are much smaller // than the original, so they should get a new chance at being assigned. // same stage as the parent. - ExtraRegInfo[Old].Stage = RS_Assign; - ExtraRegInfo.grow(New); - ExtraRegInfo[New] = ExtraRegInfo[Old]; + Info[Old].Stage = RS_Assign; + Info.grow(New.id()); + Info[New] = Info[Old]; } void RAGreedy::releaseMemory() { SpillerInstance.reset(); - ExtraRegInfo.clear(); GlobalCand.clear(); } @@ -725,10 +642,10 @@ void RAGreedy::enqueue(PQueue &CurQueue, LiveInterval *LI) { assert(Reg.isVirtual() && "Can only enqueue virtual registers"); unsigned Prio; - auto Stage = getOrInitStage(Reg); + auto Stage = ExtraInfo->getOrInitStage(Reg); if (Stage == RS_New) { Stage = RS_Assign; - setStage(Reg, Stage); + ExtraInfo->setStage(Reg, Stage); } if (Stage == RS_Split) { // Unsplit ranges that couldn't be allocated immediately are deferred until @@ -824,7 +741,8 @@ MCRegister RAGreedy::tryAssign(LiveInterval &VirtReg, MCRegister PhysHint = Hint.asMCReg(); LLVM_DEBUG(dbgs() << "missed hint " << printReg(PhysHint, TRI) << '\n'); - if (canEvictHintInterference(VirtReg, PhysHint, FixedRegisters)) { + if (EvictAdvisor->canEvictHintInterference(VirtReg, PhysHint, + FixedRegisters)) { evictInterference(VirtReg, PhysHint, NewVRegs); return PhysHint; } @@ -850,7 +768,8 @@ MCRegister RAGreedy::tryAssign(LiveInterval &VirtReg, // Interference eviction //===----------------------------------------------------------------------===// -Register RAGreedy::canReassign(LiveInterval &VirtReg, Register PrevReg) const { +Register RegAllocEvictionAdvisor::canReassign(LiveInterval &VirtReg, + Register PrevReg) const { auto Order = AllocationOrder::create(VirtReg.reg(), *VRM, RegClassInfo, Matrix); MCRegister PhysReg; @@ -889,9 +808,10 @@ Register RAGreedy::canReassign(LiveInterval 
&VirtReg, Register PrevReg) const { /// register. /// @param B The live range to be evicted. /// @param BreaksHint True when B is already assigned to its preferred register. -bool RAGreedy::shouldEvict(LiveInterval &A, bool IsHint, - LiveInterval &B, bool BreaksHint) const { - bool CanSplit = getStage(B) < RS_Spill; +bool DefaultEvictionAdvisor::shouldEvict(LiveInterval &A, bool IsHint, + LiveInterval &B, + bool BreaksHint) const { + bool CanSplit = ExtraInfo->getStage(B) < RS_Spill; // Be fairly aggressive about following hints as long as the evictee can be // split. @@ -907,7 +827,7 @@ bool RAGreedy::shouldEvict(LiveInterval &A, bool IsHint, /// canEvictHintInterference - return true if the interference for VirtReg /// on the PhysReg, which is VirtReg's hint, can be evicted in favor of VirtReg. -bool RAGreedy::canEvictHintInterference( +bool DefaultEvictionAdvisor::canEvictHintInterference( LiveInterval &VirtReg, MCRegister PhysReg, const SmallVirtRegSet &FixedRegisters) const { EvictionCost MaxCost; @@ -925,7 +845,7 @@ bool RAGreedy::canEvictHintInterference( /// @param MaxCost Only look for cheaper candidates and update with new cost /// when returning true. /// @returns True when interference can be evicted cheaper than MaxCost. -bool RAGreedy::canEvictInterferenceBasedOnCost( +bool DefaultEvictionAdvisor::canEvictInterferenceBasedOnCost( LiveInterval &VirtReg, MCRegister PhysReg, bool IsHint, EvictionCost &MaxCost, const SmallVirtRegSet &FixedRegisters) const { // It is only possible to evict virtual register interference. @@ -941,9 +861,7 @@ bool RAGreedy::canEvictInterferenceBasedOnCost( // // This works out so a register without a cascade number is allowed to evict // anything, and it can be evicted by anything. 
- unsigned Cascade = ExtraRegInfo[VirtReg.reg()].Cascade; - if (!Cascade) - Cascade = NextCascade; + unsigned Cascade = ExtraInfo->getCascadeOrCurrentNext(VirtReg.reg()); EvictionCost Cost; for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { @@ -965,7 +883,7 @@ bool RAGreedy::canEvictInterferenceBasedOnCost( return false; // Never evict spill products. They cannot split or spill. - if (getStage(*Intf) == RS_Done) + if (ExtraInfo->getStage(*Intf) == RS_Done) return false; // Once a live range becomes small enough, it is urgent that we find a // register for it. This is indicated by an infinite spill weight. These @@ -980,7 +898,7 @@ bool RAGreedy::canEvictInterferenceBasedOnCost( RegClassInfo.getNumAllocatableRegs( MRI->getRegClass(Intf->reg()))); // Only evict older cascades or live ranges without a cascade. - unsigned IntfCascade = ExtraRegInfo[Intf->reg()].Cascade; + unsigned IntfCascade = ExtraInfo->getCascade(Intf->reg()); if (Cascade <= IntfCascade) { if (!Urgent) return false; @@ -1043,7 +961,7 @@ bool RAGreedy::canEvictInterferenceInRange(const LiveInterval &VirtReg, if (!Register::isVirtualRegister(Intf->reg())) return false; // Never evict spill products. They cannot split or spill. - if (getStage(*Intf) == RS_Done) + if (ExtraInfo->getStage(*Intf) == RS_Done) return false; // Would this break a satisfied hint? @@ -1106,7 +1024,7 @@ void RAGreedy::evictInterference(LiveInterval &VirtReg, MCRegister PhysReg, // Make sure that VirtReg has a cascade number, and assign that cascade // number to every evicted register. These live ranges than then only be // evicted by a newer cascade, preventing infinite loops. 
- unsigned Cascade = getOrAssignNewCascade(VirtReg.reg()); + unsigned Cascade = ExtraInfo->getOrAssignNewCascade(VirtReg.reg()); LLVM_DEBUG(dbgs() << "evicting " << printReg(PhysReg, TRI) << " interference: Cascade " << Cascade << '\n'); @@ -1132,10 +1050,10 @@ void RAGreedy::evictInterference(LiveInterval &VirtReg, MCRegister PhysReg, LastEvicted.addEviction(PhysReg, VirtReg.reg(), Intf->reg()); Matrix->unassign(*Intf); - assert((getCascade(Intf->reg()) < Cascade || + assert((ExtraInfo->getCascade(Intf->reg()) < Cascade || VirtReg.isSpillable() < Intf->isSpillable()) && "Cannot decrease cascade number, illegal eviction"); - setCascade(Intf->reg(), Cascade); + ExtraInfo->setCascade(Intf->reg(), Cascade); ++NumEvicted; NewVRegs.push_back(Intf->reg()); } @@ -1143,7 +1061,7 @@ void RAGreedy::evictInterference(LiveInterval &VirtReg, MCRegister PhysReg, /// Returns true if the given \p PhysReg is a callee saved register and has not /// been used for allocation yet. -bool RAGreedy::isUnusedCalleeSavedReg(MCRegister PhysReg) const { +bool RegAllocEvictionAdvisor::isUnusedCalleeSavedReg(MCRegister PhysReg) const { MCRegister CSR = RegClassInfo.getLastCalleeSavedAlias(PhysReg); if (!CSR) return false; @@ -1151,7 +1069,7 @@ bool RAGreedy::isUnusedCalleeSavedReg(MCRegister PhysReg) const { return !Matrix->isPhysRegUsed(PhysReg); } -MCRegister RAGreedy::tryFindEvictionCandidate( +MCRegister DefaultEvictionAdvisor::tryFindEvictionCandidate( LiveInterval &VirtReg, const AllocationOrder &Order, uint8_t CostPerUseLimit, const SmallVirtRegSet &FixedRegisters) const { // Keep track of the cheapest interference seen so far. 
@@ -1225,8 +1143,8 @@ MCRegister RAGreedy::tryEvict(LiveInterval &VirtReg, AllocationOrder &Order, NamedRegionTimer T("evict", "Evict", TimerGroupName, TimerGroupDescription, TimePassesIsEnabled); - MCRegister BestPhys = - tryFindEvictionCandidate(VirtReg, Order, CostPerUseLimit, FixedRegisters); + MCRegister BestPhys = EvictAdvisor->tryFindEvictionCandidate( + VirtReg, Order, CostPerUseLimit, FixedRegisters); if (BestPhys.isValid()) evictInterference(VirtReg, BestPhys, NewVRegs); return BestPhys; @@ -1769,8 +1687,8 @@ void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit, // the ActiveBlocks list with each candidate. We need to filter out // duplicates. BitVector Todo = SA->getThroughBlocks(); - for (unsigned c = 0; c != UsedCands.size(); ++c) { - ArrayRef<unsigned> Blocks = GlobalCand[UsedCands[c]].ActiveBlocks; + for (unsigned UsedCand : UsedCands) { + ArrayRef<unsigned> Blocks = GlobalCand[UsedCand].ActiveBlocks; for (unsigned Number : Blocks) { if (!Todo.test(Number)) continue; @@ -1817,13 +1735,13 @@ void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit, const LiveInterval &Reg = LIS->getInterval(LREdit.get(I)); // Ignore old intervals from DCE. - if (getOrInitStage(Reg.reg()) != RS_New) + if (ExtraInfo->getOrInitStage(Reg.reg()) != RS_New) continue; // Remainder interval. Don't try splitting again, spill if it doesn't // allocate. if (IntvMap[I] == 0) { - setStage(Reg, RS_Spill); + ExtraInfo->setStage(Reg, RS_Spill); continue; } @@ -1834,7 +1752,7 @@ void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit, LLVM_DEBUG(dbgs() << "Main interval covers the same " << OrigBlocks << " blocks as original.\n"); // Don't allow repeated splitting as a safe guard against looping. 
- setStage(Reg, RS_Split2); + ExtraInfo->setStage(Reg, RS_Split2); } continue; } @@ -1899,7 +1817,7 @@ unsigned RAGreedy::calculateRegionSplitCost(LiveInterval &VirtReg, unsigned BestCand = NoCand; for (MCPhysReg PhysReg : Order) { assert(PhysReg); - if (IgnoreCSR && isUnusedCalleeSavedReg(PhysReg)) + if (IgnoreCSR && EvictAdvisor->isUnusedCalleeSavedReg(PhysReg)) continue; // Discard bad candidates before we run out of interference cache cursors. @@ -2065,8 +1983,8 @@ unsigned RAGreedy::tryBlockSplit(LiveInterval &VirtReg, AllocationOrder &Order, // goes straight to spilling, the new local ranges get to stay RS_New. for (unsigned I = 0, E = LREdit.size(); I != E; ++I) { const LiveInterval &LI = LIS->getInterval(LREdit.get(I)); - if (getOrInitStage(LI.reg()) == RS_New && IntvMap[I] == 0) - setStage(LI, RS_Spill); + if (ExtraInfo->getOrInitStage(LI.reg()) == RS_New && IntvMap[I] == 0) + ExtraInfo->setStage(LI, RS_Spill); } if (VerifyEnabled) @@ -2152,7 +2070,7 @@ RAGreedy::tryInstructionSplit(LiveInterval &VirtReg, AllocationOrder &Order, SE->finish(&IntvMap); DebugVars->splitRegister(VirtReg.reg(), LREdit.regs(), *LIS); // Assign all new registers to RS_Spill. This was the last chance. - setStage(LREdit.begin(), LREdit.end(), RS_Spill); + ExtraInfo->setStage(LREdit.begin(), LREdit.end(), RS_Spill); return 0; } @@ -2320,7 +2238,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order, // These rules allow a 3 -> 2+3 split once, which we need. They also prevent // excessive splitting and infinite loops. // - bool ProgressRequired = getStage(VirtReg) >= RS_Split2; + bool ProgressRequired = ExtraInfo->getStage(VirtReg) >= RS_Split2; // Best split candidate. 
unsigned BestBefore = NumGaps; @@ -2456,7 +2374,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order, assert(!ProgressRequired && "Didn't make progress when it was required."); for (unsigned I = 0, E = IntvMap.size(); I != E; ++I) if (IntvMap[I] == 1) { - setStage(LIS->getInterval(LREdit.get(I)), RS_Split2); + ExtraInfo->setStage(LIS->getInterval(LREdit.get(I)), RS_Split2); LLVM_DEBUG(dbgs() << ' ' << printReg(LREdit.get(I))); } LLVM_DEBUG(dbgs() << '\n'); @@ -2477,7 +2395,7 @@ unsigned RAGreedy::trySplit(LiveInterval &VirtReg, AllocationOrder &Order, SmallVectorImpl<Register> &NewVRegs, const SmallVirtRegSet &FixedRegisters) { // Ranges must be Split2 or less. - if (getStage(VirtReg) >= RS_Spill) + if (ExtraInfo->getStage(VirtReg) >= RS_Spill) return 0; // Local intervals are handled separately. @@ -2499,7 +2417,7 @@ unsigned RAGreedy::trySplit(LiveInterval &VirtReg, AllocationOrder &Order, // First try to split around a region spanning multiple blocks. RS_Split2 // ranges already made dubious progress with region splitting, so they go // straight to single block splitting. - if (getStage(VirtReg) < RS_Split2) { + if (ExtraInfo->getStage(VirtReg) < RS_Split2) { MCRegister PhysReg = tryRegionSplit(VirtReg, Order, NewVRegs); if (PhysReg || !NewVRegs.empty()) return PhysReg; @@ -2551,7 +2469,7 @@ bool RAGreedy::mayRecolorAllInterferences( // it would not be recolorable as it is in the same state as VirtReg. // However, if VirtReg has tied defs and Intf doesn't, then // there is still a point in examining if it can be recolorable. - if (((getStage(*Intf) == RS_Done && + if (((ExtraInfo->getStage(*Intf) == RS_Done && MRI->getRegClass(Intf->reg()) == CurRC) && !(hasTiedDef(MRI, VirtReg.reg()) && !hasTiedDef(MRI, Intf->reg()))) || @@ -2615,7 +2533,7 @@ unsigned RAGreedy::tryLastChanceRecoloring(LiveInterval &VirtReg, LLVM_DEBUG(dbgs() << "Try last chance recoloring for " << VirtReg << '\n'); // Ranges must be Done. 
- assert((getStage(VirtReg) >= RS_Done || !VirtReg.isSpillable()) && + assert((ExtraInfo->getStage(VirtReg) >= RS_Done || !VirtReg.isSpillable()) && "Last chance recoloring should really be last chance"); // Set the max depth to LastChanceRecoloringMaxDepth. // We may want to reconsider that if we end up with a too large search space @@ -2806,7 +2724,7 @@ MCRegister RAGreedy::tryAssignCSRFirstTime(LiveInterval &VirtReg, AllocationOrder &Order, MCRegister PhysReg, uint8_t &CostPerUseLimit, SmallVectorImpl<Register> &NewVRegs) { - if (getStage(VirtReg) == RS_Spill && VirtReg.isSpillable()) { + if (ExtraInfo->getStage(VirtReg) == RS_Spill && VirtReg.isSpillable()) { // We choose spill over using the CSR for the first time if the spill cost // is lower than CSRCost. SA->analyze(&VirtReg); @@ -2818,7 +2736,7 @@ RAGreedy::tryAssignCSRFirstTime(LiveInterval &VirtReg, AllocationOrder &Order, CostPerUseLimit = 1; return 0; } - if (getStage(VirtReg) < RS_Split) { + if (ExtraInfo->getStage(VirtReg) < RS_Split) { // We choose pre-splitting over using the CSR for the first time if // the cost of splitting is lower than CSRCost. SA->analyze(&VirtReg); @@ -3051,8 +2969,8 @@ MCRegister RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg, // When NewVRegs is not empty, we may have made decisions such as evicting // a virtual register, go with the earlier decisions and use the physical // register. 
- if (CSRCost.getFrequency() && isUnusedCalleeSavedReg(PhysReg) && - NewVRegs.empty()) { + if (CSRCost.getFrequency() && + EvictAdvisor->isUnusedCalleeSavedReg(PhysReg) && NewVRegs.empty()) { MCRegister CSRReg = tryAssignCSRFirstTime(VirtReg, Order, PhysReg, CostPerUseLimit, NewVRegs); if (CSRReg || !NewVRegs.empty()) @@ -3063,9 +2981,9 @@ MCRegister RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg, return PhysReg; } - LiveRangeStage Stage = getStage(VirtReg); + LiveRangeStage Stage = ExtraInfo->getStage(VirtReg); LLVM_DEBUG(dbgs() << StageName[Stage] << " Cascade " - << getCascade(VirtReg.reg()) << '\n'); + << ExtraInfo->getCascade(VirtReg.reg()) << '\n'); // Try to evict a less worthy live range, but only for ranges from the primary // queue. The RS_Split ranges already failed to do this, and they should not @@ -3094,7 +3012,7 @@ MCRegister RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg, // Wait until the second time, when all smaller ranges have been allocated. // This gives a better picture of the interference to split around. if (Stage < RS_Split) { - setStage(VirtReg, RS_Split); + ExtraInfo->setStage(VirtReg, RS_Split); LLVM_DEBUG(dbgs() << "wait for second round\n"); NewVRegs.push_back(VirtReg.reg()); return 0; @@ -3120,12 +3038,12 @@ MCRegister RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg, // Finally spill VirtReg itself. if ((EnableDeferredSpilling || TRI->shouldUseDeferredSpillingForVirtReg(*MF, VirtReg)) && - getStage(VirtReg) < RS_Memory) { + ExtraInfo->getStage(VirtReg) < RS_Memory) { // TODO: This is experimental and in particular, we do not model // the live range splitting done by spilling correctly. // We would need a deep integration with the spiller to do the // right thing here. Anyway, that is still good for early testing. 
- setStage(VirtReg, RS_Memory); + ExtraInfo->setStage(VirtReg, RS_Memory); LLVM_DEBUG(dbgs() << "Do as if this register is in memory\n"); NewVRegs.push_back(VirtReg.reg()); } else { @@ -3133,7 +3051,7 @@ MCRegister RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg, TimerGroupDescription, TimePassesIsEnabled); LiveRangeEdit LRE(&VirtReg, NewVRegs, *MF, *LIS, VRM, this, &DeadRemats); spiller().spill(LRE); - setStage(NewVRegs.begin(), NewVRegs.end(), RS_Done); + ExtraInfo->setStage(NewVRegs.begin(), NewVRegs.end(), RS_Done); // Tell LiveDebugVariables about the new ranges. Ranges not being covered by // the new regs are kept in LDV (still mapping to the old register), until @@ -3316,10 +3234,6 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { TII = MF->getSubtarget().getInstrInfo(); RCI.runOnMachineFunction(mf); - EnableLocalReassign = EnableLocalReassignment || - MF->getSubtarget().enableRALocalReassignment( - MF->getTarget().getOptLevel()); - EnableAdvancedRASplitCost = ConsiderLocalIntervalCost.getNumOccurrences() ? ConsiderLocalIntervalCost @@ -3354,8 +3268,9 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { SA.reset(new SplitAnalysis(*VRM, *LIS, *Loops)); SE.reset(new SplitEditor(*SA, *AA, *LIS, *VRM, *DomTree, *MBFI, *VRAI)); - ExtraRegInfo.clear(); - NextCascade = 1; + ExtraInfo.emplace(); + EvictAdvisor = getAnalysis<RegAllocEvictionAdvisorAnalysis>().getAdvisor( + *MF, Matrix, LIS, VRM, RegClassInfo, &*ExtraInfo); IntfCache.init(MF, Matrix->getLiveUnions(), Indexes, LIS, TRI); GlobalCand.resize(32); // This will grow as needed. SetOfBrokenHints.clear(); diff --git a/llvm/lib/CodeGen/RegAllocPBQP.cpp b/llvm/lib/CodeGen/RegAllocPBQP.cpp index b22eb080791e..93be8f689d57 100644 --- a/llvm/lib/CodeGen/RegAllocPBQP.cpp +++ b/llvm/lib/CodeGen/RegAllocPBQP.cpp @@ -623,8 +623,8 @@ void RegAllocPBQP::initializeGraph(PBQPRAGraph &G, VirtRegMap &VRM, // Compute an initial allowed set for the current vreg. 
std::vector<MCRegister> VRegAllowed; ArrayRef<MCPhysReg> RawPRegOrder = TRC->getRawAllocationOrder(MF); - for (unsigned I = 0; I != RawPRegOrder.size(); ++I) { - MCRegister PReg(RawPRegOrder[I]); + for (MCPhysReg R : RawPRegOrder) { + MCRegister PReg(R); if (MRI.isReserved(PReg)) continue; diff --git a/llvm/lib/CodeGen/RegAllocScore.cpp b/llvm/lib/CodeGen/RegAllocScore.cpp new file mode 100644 index 000000000000..740890831617 --- /dev/null +++ b/llvm/lib/CodeGen/RegAllocScore.cpp @@ -0,0 +1,124 @@ +//===- RegAllocScore.cpp - evaluate regalloc policy quality ---------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// Calculate a measure of the register allocation policy quality. This is used +/// to construct a reward for the training of the ML-driven allocation policy. +/// Currently, the score is the sum of the machine basic block frequency-weighed +/// number of loads, stores, copies, and remat instructions, each factored with +/// a relative weight. 
+//===----------------------------------------------------------------------===// + +#include "RegAllocScore.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" +#include <cassert> +#include <cstdint> +#include <numeric> +#include <vector> + +using namespace llvm; +cl::opt<double> CopyWeight("regalloc-copy-weight", cl::init(0.2), cl::Hidden); +cl::opt<double> LoadWeight("regalloc-load-weight", cl::init(4.0), cl::Hidden); +cl::opt<double> StoreWeight("regalloc-store-weight", cl::init(1.0), cl::Hidden); +cl::opt<double> CheapRematWeight("regalloc-cheap-remat-weight", cl::init(0.2), + cl::Hidden); +cl::opt<double> ExpensiveRematWeight("regalloc-expensive-remat-weight", + cl::init(1.0), cl::Hidden); +#define DEBUG_TYPE "regalloc-score" + +RegAllocScore &RegAllocScore::operator+=(const RegAllocScore &Other) { + CopyCounts += Other.copyCounts(); + LoadCounts += Other.loadCounts(); + StoreCounts += Other.storeCounts(); + LoadStoreCounts += Other.loadStoreCounts(); + CheapRematCounts += Other.cheapRematCounts(); + ExpensiveRematCounts += Other.expensiveRematCounts(); + return *this; +} + +bool RegAllocScore::operator==(const RegAllocScore &Other) const { + return copyCounts() == Other.copyCounts() && + loadCounts() == Other.loadCounts() && + storeCounts() == Other.storeCounts() && + loadStoreCounts() == Other.loadStoreCounts() && + cheapRematCounts() == Other.cheapRematCounts() && + expensiveRematCounts() == Other.expensiveRematCounts(); +} + +bool RegAllocScore::operator!=(const RegAllocScore &Other) const { + return !(*this == Other); +} + +double RegAllocScore::getScore() const { + double Ret = 0.0; + Ret += CopyWeight * copyCounts(); + Ret += LoadWeight * loadCounts(); 
+ Ret += StoreWeight * storeCounts(); + Ret += (LoadWeight + StoreWeight) * loadStoreCounts(); + Ret += CheapRematWeight * cheapRematCounts(); + Ret += ExpensiveRematWeight * expensiveRematCounts(); + + return Ret; +} + +RegAllocScore +llvm::calculateRegAllocScore(const MachineFunction &MF, + const MachineBlockFrequencyInfo &MBFI, + AAResults &AAResults) { + return calculateRegAllocScore( + MF, + [&](const MachineBasicBlock &MBB) { + return MBFI.getBlockFreqRelativeToEntryBlock(&MBB); + }, + [&](const MachineInstr &MI) { + return MF.getSubtarget().getInstrInfo()->isTriviallyReMaterializable( + MI, &AAResults); + }); +} + +RegAllocScore llvm::calculateRegAllocScore( + const MachineFunction &MF, + llvm::function_ref<double(const MachineBasicBlock &)> GetBBFreq, + llvm::function_ref<bool(const MachineInstr &)> + IsTriviallyRematerializable) { + RegAllocScore Total; + + for (const MachineBasicBlock &MBB : MF) { + double BlockFreqRelativeToEntrypoint = GetBBFreq(MBB); + RegAllocScore MBBScore; + + for (const MachineInstr &MI : MBB) { + if (MI.isDebugInstr() || MI.isKill() || MI.isInlineAsm()) { + continue; + } + if (MI.isCopy()) { + MBBScore.onCopy(BlockFreqRelativeToEntrypoint); + } else if (IsTriviallyRematerializable(MI)) { + if (MI.getDesc().isAsCheapAsAMove()) { + MBBScore.onCheapRemat(BlockFreqRelativeToEntrypoint); + } else { + MBBScore.onExpensiveRemat(BlockFreqRelativeToEntrypoint); + } + } else if (MI.mayLoad() && MI.mayStore()) { + MBBScore.onLoadStore(BlockFreqRelativeToEntrypoint); + } else if (MI.mayLoad()) { + MBBScore.onLoad(BlockFreqRelativeToEntrypoint); + } else if (MI.mayStore()) { + MBBScore.onStore(BlockFreqRelativeToEntrypoint); + } + } + Total += MBBScore; + } + return Total; +} diff --git a/llvm/lib/CodeGen/RegAllocScore.h b/llvm/lib/CodeGen/RegAllocScore.h new file mode 100644 index 000000000000..3c28bb61189d --- /dev/null +++ b/llvm/lib/CodeGen/RegAllocScore.h @@ -0,0 +1,80 @@ +//==- RegAllocScore.h - evaluate regalloc policy quality 
----------*-C++-*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// Calculate a measure of the register allocation policy quality. This is used +/// to construct a reward for the training of the ML-driven allocation policy. +/// Currently, the score is the sum of the machine basic block frequency-weighed +/// number of loads, stores, copies, and remat instructions, each factored with +/// a relative weight. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_REGALLOCSCORE_H_ +#define LLVM_CODEGEN_REGALLOCSCORE_H_ + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/Analysis/Utils/TFUtils.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/IR/Module.h" +#include <cassert> +#include <cstdint> +#include <limits> + +namespace llvm { + +/// Regalloc score. 
+class RegAllocScore final { + double CopyCounts = 0.0; + double LoadCounts = 0.0; + double StoreCounts = 0.0; + double CheapRematCounts = 0.0; + double LoadStoreCounts = 0.0; + double ExpensiveRematCounts = 0.0; + +public: + RegAllocScore() = default; + RegAllocScore(const RegAllocScore &) = default; + + double copyCounts() const { return CopyCounts; } + double loadCounts() const { return LoadCounts; } + double storeCounts() const { return StoreCounts; } + double loadStoreCounts() const { return LoadStoreCounts; } + double expensiveRematCounts() const { return ExpensiveRematCounts; } + double cheapRematCounts() const { return CheapRematCounts; } + + void onCopy(double Freq) { CopyCounts += Freq; } + void onLoad(double Freq) { LoadCounts += Freq; } + void onStore(double Freq) { StoreCounts += Freq; } + void onLoadStore(double Freq) { LoadStoreCounts += Freq; } + void onExpensiveRemat(double Freq) { ExpensiveRematCounts += Freq; } + void onCheapRemat(double Freq) { CheapRematCounts += Freq; } + + RegAllocScore &operator+=(const RegAllocScore &Other); + bool operator==(const RegAllocScore &Other) const; + bool operator!=(const RegAllocScore &Other) const; + double getScore() const; +}; + +/// Calculate a score. When comparing 2 scores for the same function but +/// different policies, the better policy would have a smaller score. +/// The implementation is the overload below (which is also easily unittestable) +RegAllocScore calculateRegAllocScore(const MachineFunction &MF, + const MachineBlockFrequencyInfo &MBFI, + AAResults &AAResults); + +/// Implementation of the above, which is also more easily unittestable. 
+RegAllocScore calculateRegAllocScore( + const MachineFunction &MF, + llvm::function_ref<double(const MachineBasicBlock &)> GetBBFreq, + llvm::function_ref<bool(const MachineInstr &)> IsTriviallyRematerializable); +} // end namespace llvm + +#endif // LLVM_CODEGEN_REGALLOCSCORE_H_ diff --git a/llvm/lib/CodeGen/RegisterClassInfo.cpp b/llvm/lib/CodeGen/RegisterClassInfo.cpp index 797899fb5b86..65a65b9cae95 100644 --- a/llvm/lib/CodeGen/RegisterClassInfo.cpp +++ b/llvm/lib/CodeGen/RegisterClassInfo.cpp @@ -109,8 +109,7 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const { // FIXME: Once targets reserve registers instead of removing them from the // allocation order, we can simply use begin/end here. ArrayRef<MCPhysReg> RawOrder = RC->getRawAllocationOrder(*MF); - for (unsigned i = 0; i != RawOrder.size(); ++i) { - unsigned PhysReg = RawOrder[i]; + for (unsigned PhysReg : RawOrder) { // Remove reserved registers from the allocation order. if (Reserved.test(PhysReg)) continue; diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index 4c8534cf2d01..a917b0d27d4a 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -4067,13 +4067,13 @@ void RegisterCoalescer::joinAllIntervals() { // Coalesce intervals in MBB priority order. unsigned CurrDepth = std::numeric_limits<unsigned>::max(); - for (unsigned i = 0, e = MBBs.size(); i != e; ++i) { + for (MBBPriorityInfo &MBB : MBBs) { // Try coalescing the collected local copies for deeper loops. 
- if (JoinGlobalCopies && MBBs[i].Depth < CurrDepth) { + if (JoinGlobalCopies && MBB.Depth < CurrDepth) { coalesceLocals(); - CurrDepth = MBBs[i].Depth; + CurrDepth = MBB.Depth; } - copyCoalesceInMBB(MBBs[i].MBB); + copyCoalesceInMBB(MBB.MBB); } lateLiveIntervalUpdate(); coalesceLocals(); diff --git a/llvm/lib/CodeGen/RemoveRedundantDebugValues.cpp b/llvm/lib/CodeGen/RemoveRedundantDebugValues.cpp index de6129a912d3..49859aeec78b 100644 --- a/llvm/lib/CodeGen/RemoveRedundantDebugValues.cpp +++ b/llvm/lib/CodeGen/RemoveRedundantDebugValues.cpp @@ -159,20 +159,17 @@ static bool reduceDbgValsBackwardScan(MachineBasicBlock &MBB) { SmallVector<MachineInstr *, 8> DbgValsToBeRemoved; SmallDenseSet<DebugVariable> VariableSet; - for (MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend(); - I != E; ++I) { - MachineInstr *MI = &*I; - - if (MI->isDebugValue()) { - DebugVariable Var(MI->getDebugVariable(), MI->getDebugExpression(), - MI->getDebugLoc()->getInlinedAt()); + for (MachineInstr &MI : llvm::reverse(MBB)) { + if (MI.isDebugValue()) { + DebugVariable Var(MI.getDebugVariable(), MI.getDebugExpression(), + MI.getDebugLoc()->getInlinedAt()); auto R = VariableSet.insert(Var); // If it is a DBG_VALUE describing a constant as: // DBG_VALUE 0, ... // we just don't consider such instructions as candidates // for redundant removal. - if (MI->isNonListDebugValue()) { - MachineOperand &Loc = MI->getDebugOperand(0); + if (MI.isNonListDebugValue()) { + MachineOperand &Loc = MI.getDebugOperand(0); if (!Loc.isReg()) { // If we have already encountered this variable, just stop // tracking it. @@ -185,7 +182,7 @@ static bool reduceDbgValsBackwardScan(MachineBasicBlock &MBB) { // We have already encountered the value for this variable, // so this one can be deleted. 
if (!R.second) - DbgValsToBeRemoved.push_back(MI); + DbgValsToBeRemoved.push_back(&MI); continue; } diff --git a/llvm/lib/CodeGen/SafeStack.cpp b/llvm/lib/CodeGen/SafeStack.cpp index 50d9d64bfcfd..3d8a7eecce18 100644 --- a/llvm/lib/CodeGen/SafeStack.cpp +++ b/llvm/lib/CodeGen/SafeStack.cpp @@ -521,8 +521,7 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack( StackLayout SSL(StackAlignment); if (StackGuardSlot) { Type *Ty = StackGuardSlot->getAllocatedType(); - uint64_t Align = - std::max(DL.getPrefTypeAlignment(Ty), StackGuardSlot->getAlignment()); + Align Align = std::max(DL.getPrefTypeAlign(Ty), StackGuardSlot->getAlign()); SSL.addObject(StackGuardSlot, getStaticAllocaAllocationSize(StackGuardSlot), Align, SSC.getFullLiveRange()); } @@ -534,8 +533,9 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack( Size = 1; // Don't create zero-sized stack objects. // Ensure the object is properly aligned. - uint64_t Align = - std::max(DL.getPrefTypeAlignment(Ty), Arg->getParamAlignment()); + Align Align = DL.getPrefTypeAlign(Ty); + if (auto A = Arg->getParamAlign()) + Align = std::max(Align, *A); SSL.addObject(Arg, Size, Align, SSC.getFullLiveRange()); } @@ -546,24 +546,24 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack( Size = 1; // Don't create zero-sized stack objects. // Ensure the object is properly aligned. - uint64_t Align = std::max(DL.getPrefTypeAlignment(Ty), AI->getAlignment()); + Align Align = std::max(DL.getPrefTypeAlign(Ty), AI->getAlign()); SSL.addObject(AI, Size, Align, ClColoring ? SSC.getLiveRange(AI) : NoColoringRange); } SSL.computeLayout(); - uint64_t FrameAlignment = SSL.getFrameAlignment(); + Align FrameAlignment = SSL.getFrameAlignment(); // FIXME: tell SSL that we start at a less-then-MaxAlignment aligned location // (AlignmentSkew). if (FrameAlignment > StackAlignment) { // Re-align the base pointer according to the max requested alignment. 
- assert(isPowerOf2_64(FrameAlignment)); IRB.SetInsertPoint(BasePointer->getNextNode()); BasePointer = cast<Instruction>(IRB.CreateIntToPtr( - IRB.CreateAnd(IRB.CreatePtrToInt(BasePointer, IntPtrTy), - ConstantInt::get(IntPtrTy, ~uint64_t(FrameAlignment - 1))), + IRB.CreateAnd( + IRB.CreatePtrToInt(BasePointer, IntPtrTy), + ConstantInt::get(IntPtrTy, ~(FrameAlignment.value() - 1))), StackPtrTy)); } diff --git a/llvm/lib/CodeGen/SafeStackLayout.cpp b/llvm/lib/CodeGen/SafeStackLayout.cpp index 7cdda7743c16..602afcfa9001 100644 --- a/llvm/lib/CodeGen/SafeStackLayout.cpp +++ b/llvm/lib/CodeGen/SafeStackLayout.cpp @@ -37,7 +37,7 @@ LLVM_DUMP_METHOD void StackLayout::print(raw_ostream &OS) { } } -void StackLayout::addObject(const Value *V, unsigned Size, uint64_t Alignment, +void StackLayout::addObject(const Value *V, unsigned Size, Align Alignment, const StackLifetime::LiveRange &Range) { StackObjects.push_back({V, Size, Alignment, Range}); ObjectAlignments[V] = Alignment; @@ -45,7 +45,7 @@ void StackLayout::addObject(const Value *V, unsigned Size, uint64_t Alignment, } static unsigned AdjustStackOffset(unsigned Offset, unsigned Size, - uint64_t Alignment) { + Align Alignment) { return alignTo(Offset + Size, Alignment) - Size; } @@ -62,7 +62,8 @@ void StackLayout::layoutObject(StackObject &Obj) { } LLVM_DEBUG(dbgs() << "Layout: size " << Obj.Size << ", align " - << Obj.Alignment << ", range " << Obj.Range << "\n"); + << Obj.Alignment.value() << ", range " << Obj.Range + << "\n"); assert(Obj.Alignment <= MaxAlignment); unsigned Start = AdjustStackOffset(0, Obj.Size, Obj.Alignment); unsigned End = Start + Obj.Size; diff --git a/llvm/lib/CodeGen/SafeStackLayout.h b/llvm/lib/CodeGen/SafeStackLayout.h index b72450e57080..4ac7af2059f5 100644 --- a/llvm/lib/CodeGen/SafeStackLayout.h +++ b/llvm/lib/CodeGen/SafeStackLayout.h @@ -22,7 +22,7 @@ namespace safestack { /// Compute the layout of an unsafe stack frame. 
class StackLayout { - uint64_t MaxAlignment; + Align MaxAlignment; struct StackRegion { unsigned Start; @@ -40,14 +40,14 @@ class StackLayout { struct StackObject { const Value *Handle; unsigned Size; - uint64_t Alignment; + Align Alignment; StackLifetime::LiveRange Range; }; SmallVector<StackObject, 8> StackObjects; DenseMap<const Value *, unsigned> ObjectOffsets; - DenseMap<const Value *, uint64_t> ObjectAlignments; + DenseMap<const Value *, Align> ObjectAlignments; void layoutObject(StackObject &Obj); @@ -56,7 +56,7 @@ public: /// Add an object to the stack frame. Value pointer is opaque and used as a /// handle to retrieve the object's offset in the frame later. - void addObject(const Value *V, unsigned Size, uint64_t Alignment, + void addObject(const Value *V, unsigned Size, Align Alignment, const StackLifetime::LiveRange &Range); /// Run the layout computation for all previously added objects. @@ -66,13 +66,13 @@ public: unsigned getObjectOffset(const Value *V) { return ObjectOffsets[V]; } /// Returns the alignment of the object - uint64_t getObjectAlignment(const Value *V) { return ObjectAlignments[V]; } + Align getObjectAlignment(const Value *V) { return ObjectAlignments[V]; } /// Returns the size of the entire frame. unsigned getFrameSize() { return Regions.empty() ? 0 : Regions.back().End; } /// Returns the alignment of the frame. 
- uint64_t getFrameAlignment() { return MaxAlignment; } + Align getFrameAlignment() { return MaxAlignment; } void print(raw_ostream &OS); }; diff --git a/llvm/lib/CodeGen/ScheduleDAG.cpp b/llvm/lib/CodeGen/ScheduleDAG.cpp index ef3afab2b730..696b29018ae6 100644 --- a/llvm/lib/CodeGen/ScheduleDAG.cpp +++ b/llvm/lib/CodeGen/ScheduleDAG.cpp @@ -618,8 +618,8 @@ std::vector<int> ScheduleDAGTopologicalSort::GetSubGraph(const SUnit &StartSU, do { const SUnit *SU = WorkList.back(); WorkList.pop_back(); - for (int I = SU->Succs.size()-1; I >= 0; --I) { - const SUnit *Succ = SU->Succs[I].getSUnit(); + for (const SDep &SD : llvm::reverse(SU->Succs)) { + const SUnit *Succ = SD.getSUnit(); unsigned s = Succ->NodeNum; // Edges to non-SUnits are allowed but ignored (e.g. ExitSU). if (Succ->isBoundaryNode()) @@ -652,8 +652,8 @@ std::vector<int> ScheduleDAGTopologicalSort::GetSubGraph(const SUnit &StartSU, do { const SUnit *SU = WorkList.back(); WorkList.pop_back(); - for (int I = SU->Preds.size()-1; I >= 0; --I) { - const SUnit *Pred = SU->Preds[I].getSUnit(); + for (const SDep &SD : llvm::reverse(SU->Preds)) { + const SUnit *Pred = SD.getSUnit(); unsigned s = Pred->NodeNum; // Edges to non-SUnits are allowed but ignored (e.g. EntrySU). 
if (Pred->isBoundaryNode()) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index df5a041b87cd..067ad819e0d2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -487,10 +487,7 @@ namespace { SDValue visitFCEIL(SDNode *N); SDValue visitFTRUNC(SDNode *N); SDValue visitFFLOOR(SDNode *N); - SDValue visitFMINNUM(SDNode *N); - SDValue visitFMAXNUM(SDNode *N); - SDValue visitFMINIMUM(SDNode *N); - SDValue visitFMAXIMUM(SDNode *N); + SDValue visitFMinMax(SDNode *N); SDValue visitBRCOND(SDNode *N); SDValue visitBR_CC(SDNode *N); SDValue visitLOAD(SDNode *N); @@ -1701,10 +1698,10 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::FNEG: return visitFNEG(N); case ISD::FABS: return visitFABS(N); case ISD::FFLOOR: return visitFFLOOR(N); - case ISD::FMINNUM: return visitFMINNUM(N); - case ISD::FMAXNUM: return visitFMAXNUM(N); - case ISD::FMINIMUM: return visitFMINIMUM(N); - case ISD::FMAXIMUM: return visitFMAXIMUM(N); + case ISD::FMINNUM: + case ISD::FMAXNUM: + case ISD::FMINIMUM: + case ISD::FMAXIMUM: return visitFMinMax(N); case ISD::FCEIL: return visitFCEIL(N); case ISD::FTRUNC: return visitFTRUNC(N); case ISD::BRCOND: return visitBRCOND(N); @@ -2260,6 +2257,21 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) { EVT VT = N0.getValueType(); SDLoc DL(N); + // fold (add x, undef) -> undef + if (N0.isUndef()) + return N0; + if (N1.isUndef()) + return N1; + + // fold (add c1, c2) -> c1+c2 + if (SDValue C = DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, {N0, N1})) + return C; + + // canonicalize constant to RHS + if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && + !DAG.isConstantIntBuildVectorOrConstantInt(N1)) + return DAG.getNode(ISD::ADD, DL, VT, N1, N0); + // fold vector ops if (VT.isVector()) { if (SDValue FoldedVOp = SimplifyVBinOp(N, DL)) @@ -2268,23 +2280,6 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) { // fold (add x, 0) -> x, vector edition if 
(ISD::isConstantSplatVectorAllZeros(N1.getNode())) return N0; - if (ISD::isConstantSplatVectorAllZeros(N0.getNode())) - return N1; - } - - // fold (add x, undef) -> undef - if (N0.isUndef()) - return N0; - - if (N1.isUndef()) - return N1; - - if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) { - // canonicalize constant to RHS - if (!DAG.isConstantIntBuildVectorOrConstantInt(N1)) - return DAG.getNode(ISD::ADD, DL, VT, N1, N0); - // fold (add c1, c2) -> c1+c2 - return DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, {N0, N1}); } // fold (add x, 0) -> x @@ -2554,6 +2549,19 @@ SDValue DAGCombiner::visitADDSAT(SDNode *N) { EVT VT = N0.getValueType(); SDLoc DL(N); + // fold (add_sat x, undef) -> -1 + if (N0.isUndef() || N1.isUndef()) + return DAG.getAllOnesConstant(DL, VT); + + // fold (add_sat c1, c2) -> c3 + if (SDValue C = DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1})) + return C; + + // canonicalize constant to RHS + if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && + !DAG.isConstantIntBuildVectorOrConstantInt(N1)) + return DAG.getNode(Opcode, DL, VT, N1, N0); + // fold vector ops if (VT.isVector()) { // TODO SimplifyVBinOp @@ -2561,20 +2569,6 @@ SDValue DAGCombiner::visitADDSAT(SDNode *N) { // fold (add_sat x, 0) -> x, vector edition if (ISD::isConstantSplatVectorAllZeros(N1.getNode())) return N0; - if (ISD::isConstantSplatVectorAllZeros(N0.getNode())) - return N1; - } - - // fold (add_sat x, undef) -> -1 - if (N0.isUndef() || N1.isUndef()) - return DAG.getAllOnesConstant(DL, VT); - - if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) { - // canonicalize constant to RHS - if (!DAG.isConstantIntBuildVectorOrConstantInt(N1)) - return DAG.getNode(Opcode, DL, VT, N1, N0); - // fold (add_sat c1, c2) -> c3 - return DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1}); } // fold (add_sat x, 0) -> x @@ -3260,6 +3254,15 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { EVT VT = N0.getValueType(); SDLoc DL(N); + // fold (sub x, x) -> 0 + // FIXME: Refactor this and xor 
and other similar operations together. + if (N0 == N1) + return tryFoldToZero(DL, TLI, VT, DAG, LegalOperations); + + // fold (sub c1, c2) -> c3 + if (SDValue C = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0, N1})) + return C; + // fold vector ops if (VT.isVector()) { if (SDValue FoldedVOp = SimplifyVBinOp(N, DL)) @@ -3270,15 +3273,6 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { return N0; } - // fold (sub x, x) -> 0 - // FIXME: Refactor this and xor and other similar operations together. - if (N0 == N1) - return tryFoldToZero(DL, TLI, VT, DAG, LegalOperations); - - // fold (sub c1, c2) -> c3 - if (SDValue C = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0, N1})) - return C; - if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; @@ -3611,15 +3605,6 @@ SDValue DAGCombiner::visitSUBSAT(SDNode *N) { EVT VT = N0.getValueType(); SDLoc DL(N); - // fold vector ops - if (VT.isVector()) { - // TODO SimplifyVBinOp - - // fold (sub_sat x, 0) -> x, vector edition - if (ISD::isConstantSplatVectorAllZeros(N1.getNode())) - return N0; - } - // fold (sub_sat x, undef) -> 0 if (N0.isUndef() || N1.isUndef()) return DAG.getConstant(0, DL, VT); @@ -3632,6 +3617,15 @@ SDValue DAGCombiner::visitSUBSAT(SDNode *N) { if (SDValue C = DAG.FoldConstantArithmetic(N->getOpcode(), DL, VT, {N0, N1})) return C; + // fold vector ops + if (VT.isVector()) { + // TODO SimplifyVBinOp + + // fold (sub_sat x, 0) -> x, vector edition + if (ISD::isConstantSplatVectorAllZeros(N1.getNode())) + return N0; + } + // fold (sub_sat x, 0) -> x if (isNullConstant(N1)) return N0; @@ -3781,6 +3775,15 @@ SDValue DAGCombiner::visitMUL(SDNode *N) { if (N0.isUndef() || N1.isUndef()) return DAG.getConstant(0, SDLoc(N), VT); + // fold (mul c1, c2) -> c1*c2 + if (SDValue C = DAG.FoldConstantArithmetic(ISD::MUL, SDLoc(N), VT, {N0, N1})) + return C; + + // canonicalize constant to RHS (vector doesn't have to splat) + if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && + 
!DAG.isConstantIntBuildVectorOrConstantInt(N1)) + return DAG.getNode(ISD::MUL, SDLoc(N), VT, N1, N0); + bool N1IsConst = false; bool N1IsOpaqueConst = false; APInt ConstValue1; @@ -3802,15 +3805,6 @@ SDValue DAGCombiner::visitMUL(SDNode *N) { } } - // fold (mul c1, c2) -> c1*c2 - if (SDValue C = DAG.FoldConstantArithmetic(ISD::MUL, SDLoc(N), VT, {N0, N1})) - return C; - - // canonicalize constant to RHS (vector doesn't have to splat) - if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && - !DAG.isConstantIntBuildVectorOrConstantInt(N1)) - return DAG.getNode(ISD::MUL, SDLoc(N), VT, N1, N0); - // fold (mul x, 0) -> 0 if (N1IsConst && ConstValue1.isZero()) return N1; @@ -4140,17 +4134,17 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) { EVT CCVT = getSetCCResultType(VT); SDLoc DL(N); + // fold (sdiv c1, c2) -> c1/c2 + if (SDValue C = DAG.FoldConstantArithmetic(ISD::SDIV, DL, VT, {N0, N1})) + return C; + // fold vector ops if (VT.isVector()) if (SDValue FoldedVOp = SimplifyVBinOp(N, DL)) return FoldedVOp; - // fold (sdiv c1, c2) -> c1/c2 - ConstantSDNode *N1C = isConstOrConstSplat(N1); - if (SDValue C = DAG.FoldConstantArithmetic(ISD::SDIV, DL, VT, {N0, N1})) - return C; - // fold (sdiv X, -1) -> 0-X + ConstantSDNode *N1C = isConstOrConstSplat(N1); if (N1C && N1C->isAllOnes()) return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), N0); @@ -4284,17 +4278,17 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) { EVT CCVT = getSetCCResultType(VT); SDLoc DL(N); + // fold (udiv c1, c2) -> c1/c2 + if (SDValue C = DAG.FoldConstantArithmetic(ISD::UDIV, DL, VT, {N0, N1})) + return C; + // fold vector ops if (VT.isVector()) if (SDValue FoldedVOp = SimplifyVBinOp(N, DL)) return FoldedVOp; - // fold (udiv c1, c2) -> c1/c2 - ConstantSDNode *N1C = isConstOrConstSplat(N1); - if (SDValue C = DAG.FoldConstantArithmetic(ISD::UDIV, DL, VT, {N0, N1})) - return C; - // fold (udiv X, -1) -> select(X == -1, 1, 0) + ConstantSDNode *N1C = isConstOrConstSplat(N1); if (N1C && N1C->isAllOnes()) 
return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, N0, N1, ISD::SETEQ), DAG.getConstant(1, DL, VT), @@ -4463,6 +4457,15 @@ SDValue DAGCombiner::visitMULHS(SDNode *N) { EVT VT = N->getValueType(0); SDLoc DL(N); + // fold (mulhs c1, c2) + if (SDValue C = DAG.FoldConstantArithmetic(ISD::MULHS, DL, VT, {N0, N1})) + return C; + + // canonicalize constant to RHS. + if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && + !DAG.isConstantIntBuildVectorOrConstantInt(N1)) + return DAG.getNode(ISD::MULHS, DL, N->getVTList(), N1, N0); + if (VT.isVector()) { if (SDValue FoldedVOp = SimplifyVBinOp(N, DL)) return FoldedVOp; @@ -4474,15 +4477,6 @@ SDValue DAGCombiner::visitMULHS(SDNode *N) { return DAG.getConstant(0, DL, VT); } - // fold (mulhs c1, c2) - if (SDValue C = DAG.FoldConstantArithmetic(ISD::MULHS, DL, VT, {N0, N1})) - return C; - - // canonicalize constant to RHS. - if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && - !DAG.isConstantIntBuildVectorOrConstantInt(N1)) - return DAG.getNode(ISD::MULHS, DL, N->getVTList(), N1, N0); - // fold (mulhs x, 0) -> 0 if (isNullConstant(N1)) return N1; @@ -4523,6 +4517,15 @@ SDValue DAGCombiner::visitMULHU(SDNode *N) { EVT VT = N->getValueType(0); SDLoc DL(N); + // fold (mulhu c1, c2) + if (SDValue C = DAG.FoldConstantArithmetic(ISD::MULHU, DL, VT, {N0, N1})) + return C; + + // canonicalize constant to RHS. + if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && + !DAG.isConstantIntBuildVectorOrConstantInt(N1)) + return DAG.getNode(ISD::MULHU, DL, N->getVTList(), N1, N0); + if (VT.isVector()) { if (SDValue FoldedVOp = SimplifyVBinOp(N, DL)) return FoldedVOp; @@ -4534,15 +4537,6 @@ SDValue DAGCombiner::visitMULHU(SDNode *N) { return DAG.getConstant(0, DL, VT); } - // fold (mulhu c1, c2) - if (SDValue C = DAG.FoldConstantArithmetic(ISD::MULHU, DL, VT, {N0, N1})) - return C; - - // canonicalize constant to RHS. 
- if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && - !DAG.isConstantIntBuildVectorOrConstantInt(N1)) - return DAG.getNode(ISD::MULHU, DL, N->getVTList(), N1, N0); - // fold (mulhu x, 0) -> 0 if (isNullConstant(N1)) return N1; @@ -4786,12 +4780,14 @@ SDValue DAGCombiner::visitMULO(SDNode *N) { } // Function to calculate whether the Min/Max pair of SDNodes (potentially -// swapped around) make a signed saturate pattern, clamping to between -2^(BW-1) -// and 2^(BW-1)-1. Returns the node being clamped and the bitwidth of the clamp -// in BW. Should work with both SMIN/SMAX nodes and setcc/select combo. The -// operands are the same as SimplifySelectCC. N0<N1 ? N2 : N3 +// swapped around) make a signed saturate pattern, clamping to between a signed +// saturate of -2^(BW-1) and 2^(BW-1)-1, or an unsigned saturate of 0 and 2^BW. +// Returns the node being clamped and the bitwidth of the clamp in BW. Should +// work with both SMIN/SMAX nodes and setcc/select combo. The operands are the +// same as SimplifySelectCC. N0<N1 ? N2 : N3. 
static SDValue isSaturatingMinMax(SDValue N0, SDValue N1, SDValue N2, - SDValue N3, ISD::CondCode CC, unsigned &BW) { + SDValue N3, ISD::CondCode CC, unsigned &BW, + bool &Unsigned) { auto isSignedMinMax = [&](SDValue N0, SDValue N1, SDValue N2, SDValue N3, ISD::CondCode CC) { // The compare and select operand should be the same or the select operands @@ -4858,17 +4854,27 @@ static SDValue isSaturatingMinMax(SDValue N0, SDValue N1, SDValue N2, const APInt &MinC = MinCOp->getAPIntValue(); const APInt &MaxC = MaxCOp->getAPIntValue(); APInt MinCPlus1 = MinC + 1; - if (-MaxC != MinCPlus1 || !MinCPlus1.isPowerOf2()) - return SDValue(); - BW = MinCPlus1.exactLogBase2() + 1; - return N02; + if (-MaxC == MinCPlus1 && MinCPlus1.isPowerOf2()) { + BW = MinCPlus1.exactLogBase2() + 1; + Unsigned = false; + return N02; + } + + if (MaxC == 0 && MinCPlus1.isPowerOf2()) { + BW = MinCPlus1.exactLogBase2(); + Unsigned = true; + return N02; + } + + return SDValue(); } static SDValue PerformMinMaxFpToSatCombine(SDValue N0, SDValue N1, SDValue N2, SDValue N3, ISD::CondCode CC, SelectionDAG &DAG) { unsigned BW; - SDValue Fp = isSaturatingMinMax(N0, N1, N2, N3, CC, BW); + bool Unsigned; + SDValue Fp = isSaturatingMinMax(N0, N1, N2, N3, CC, BW, Unsigned); if (!Fp || Fp.getOpcode() != ISD::FP_TO_SINT) return SDValue(); EVT FPVT = Fp.getOperand(0).getValueType(); @@ -4876,13 +4882,14 @@ static SDValue PerformMinMaxFpToSatCombine(SDValue N0, SDValue N1, SDValue N2, if (FPVT.isVector()) NewVT = EVT::getVectorVT(*DAG.getContext(), NewVT, FPVT.getVectorElementCount()); - if (!DAG.getTargetLoweringInfo().shouldConvertFpToSat( - ISD::FP_TO_SINT_SAT, Fp.getOperand(0).getValueType(), NewVT)) + unsigned NewOpc = Unsigned ? 
ISD::FP_TO_UINT_SAT : ISD::FP_TO_SINT_SAT; + if (!DAG.getTargetLoweringInfo().shouldConvertFpToSat(NewOpc, FPVT, NewVT)) return SDValue(); SDLoc DL(Fp); - SDValue Sat = DAG.getNode(ISD::FP_TO_SINT_SAT, DL, NewVT, Fp.getOperand(0), + SDValue Sat = DAG.getNode(NewOpc, DL, NewVT, Fp.getOperand(0), DAG.getValueType(NewVT.getScalarType())); - return DAG.getSExtOrTrunc(Sat, DL, N2->getValueType(0)); + return Unsigned ? DAG.getZExtOrTrunc(Sat, DL, N2->getValueType(0)) + : DAG.getSExtOrTrunc(Sat, DL, N2->getValueType(0)); } SDValue DAGCombiner::visitIMINMAX(SDNode *N) { @@ -4892,11 +4899,6 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) { unsigned Opcode = N->getOpcode(); SDLoc DL(N); - // fold vector ops - if (VT.isVector()) - if (SDValue FoldedVOp = SimplifyVBinOp(N, DL)) - return FoldedVOp; - // fold operation with constant operands. if (SDValue C = DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1})) return C; @@ -4904,7 +4906,12 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) { // canonicalize constant to RHS if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && !DAG.isConstantIntBuildVectorOrConstantInt(N1)) - return DAG.getNode(N->getOpcode(), DL, VT, N1, N0); + return DAG.getNode(Opcode, DL, VT, N1, N0); + + // fold vector ops + if (VT.isVector()) + if (SDValue FoldedVOp = SimplifyVBinOp(N, DL)) + return FoldedVOp; // Is sign bits are zero, flip between UMIN/UMAX and SMIN/SMAX. // Only do this if the current op isn't legal and the flipped is. 
@@ -5777,6 +5784,15 @@ SDValue DAGCombiner::visitAND(SDNode *N) { if (N0 == N1) return N0; + // fold (and c1, c2) -> c1&c2 + if (SDValue C = DAG.FoldConstantArithmetic(ISD::AND, SDLoc(N), VT, {N0, N1})) + return C; + + // canonicalize constant to RHS + if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && + !DAG.isConstantIntBuildVectorOrConstantInt(N1)) + return DAG.getNode(ISD::AND, SDLoc(N), VT, N1, N0); + // fold vector ops if (VT.isVector()) { if (SDValue FoldedVOp = SimplifyVBinOp(N, SDLoc(N))) @@ -5824,22 +5840,13 @@ SDValue DAGCombiner::visitAND(SDNode *N) { } } - // fold (and c1, c2) -> c1&c2 - ConstantSDNode *N1C = isConstOrConstSplat(N1); - if (SDValue C = DAG.FoldConstantArithmetic(ISD::AND, SDLoc(N), VT, {N0, N1})) - return C; - - // canonicalize constant to RHS - if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && - !DAG.isConstantIntBuildVectorOrConstantInt(N1)) - return DAG.getNode(ISD::AND, SDLoc(N), VT, N1, N0); - // fold (and x, -1) -> x if (isAllOnesConstant(N1)) return N0; // if (and x, c) is known to be zero, return 0 unsigned BitWidth = VT.getScalarSizeInBits(); + ConstantSDNode *N1C = isConstOrConstSplat(N1); if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnes(BitWidth))) return DAG.getConstant(0, SDLoc(N), VT); @@ -6546,21 +6553,25 @@ SDValue DAGCombiner::visitOR(SDNode *N) { if (N0 == N1) return N0; + // fold (or c1, c2) -> c1|c2 + if (SDValue C = DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N), VT, {N0, N1})) + return C; + + // canonicalize constant to RHS + if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && + !DAG.isConstantIntBuildVectorOrConstantInt(N1)) + return DAG.getNode(ISD::OR, SDLoc(N), VT, N1, N0); + // fold vector ops if (VT.isVector()) { if (SDValue FoldedVOp = SimplifyVBinOp(N, SDLoc(N))) return FoldedVOp; // fold (or x, 0) -> x, vector edition - if (ISD::isConstantSplatVectorAllZeros(N0.getNode())) - return N1; if (ISD::isConstantSplatVectorAllZeros(N1.getNode())) return N0; // fold (or x, -1) -> -1, vector 
edition - if (ISD::isConstantSplatVectorAllOnes(N0.getNode())) - // do not return N0, because undef node may exist in N0 - return DAG.getAllOnesConstant(SDLoc(N), N0.getValueType()); if (ISD::isConstantSplatVectorAllOnes(N1.getNode())) // do not return N1, because undef node may exist in N1 return DAG.getAllOnesConstant(SDLoc(N), N1.getValueType()); @@ -6629,16 +6640,6 @@ SDValue DAGCombiner::visitOR(SDNode *N) { } } - // fold (or c1, c2) -> c1|c2 - ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); - if (SDValue C = DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N), VT, {N0, N1})) - return C; - - // canonicalize constant to RHS - if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && - !DAG.isConstantIntBuildVectorOrConstantInt(N1)) - return DAG.getNode(ISD::OR, SDLoc(N), VT, N1, N0); - // fold (or x, 0) -> x if (isNullConstant(N1)) return N0; @@ -6651,6 +6652,7 @@ SDValue DAGCombiner::visitOR(SDNode *N) { return NewSel; // fold (or x, c) -> c iff (x & ~c) == 0 + ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); if (N1C && DAG.MaskedValueIsZero(N0, ~N1C->getAPIntValue())) return N1; @@ -7941,18 +7943,6 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { EVT VT = N0.getValueType(); SDLoc DL(N); - // fold vector ops - if (VT.isVector()) { - if (SDValue FoldedVOp = SimplifyVBinOp(N, DL)) - return FoldedVOp; - - // fold (xor x, 0) -> x, vector edition - if (ISD::isConstantSplatVectorAllZeros(N0.getNode())) - return N1; - if (ISD::isConstantSplatVectorAllZeros(N1.getNode())) - return N0; - } - // fold (xor undef, undef) -> 0. This is a common idiom (misuse). 
if (N0.isUndef() && N1.isUndef()) return DAG.getConstant(0, DL, VT); @@ -7969,9 +7959,19 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { // canonicalize constant to RHS if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && - !DAG.isConstantIntBuildVectorOrConstantInt(N1)) + !DAG.isConstantIntBuildVectorOrConstantInt(N1)) return DAG.getNode(ISD::XOR, DL, VT, N1, N0); + // fold vector ops + if (VT.isVector()) { + if (SDValue FoldedVOp = SimplifyVBinOp(N, DL)) + return FoldedVOp; + + // fold (xor x, 0) -> x, vector edition + if (ISD::isConstantSplatVectorAllZeros(N1.getNode())) + return N0; + } + // fold (xor x, 0) -> x if (isNullConstant(N1)) return N0; @@ -8409,6 +8409,10 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { EVT ShiftVT = N1.getValueType(); unsigned OpSizeInBits = VT.getScalarSizeInBits(); + // fold (shl c1, c2) -> c1<<c2 + if (SDValue C = DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, {N0, N1})) + return C; + // fold vector ops if (VT.isVector()) { if (SDValue FoldedVOp = SimplifyVBinOp(N, SDLoc(N))) @@ -8434,12 +8438,6 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { } } - ConstantSDNode *N1C = isConstOrConstSplat(N1); - - // fold (shl c1, c2) -> c1<<c2 - if (SDValue C = DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, {N0, N1})) - return C; - if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; @@ -8558,6 +8556,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { // fold (shl (sr[la] exact X, C1), C2) -> (shl X, (C2-C1)) if C1 <= C2 // fold (shl (sr[la] exact X, C1), C2) -> (sr[la] X, (C2-C1)) if C1 > C2 // TODO - support non-uniform vector shift amounts. 
+ ConstantSDNode *N1C = isConstOrConstSplat(N1); if (N1C && (N0.getOpcode() == ISD::SRL || N0.getOpcode() == ISD::SRA) && N0->getFlags().hasExact()) { if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) { @@ -8758,6 +8757,10 @@ SDValue DAGCombiner::visitSRA(SDNode *N) { EVT VT = N0.getValueType(); unsigned OpSizeInBits = VT.getScalarSizeInBits(); + // fold (sra c1, c2) -> (sra c1, c2) + if (SDValue C = DAG.FoldConstantArithmetic(ISD::SRA, SDLoc(N), VT, {N0, N1})) + return C; + // Arithmetic shifting an all-sign-bit value is a no-op. // fold (sra 0, x) -> 0 // fold (sra -1, x) -> -1 @@ -8769,17 +8772,12 @@ SDValue DAGCombiner::visitSRA(SDNode *N) { if (SDValue FoldedVOp = SimplifyVBinOp(N, SDLoc(N))) return FoldedVOp; - ConstantSDNode *N1C = isConstOrConstSplat(N1); - - // fold (sra c1, c2) -> (sra c1, c2) - if (SDValue C = DAG.FoldConstantArithmetic(ISD::SRA, SDLoc(N), VT, {N0, N1})) - return C; - if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; // fold (sra (shl x, c1), c1) -> sext_inreg for some c1 and target supports // sext_inreg. 
+ ConstantSDNode *N1C = isConstOrConstSplat(N1); if (N1C && N0.getOpcode() == ISD::SHL && N1 == N0.getOperand(1)) { unsigned LowBits = OpSizeInBits - (unsigned)N1C->getZExtValue(); EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), LowBits); @@ -8962,21 +8960,20 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { EVT VT = N0.getValueType(); unsigned OpSizeInBits = VT.getScalarSizeInBits(); + // fold (srl c1, c2) -> c1 >>u c2 + if (SDValue C = DAG.FoldConstantArithmetic(ISD::SRL, SDLoc(N), VT, {N0, N1})) + return C; + // fold vector ops if (VT.isVector()) if (SDValue FoldedVOp = SimplifyVBinOp(N, SDLoc(N))) return FoldedVOp; - ConstantSDNode *N1C = isConstOrConstSplat(N1); - - // fold (srl c1, c2) -> c1 >>u c2 - if (SDValue C = DAG.FoldConstantArithmetic(ISD::SRL, SDLoc(N), VT, {N0, N1})) - return C; - if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; // if (srl x, c) is known to be zero, return 0 + ConstantSDNode *N1C = isConstOrConstSplat(N1); if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnes(OpSizeInBits))) return DAG.getConstant(0, SDLoc(N), VT); @@ -10043,6 +10040,8 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) { MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N); SDValue Mask = MST->getMask(); SDValue Chain = MST->getChain(); + SDValue Value = MST->getValue(); + SDValue Ptr = MST->getBasePtr(); SDLoc DL(N); // Zap masked stores with a zero mask. @@ -10054,12 +10053,50 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) { if (ISD::isConstantSplatVectorAllOnes(Mask.getNode()) && MST->isUnindexed() && !MST->isCompressingStore() && !MST->isTruncatingStore()) return DAG.getStore(MST->getChain(), SDLoc(N), MST->getValue(), - MST->getBasePtr(), MST->getMemOperand()); + MST->getBasePtr(), MST->getPointerInfo(), + MST->getOriginalAlign(), MachineMemOperand::MOStore, + MST->getAAInfo()); // Try transforming N to an indexed store. 
if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N)) return SDValue(N, 0); + if (MST->isTruncatingStore() && MST->isUnindexed() && + Value.getValueType().isInteger() && + (!isa<ConstantSDNode>(Value) || + !cast<ConstantSDNode>(Value)->isOpaque())) { + APInt TruncDemandedBits = + APInt::getLowBitsSet(Value.getScalarValueSizeInBits(), + MST->getMemoryVT().getScalarSizeInBits()); + + // See if we can simplify the operation with + // SimplifyDemandedBits, which only works if the value has a single use. + if (SimplifyDemandedBits(Value, TruncDemandedBits)) { + // Re-visit the store if anything changed and the store hasn't been merged + // with another node (N is deleted) SimplifyDemandedBits will add Value's + // node back to the worklist if necessary, but we also need to re-visit + // the Store node itself. + if (N->getOpcode() != ISD::DELETED_NODE) + AddToWorklist(N); + return SDValue(N, 0); + } + } + + // If this is a TRUNC followed by a masked store, fold this into a masked + // truncating store. We can do this even if this is already a masked + // truncstore. + if ((Value.getOpcode() == ISD::TRUNCATE) && Value.getNode()->hasOneUse() && + MST->isUnindexed() && + TLI.canCombineTruncStore(Value.getOperand(0).getValueType(), + MST->getMemoryVT(), LegalOperations)) { + auto Mask = TLI.promoteTargetBoolean(DAG, MST->getMask(), + Value.getOperand(0).getValueType()); + return DAG.getMaskedStore(Chain, SDLoc(N), Value.getOperand(0), Ptr, + MST->getOffset(), Mask, MST->getMemoryVT(), + MST->getMemOperand(), MST->getAddressingMode(), + /*IsTruncating=*/true); + } + return SDValue(); } @@ -10109,8 +10146,10 @@ SDValue DAGCombiner::visitMLOAD(SDNode *N) { // FIXME: Can we do this for indexed, expanding, or extending loads? 
if (ISD::isConstantSplatVectorAllOnes(Mask.getNode()) && MLD->isUnindexed() && !MLD->isExpandingLoad() && MLD->getExtensionType() == ISD::NON_EXTLOAD) { - SDValue NewLd = DAG.getLoad(N->getValueType(0), SDLoc(N), MLD->getChain(), - MLD->getBasePtr(), MLD->getMemOperand()); + SDValue NewLd = DAG.getLoad( + N->getValueType(0), SDLoc(N), MLD->getChain(), MLD->getBasePtr(), + MLD->getPointerInfo(), MLD->getOriginalAlign(), + MachineMemOperand::MOLoad, MLD->getAAInfo(), MLD->getRanges()); return CombineTo(N, NewLd, NewLd.getValue(1)); } @@ -13876,19 +13915,19 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags)) return R; - // fold vector ops - if (VT.isVector()) - if (SDValue FoldedVOp = SimplifyVBinOp(N, DL)) - return FoldedVOp; - // fold (fadd c1, c2) -> c1 + c2 - if (N0CFP && N1CFP) - return DAG.getNode(ISD::FADD, DL, VT, N0, N1); + if (SDValue C = DAG.FoldConstantArithmetic(ISD::FADD, DL, VT, {N0, N1})) + return C; // canonicalize constant to RHS if (N0CFP && !N1CFP) return DAG.getNode(ISD::FADD, DL, VT, N1, N0); + // fold vector ops + if (VT.isVector()) + if (SDValue FoldedVOp = SimplifyVBinOp(N, DL)) + return FoldedVOp; + // N0 + -0.0 --> N0 (also allowed with +0.0 and fast-math) ConstantFPSDNode *N1C = isConstOrConstSplatFP(N1, true); if (N1C && N1C->isZero()) @@ -14084,15 +14123,15 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags)) return R; + // fold (fsub c1, c2) -> c1-c2 + if (SDValue C = DAG.FoldConstantArithmetic(ISD::FSUB, DL, VT, {N0, N1})) + return C; + // fold vector ops if (VT.isVector()) if (SDValue FoldedVOp = SimplifyVBinOp(N, DL)) return FoldedVOp; - // fold (fsub c1, c2) -> c1-c2 - if (N0CFP && N1CFP) - return DAG.getNode(ISD::FSUB, DL, VT, N0, N1); - if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; @@ -14157,7 +14196,6 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { SDValue DAGCombiner::visitFMUL(SDNode *N) { 
SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0, true); ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1, true); EVT VT = N->getValueType(0); SDLoc DL(N); @@ -14168,22 +14206,20 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) { if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags)) return R; - // fold vector ops - if (VT.isVector()) { - // This just handles C1 * C2 for vectors. Other vector folds are below. - if (SDValue FoldedVOp = SimplifyVBinOp(N, DL)) - return FoldedVOp; - } - // fold (fmul c1, c2) -> c1*c2 - if (N0CFP && N1CFP) - return DAG.getNode(ISD::FMUL, DL, VT, N0, N1); + if (SDValue C = DAG.FoldConstantArithmetic(ISD::FMUL, DL, VT, {N0, N1})) + return C; // canonicalize constant to RHS if (DAG.isConstantFPBuildVectorOrConstantFP(N0) && !DAG.isConstantFPBuildVectorOrConstantFP(N1)) return DAG.getNode(ISD::FMUL, DL, VT, N1, N0); + // fold vector ops + if (VT.isVector()) + if (SDValue FoldedVOp = SimplifyVBinOp(N, DL)) + return FoldedVOp; + if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; @@ -14495,8 +14531,6 @@ SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) { SDValue DAGCombiner::visitFDIV(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0); - ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1); EVT VT = N->getValueType(0); SDLoc DL(N); const TargetOptions &Options = DAG.getTarget().Options; @@ -14506,15 +14540,15 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags)) return R; + // fold (fdiv c1, c2) -> c1/c2 + if (SDValue C = DAG.FoldConstantArithmetic(ISD::FDIV, DL, VT, {N0, N1})) + return C; + // fold vector ops if (VT.isVector()) if (SDValue FoldedVOp = SimplifyVBinOp(N, DL)) return FoldedVOp; - // fold (fdiv c1, c2) -> c1/c2 - if (N0CFP && N1CFP) - return DAG.getNode(ISD::FDIV, SDLoc(N), VT, N0, N1); - if 
(SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; @@ -14523,7 +14557,7 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { if (Options.UnsafeFPMath || Flags.hasAllowReciprocal()) { // fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable. - if (N1CFP) { + if (auto *N1CFP = dyn_cast<ConstantFPSDNode>(N1)) { // Compute the reciprocal 1.0 / c2. const APFloat &N1APF = N1CFP->getValueAPF(); APFloat Recip(N1APF.getSemantics(), 1); // 1.0 @@ -14639,8 +14673,6 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { SDValue DAGCombiner::visitFREM(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0); - ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1); EVT VT = N->getValueType(0); SDNodeFlags Flags = N->getFlags(); SelectionDAG::FlagInserter FlagsInserter(DAG, N); @@ -14649,9 +14681,9 @@ SDValue DAGCombiner::visitFREM(SDNode *N) { return R; // fold (frem c1, c2) -> fmod(c1,c2) - if (N0CFP && N1CFP) - return DAG.getNode(ISD::FREM, SDLoc(N), VT, N0, N1); - + if (SDValue C = DAG.FoldConstantArithmetic(ISD::FREM, SDLoc(N), VT, {N0, N1})) + return C; + if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; @@ -14712,12 +14744,12 @@ static inline bool CanCombineFCOPYSIGN_EXTEND_ROUND(SDNode *N) { SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - bool N0CFP = DAG.isConstantFPBuildVectorOrConstantFP(N0); - bool N1CFP = DAG.isConstantFPBuildVectorOrConstantFP(N1); EVT VT = N->getValueType(0); - if (N0CFP && N1CFP) // Constant fold - return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1); + // fold (fcopysign c1, c2) -> fcopysign(c1,c2) + if (SDValue C = + DAG.FoldConstantArithmetic(ISD::FCOPYSIGN, SDLoc(N), VT, {N0, N1})) + return C; if (ConstantFPSDNode *N1C = isConstOrConstSplatFP(N->getOperand(1))) { const APFloat &V = N1C->getValueAPF(); @@ -14835,14 +14867,6 @@ SDValue DAGCombiner::visitFPOW(SDNode *N) { static 
SDValue foldFPToIntToFP(SDNode *N, SelectionDAG &DAG, const TargetLowering &TLI) { - // This optimization is guarded by a function attribute because it may produce - // unexpected results. Ie, programs may be relying on the platform-specific - // undefined behavior when the float-to-int conversion overflows. - const Function &F = DAG.getMachineFunction().getFunction(); - Attribute StrictOverflow = F.getFnAttribute("strict-float-cast-overflow"); - if (StrictOverflow.getValueAsString().equals("false")) - return SDValue(); - // We only do this if the target has legal ftrunc. Otherwise, we'd likely be // replacing casts with a libcall. We also must be allowed to ignore -0.0 // because FTRUNC will return -0.0 for (-1.0, -0.0), but using integer @@ -15216,31 +15240,26 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) { return SDValue(); } -static SDValue visitFMinMax(SelectionDAG &DAG, SDNode *N, - APFloat (*Op)(const APFloat &, const APFloat &)) { +SDValue DAGCombiner::visitFMinMax(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); - const ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0); - const ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1); const SDNodeFlags Flags = N->getFlags(); unsigned Opc = N->getOpcode(); bool PropagatesNaN = Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM; bool IsMin = Opc == ISD::FMINNUM || Opc == ISD::FMINIMUM; SelectionDAG::FlagInserter FlagsInserter(DAG, N); - if (N0CFP && N1CFP) { - const APFloat &C0 = N0CFP->getValueAPF(); - const APFloat &C1 = N1CFP->getValueAPF(); - return DAG.getConstantFP(Op(C0, C1), SDLoc(N), VT); - } + // Constant fold. + if (SDValue C = DAG.FoldConstantArithmetic(Opc, SDLoc(N), VT, {N0, N1})) + return C; // Canonicalize to constant on RHS. 
if (DAG.isConstantFPBuildVectorOrConstantFP(N0) && !DAG.isConstantFPBuildVectorOrConstantFP(N1)) return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0); - if (N1CFP) { + if (const ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1)) { const APFloat &AF = N1CFP->getValueAPF(); // minnum(X, nan) -> X @@ -15272,22 +15291,6 @@ static SDValue visitFMinMax(SelectionDAG &DAG, SDNode *N, return SDValue(); } -SDValue DAGCombiner::visitFMINNUM(SDNode *N) { - return visitFMinMax(DAG, N, minnum); -} - -SDValue DAGCombiner::visitFMAXNUM(SDNode *N) { - return visitFMinMax(DAG, N, maxnum); -} - -SDValue DAGCombiner::visitFMINIMUM(SDNode *N) { - return visitFMinMax(DAG, N, minimum); -} - -SDValue DAGCombiner::visitFMAXIMUM(SDNode *N) { - return visitFMinMax(DAG, N, maximum); -} - SDValue DAGCombiner::visitFABS(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); @@ -18392,8 +18395,8 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { if (StoreSDNode *ST1 = dyn_cast<StoreSDNode>(Chain)) { if (ST->isUnindexed() && ST->isSimple() && ST1->isUnindexed() && ST1->isSimple()) { - if (ST1->getBasePtr() == Ptr && ST1->getValue() == Value && - ST->getMemoryVT() == ST1->getMemoryVT() && + if (OptLevel != CodeGenOpt::None && ST1->getBasePtr() == Ptr && + ST1->getValue() == Value && ST->getMemoryVT() == ST1->getMemoryVT() && ST->getAddressSpace() == ST1->getAddressSpace()) { // If this is a store followed by a store with the same value to the // same location, then the store is dead/noop. @@ -20727,6 +20730,156 @@ static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) { return NewLd; } +/// Given EXTRACT_SUBVECTOR(VECTOR_SHUFFLE(Op0, Op1, Mask)), +/// try to produce VECTOR_SHUFFLE(EXTRACT_SUBVECTOR(Op?, ?), +/// EXTRACT_SUBVECTOR(Op?, ?), +/// Mask')) +/// iff it is legal and profitable to do so. Notably, the trimmed mask +/// (containing only the elements that are extracted) +/// must reference at most two subvectors. 
+static SDValue foldExtractSubvectorFromShuffleVector(SDNode *N, + SelectionDAG &DAG, + const TargetLowering &TLI, + bool LegalOperations) { + assert(N->getOpcode() == ISD::EXTRACT_SUBVECTOR && + "Must only be called on EXTRACT_SUBVECTOR's"); + + SDValue N0 = N->getOperand(0); + + // Only deal with non-scalable vectors. + EVT NarrowVT = N->getValueType(0); + EVT WideVT = N0.getValueType(); + if (!NarrowVT.isFixedLengthVector() || !WideVT.isFixedLengthVector()) + return SDValue(); + + // The operand must be a shufflevector. + auto *WideShuffleVector = dyn_cast<ShuffleVectorSDNode>(N0); + if (!WideShuffleVector) + return SDValue(); + + // The old shuffleneeds to go away. + if (!WideShuffleVector->hasOneUse()) + return SDValue(); + + // And the narrow shufflevector that we'll form must be legal. + if (LegalOperations && + !TLI.isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, NarrowVT)) + return SDValue(); + + uint64_t FirstExtractedEltIdx = N->getConstantOperandVal(1); + int NumEltsExtracted = NarrowVT.getVectorNumElements(); + assert((FirstExtractedEltIdx % NumEltsExtracted) == 0 && + "Extract index is not a multiple of the output vector length."); + + int WideNumElts = WideVT.getVectorNumElements(); + + SmallVector<int, 16> NewMask; + NewMask.reserve(NumEltsExtracted); + SmallSetVector<std::pair<SDValue /*Op*/, int /*SubvectorIndex*/>, 2> + DemandedSubvectors; + + // Try to decode the wide mask into narrow mask from at most two subvectors. + for (int M : WideShuffleVector->getMask().slice(FirstExtractedEltIdx, + NumEltsExtracted)) { + assert((M >= -1) && (M < (2 * WideNumElts)) && + "Out-of-bounds shuffle mask?"); + + if (M < 0) { + // Does not depend on operands, does not require adjustment. + NewMask.emplace_back(M); + continue; + } + + // From which operand of the shuffle does this shuffle mask element pick? + int WideShufOpIdx = M / WideNumElts; + // Which element of that operand is picked? 
+ int OpEltIdx = M % WideNumElts; + + assert((OpEltIdx + WideShufOpIdx * WideNumElts) == M && + "Shuffle mask vector decomposition failure."); + + // And which NumEltsExtracted-sized subvector of that operand is that? + int OpSubvecIdx = OpEltIdx / NumEltsExtracted; + // And which element within that subvector of that operand is that? + int OpEltIdxInSubvec = OpEltIdx % NumEltsExtracted; + + assert((OpEltIdxInSubvec + OpSubvecIdx * NumEltsExtracted) == OpEltIdx && + "Shuffle mask subvector decomposition failure."); + + assert((OpEltIdxInSubvec + OpSubvecIdx * NumEltsExtracted + + WideShufOpIdx * WideNumElts) == M && + "Shuffle mask full decomposition failure."); + + SDValue Op = WideShuffleVector->getOperand(WideShufOpIdx); + + if (Op.isUndef()) { + // Picking from an undef operand. Let's adjust mask instead. + NewMask.emplace_back(-1); + continue; + } + + // Profitability check: only deal with extractions from the first subvector. + if (OpSubvecIdx != 0) + return SDValue(); + + const std::pair<SDValue, int> DemandedSubvector = + std::make_pair(Op, OpSubvecIdx); + + if (DemandedSubvectors.insert(DemandedSubvector)) { + if (DemandedSubvectors.size() > 2) + return SDValue(); // We can't handle more than two subvectors. + // How many elements into the WideVT does this subvector start? + int Index = NumEltsExtracted * OpSubvecIdx; + // Bail out if the extraction isn't going to be cheap. + if (!TLI.isExtractSubvectorCheap(NarrowVT, WideVT, Index)) + return SDValue(); + } + + // Ok, but from which operand of the new shuffle will this element pick? 
+ int NewOpIdx = + getFirstIndexOf(DemandedSubvectors.getArrayRef(), DemandedSubvector); + assert((NewOpIdx == 0 || NewOpIdx == 1) && "Unexpected operand index."); + + int AdjM = OpEltIdxInSubvec + NewOpIdx * NumEltsExtracted; + NewMask.emplace_back(AdjM); + } + assert(NewMask.size() == (unsigned)NumEltsExtracted && "Produced bad mask."); + assert(DemandedSubvectors.size() <= 2 && + "Should have ended up demanding at most two subvectors."); + + // Did we discover that the shuffle does not actually depend on operands? + if (DemandedSubvectors.empty()) + return DAG.getUNDEF(NarrowVT); + + // We still perform the exact same EXTRACT_SUBVECTOR, just on different + // operand[s]/index[es], so there is no point in checking for it's legality. + + // Do not turn a legal shuffle into an illegal one. + if (TLI.isShuffleMaskLegal(WideShuffleVector->getMask(), WideVT) && + !TLI.isShuffleMaskLegal(NewMask, NarrowVT)) + return SDValue(); + + SDLoc DL(N); + + SmallVector<SDValue, 2> NewOps; + for (const std::pair<SDValue /*Op*/, int /*SubvectorIndex*/> + &DemandedSubvector : DemandedSubvectors) { + // How many elements into the WideVT does this subvector start? + int Index = NumEltsExtracted * DemandedSubvector.second; + SDValue IndexC = DAG.getVectorIdxConstant(Index, DL); + NewOps.emplace_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowVT, + DemandedSubvector.first, IndexC)); + } + assert((NewOps.size() == 1 || NewOps.size() == 2) && + "Should end up with either one or two ops"); + + // If we ended up with only one operand, pad with an undef. 
+ if (NewOps.size() == 1) + NewOps.emplace_back(DAG.getUNDEF(NarrowVT)); + + return DAG.getVectorShuffle(NarrowVT, DL, NewOps[0], NewOps[1], NewMask); +} + SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) { EVT NVT = N->getValueType(0); SDValue V = N->getOperand(0); @@ -20840,6 +20993,10 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) { } } + if (SDValue V = + foldExtractSubvectorFromShuffleVector(N, DAG, TLI, LegalOperations)) + return V; + V = peekThroughBitcasts(V); // If the input is a build vector. Try to make a smaller build vector. @@ -22424,15 +22581,9 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N, const SDLoc &DL) { SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); - SDValue Ops[] = {LHS, RHS}; unsigned Opcode = N->getOpcode(); SDNodeFlags Flags = N->getFlags(); - // See if we can constant fold the vector operation. - if (SDValue Fold = DAG.FoldConstantArithmetic(Opcode, SDLoc(LHS), - LHS.getValueType(), Ops)) - return Fold; - // Move unary shuffles with identical masks after a vector binop: // VBinOp (shuffle A, Undef, Mask), (shuffle B, Undef, Mask)) // --> shuffle (VBinOp A, B), Undef, Mask diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 08598eeded7a..5dfb65ef131a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -3367,13 +3367,13 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { } case ISD::FSHL: case ISD::FSHR: - if (TLI.expandFunnelShift(Node, Tmp1, DAG)) - Results.push_back(Tmp1); + if (SDValue Expanded = TLI.expandFunnelShift(Node, DAG)) + Results.push_back(Expanded); break; case ISD::ROTL: case ISD::ROTR: - if (TLI.expandROT(Node, true /*AllowVectorOps*/, Tmp1, DAG)) - Results.push_back(Tmp1); + if (SDValue Expanded = TLI.expandROT(Node, true /*AllowVectorOps*/, DAG)) + Results.push_back(Expanded); break; case ISD::SADDSAT: case ISD::UADDSAT: diff --git 
a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 1fa4d88fcb4a..518e525e13d0 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -1277,8 +1277,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SRL(SDNode *N, bool IsVP) { SDValue DAGTypeLegalizer::PromoteIntRes_Rotate(SDNode *N) { // Lower the rotate to shifts and ORs which can be promoted. - SDValue Res; - TLI.expandROT(N, true /*AllowVectorOps*/, Res, DAG); + SDValue Res = TLI.expandROT(N, true /*AllowVectorOps*/, DAG); ReplaceValueWith(SDValue(N, 0), Res); return SDValue(); } @@ -1286,7 +1285,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Rotate(SDNode *N) { SDValue DAGTypeLegalizer::PromoteIntRes_FunnelShift(SDNode *N) { SDValue Hi = GetPromotedInteger(N->getOperand(0)); SDValue Lo = GetPromotedInteger(N->getOperand(1)); - SDValue Amount = GetPromotedInteger(N->getOperand(2)); + SDValue Amt = GetPromotedInteger(N->getOperand(2)); SDLoc DL(N); EVT OldVT = N->getOperand(0).getValueType(); @@ -1297,21 +1296,20 @@ SDValue DAGTypeLegalizer::PromoteIntRes_FunnelShift(SDNode *N) { unsigned NewBits = VT.getScalarSizeInBits(); // Amount has to be interpreted modulo the old bit width. - Amount = - DAG.getNode(ISD::UREM, DL, VT, Amount, DAG.getConstant(OldBits, DL, VT)); + Amt = DAG.getNode(ISD::UREM, DL, VT, Amt, DAG.getConstant(OldBits, DL, VT)); // If the promoted type is twice the size (or more), then we use the // traditional funnel 'double' shift codegen. This isn't necessary if the // shift amount is constant. // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z % bw)) >> bw. // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z % bw)). 
- if (NewBits >= (2 * OldBits) && !isa<ConstantSDNode>(Amount) && + if (NewBits >= (2 * OldBits) && !isa<ConstantSDNode>(Amt) && !TLI.isOperationLegalOrCustom(Opcode, VT)) { SDValue HiShift = DAG.getConstant(OldBits, DL, VT); Hi = DAG.getNode(ISD::SHL, DL, VT, Hi, HiShift); Lo = DAG.getZeroExtendInReg(Lo, DL, OldVT); SDValue Res = DAG.getNode(ISD::OR, DL, VT, Hi, Lo); - Res = DAG.getNode(IsFSHR ? ISD::SRL : ISD::SHL, DL, VT, Res, Amount); + Res = DAG.getNode(IsFSHR ? ISD::SRL : ISD::SHL, DL, VT, Res, Amt); if (!IsFSHR) Res = DAG.getNode(ISD::SRL, DL, VT, Res, HiShift); return Res; @@ -1324,9 +1322,9 @@ SDValue DAGTypeLegalizer::PromoteIntRes_FunnelShift(SDNode *N) { // Increase Amount to shift the result into the lower bits of the promoted // type. if (IsFSHR) - Amount = DAG.getNode(ISD::ADD, DL, VT, Amount, ShiftOffset); + Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, ShiftOffset); - return DAG.getNode(Opcode, DL, VT, Hi, Lo, Amount); + return DAG.getNode(Opcode, DL, VT, Hi, Lo, Amt); } SDValue DAGTypeLegalizer::PromoteIntRes_TRUNCATE(SDNode *N) { diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp index 98312f91d8c0..03dcd0f6d2c9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp @@ -83,7 +83,7 @@ void DAGTypeLegalizer::PerformExpensiveChecks() { SDValue Res(&Node, i); bool Failed = false; // Don't create a value in map. - auto ResId = (ValueToIdMap.count(Res)) ? 
ValueToIdMap[Res] : 0; + auto ResId = ValueToIdMap.lookup(Res); unsigned Mapped = 0; if (ResId && (ReplacedValues.find(ResId) != ReplacedValues.end())) { @@ -301,7 +301,7 @@ ScanOperands: if (IgnoreNodeResults(N->getOperand(i).getNode())) continue; - const auto Op = N->getOperand(i); + const auto &Op = N->getOperand(i); LLVM_DEBUG(dbgs() << "Analyzing operand: "; Op.dump(&DAG)); EVT OpVT = Op.getValueType(); switch (getTypeAction(OpVT)) { @@ -1007,11 +1007,7 @@ SDValue DAGTypeLegalizer::JoinIntegers(SDValue Lo, SDValue Hi) { /// /// ValVT is the type of values that produced the boolean. SDValue DAGTypeLegalizer::PromoteTargetBoolean(SDValue Bool, EVT ValVT) { - SDLoc dl(Bool); - EVT BoolVT = getSetCCResultType(ValVT); - ISD::NodeType ExtendCode = - TargetLowering::getExtendForContent(TLI.getBooleanContents(ValVT)); - return DAG.getNode(ExtendCode, dl, BoolVT, Bool); + return TLI.promoteTargetBoolean(DAG, Bool, ValVT); } /// Return the lower LoVT bits of Op in Lo and the upper HiVT bits in Hi. 
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 88a28a3be53e..1493f36fcd3e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -254,69 +254,6 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { SDNode *Node = DAG.UpdateNodeOperands(Op.getNode(), Ops); - if (Op.getOpcode() == ISD::LOAD) { - LoadSDNode *LD = cast<LoadSDNode>(Node); - ISD::LoadExtType ExtType = LD->getExtensionType(); - if (LD->getMemoryVT().isVector() && ExtType != ISD::NON_EXTLOAD) { - LLVM_DEBUG(dbgs() << "\nLegalizing extending vector load: "; - Node->dump(&DAG)); - switch (TLI.getLoadExtAction(LD->getExtensionType(), LD->getValueType(0), - LD->getMemoryVT())) { - default: llvm_unreachable("This action is not supported yet!"); - case TargetLowering::Legal: - return TranslateLegalizeResults(Op, Node); - case TargetLowering::Custom: { - SmallVector<SDValue, 2> ResultVals; - if (LowerOperationWrapper(Node, ResultVals)) { - if (ResultVals.empty()) - return TranslateLegalizeResults(Op, Node); - - Changed = true; - return RecursivelyLegalizeResults(Op, ResultVals); - } - LLVM_FALLTHROUGH; - } - case TargetLowering::Expand: { - Changed = true; - std::pair<SDValue, SDValue> Tmp = ExpandLoad(Node); - AddLegalizedOperand(Op.getValue(0), Tmp.first); - AddLegalizedOperand(Op.getValue(1), Tmp.second); - return Op.getResNo() ? 
Tmp.first : Tmp.second; - } - } - } - } else if (Op.getOpcode() == ISD::STORE) { - StoreSDNode *ST = cast<StoreSDNode>(Node); - EVT StVT = ST->getMemoryVT(); - MVT ValVT = ST->getValue().getSimpleValueType(); - if (StVT.isVector() && ST->isTruncatingStore()) { - LLVM_DEBUG(dbgs() << "\nLegalizing truncating vector store: "; - Node->dump(&DAG)); - switch (TLI.getTruncStoreAction(ValVT, StVT)) { - default: llvm_unreachable("This action is not supported yet!"); - case TargetLowering::Legal: - return TranslateLegalizeResults(Op, Node); - case TargetLowering::Custom: { - SmallVector<SDValue, 1> ResultVals; - if (LowerOperationWrapper(Node, ResultVals)) { - if (ResultVals.empty()) - return TranslateLegalizeResults(Op, Node); - - Changed = true; - return RecursivelyLegalizeResults(Op, ResultVals); - } - LLVM_FALLTHROUGH; - } - case TargetLowering::Expand: { - Changed = true; - SDValue Chain = ExpandStore(Node); - AddLegalizedOperand(Op, Chain); - return Chain; - } - } - } - } - bool HasVectorValueOrOp = llvm::any_of(Node->values(), [](EVT T) { return T.isVector(); }) || llvm::any_of(Node->op_values(), @@ -329,6 +266,22 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { switch (Op.getOpcode()) { default: return TranslateLegalizeResults(Op, Node); + case ISD::LOAD: { + LoadSDNode *LD = cast<LoadSDNode>(Node); + ISD::LoadExtType ExtType = LD->getExtensionType(); + EVT LoadedVT = LD->getMemoryVT(); + if (LoadedVT.isVector() && ExtType != ISD::NON_EXTLOAD) + Action = TLI.getLoadExtAction(ExtType, LD->getValueType(0), LoadedVT); + break; + } + case ISD::STORE: { + StoreSDNode *ST = cast<StoreSDNode>(Node); + EVT StVT = ST->getMemoryVT(); + MVT ValVT = ST->getValue().getSimpleValueType(); + if (StVT.isVector() && ST->isTruncatingStore()) + Action = TLI.getTruncStoreAction(ValVT, StVT); + break; + } case ISD::MERGE_VALUES: Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); // This operation lies about being legal: when it claims to be legal, @@ -512,6 
+465,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { switch (Action) { default: llvm_unreachable("This action is not supported yet!"); case TargetLowering::Promote: + assert((Op.getOpcode() != ISD::LOAD && Op.getOpcode() != ISD::STORE) && + "This action is not supported yet!"); LLVM_DEBUG(dbgs() << "Promoting\n"); Promote(Node, ResultVals); assert(!ResultVals.empty() && "No results for promotion?"); @@ -731,8 +686,16 @@ SDValue VectorLegalizer::ExpandStore(SDNode *N) { } void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) { - SDValue Tmp; switch (Node->getOpcode()) { + case ISD::LOAD: { + std::pair<SDValue, SDValue> Tmp = ExpandLoad(Node); + Results.push_back(Tmp.first); + Results.push_back(Tmp.second); + return; + } + case ISD::STORE: + Results.push_back(ExpandStore(Node)); + return; case ISD::MERGE_VALUES: for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i) Results.push_back(Node->getOperand(i)); @@ -804,15 +767,15 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) { break; case ISD::FSHL: case ISD::FSHR: - if (TLI.expandFunnelShift(Node, Tmp, DAG)) { - Results.push_back(Tmp); + if (SDValue Expanded = TLI.expandFunnelShift(Node, DAG)) { + Results.push_back(Expanded); return; } break; case ISD::ROTL: case ISD::ROTR: - if (TLI.expandROT(Node, false /*AllowVectorOps*/, Tmp, DAG)) { - Results.push_back(Tmp); + if (SDValue Expanded = TLI.expandROT(Node, false /*AllowVectorOps*/, DAG)) { + Results.push_back(Expanded); return; } break; diff --git a/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp b/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp index 2695ed36991c..3d5c4c5b1cae 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp @@ -168,10 +168,9 @@ void ResourcePriorityQueue::initNodes(std::vector<SUnit> &sunits) { SUnits = &sunits; NumNodesSolelyBlocking.resize(SUnits->size(), 0); - for (unsigned i = 0, e 
= SUnits->size(); i != e; ++i) { - SUnit *SU = &(*SUnits)[i]; - initNumRegDefsLeft(SU); - SU->NodeQueueId = 0; + for (SUnit &SU : *SUnits) { + initNumRegDefsLeft(&SU); + SU.NodeQueueId = 0; } } diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index 84e6d2a16422..aec2cf38b400 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -442,33 +442,32 @@ void ScheduleDAGSDNodes::AddSchedEdges() { bool UnitLatencies = forceUnitLatencies(); // Pass 2: add the preds, succs, etc. - for (unsigned su = 0, e = SUnits.size(); su != e; ++su) { - SUnit *SU = &SUnits[su]; - SDNode *MainNode = SU->getNode(); + for (SUnit &SU : SUnits) { + SDNode *MainNode = SU.getNode(); if (MainNode->isMachineOpcode()) { unsigned Opc = MainNode->getMachineOpcode(); const MCInstrDesc &MCID = TII->get(Opc); for (unsigned i = 0; i != MCID.getNumOperands(); ++i) { if (MCID.getOperandConstraint(i, MCOI::TIED_TO) != -1) { - SU->isTwoAddress = true; + SU.isTwoAddress = true; break; } } if (MCID.isCommutable()) - SU->isCommutable = true; + SU.isCommutable = true; } // Find all predecessors and successors of the group. - for (SDNode *N = SU->getNode(); N; N = N->getGluedNode()) { + for (SDNode *N = SU.getNode(); N; N = N->getGluedNode()) { if (N->isMachineOpcode() && TII->get(N->getMachineOpcode()).getImplicitDefs()) { - SU->hasPhysRegClobbers = true; + SU.hasPhysRegClobbers = true; unsigned NumUsed = InstrEmitter::CountResults(N); while (NumUsed != 0 && !N->hasAnyUseOfValue(NumUsed - 1)) --NumUsed; // Skip over unused values at the end. if (NumUsed > TII->get(N->getMachineOpcode()).getNumDefs()) - SU->hasPhysRegDefs = true; + SU.hasPhysRegDefs = true; } for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { @@ -477,7 +476,8 @@ void ScheduleDAGSDNodes::AddSchedEdges() { if (isPassiveNode(OpN)) continue; // Not scheduled. 
SUnit *OpSU = &SUnits[OpN->getNodeId()]; assert(OpSU && "Node has no SUnit!"); - if (OpSU == SU) continue; // In the same group. + if (OpSU == &SU) + continue; // In the same group. EVT OpVT = N->getOperand(i).getValueType(); assert(OpVT != MVT::Glue && "Glued nodes should be in same sunit!"); @@ -508,10 +508,10 @@ void ScheduleDAGSDNodes::AddSchedEdges() { Dep.setLatency(OpLatency); if (!isChain && !UnitLatencies) { computeOperandLatency(OpN, N, i, Dep); - ST.adjustSchedDependency(OpSU, DefIdx, SU, i, Dep); + ST.adjustSchedDependency(OpSU, DefIdx, &SU, i, Dep); } - if (!SU->addPred(Dep) && !Dep.isCtrl() && OpSU->NumRegDefsLeft > 1) { + if (!SU.addPred(Dep) && !Dep.isCtrl() && OpSU->NumRegDefsLeft > 1) { // Multiple register uses are combined in the same SUnit. For example, // we could have a set of glued nodes with all their defs consumed by // another set of glued nodes. Register pressure tracking sees this as @@ -721,10 +721,7 @@ void ScheduleDAGSDNodes::dumpSchedule() const { /// void ScheduleDAGSDNodes::VerifyScheduledSequence(bool isBottomUp) { unsigned ScheduledNodes = ScheduleDAG::VerifyScheduledDAG(isBottomUp); - unsigned Noops = 0; - for (unsigned i = 0, e = Sequence.size(); i != e; ++i) - if (!Sequence[i]) - ++Noops; + unsigned Noops = llvm::count(Sequence, nullptr); assert(Sequence.size() - Noops == ScheduledNodes && "The number of nodes scheduled doesn't match the expected number!"); } @@ -911,8 +908,7 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) { } } - for (unsigned i = 0, e = Sequence.size(); i != e; i++) { - SUnit *SU = Sequence[i]; + for (SUnit *SU : Sequence) { if (!SU) { // Null SUnit* is a noop. 
TII->insertNoop(*Emitter.getBlock(), InsertPos); diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp index 540a6e3efbe1..10940478010e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp @@ -169,11 +169,11 @@ void ScheduleDAGVLIW::listScheduleTopDown() { releaseSuccessors(&EntrySU); // All leaves to AvailableQueue. - for (unsigned i = 0, e = SUnits.size(); i != e; ++i) { + for (SUnit &SU : SUnits) { // It is available if it has no predecessors. - if (SUnits[i].Preds.empty()) { - AvailableQueue->push(&SUnits[i]); - SUnits[i].isAvailable = true; + if (SU.Preds.empty()) { + AvailableQueue->push(&SU); + SU.isAvailable = true; } } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index c282e03387dd..2ae0d4df7b77 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -2499,7 +2499,8 @@ bool SelectionDAG::MaskedValueIsAllOnes(SDValue V, const APInt &Mask, /// sense to specify which elements are demanded or undefined, therefore /// they are simply ignored. bool SelectionDAG::isSplatValue(SDValue V, const APInt &DemandedElts, - APInt &UndefElts, unsigned Depth) { + APInt &UndefElts, unsigned Depth) const { + unsigned Opcode = V.getOpcode(); EVT VT = V.getValueType(); assert(VT.isVector() && "Vector type expected"); @@ -2511,7 +2512,7 @@ bool SelectionDAG::isSplatValue(SDValue V, const APInt &DemandedElts, // Deal with some common cases here that work for both fixed and scalable // vector types. - switch (V.getOpcode()) { + switch (Opcode) { case ISD::SPLAT_VECTOR: UndefElts = V.getOperand(0).isUndef() ? 
APInt::getAllOnes(DemandedElts.getBitWidth()) @@ -2537,7 +2538,12 @@ bool SelectionDAG::isSplatValue(SDValue V, const APInt &DemandedElts, case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: return isSplatValue(V.getOperand(0), DemandedElts, UndefElts, Depth + 1); - } + default: + if (Opcode >= ISD::BUILTIN_OP_END || Opcode == ISD::INTRINSIC_WO_CHAIN || + Opcode == ISD::INTRINSIC_W_CHAIN || Opcode == ISD::INTRINSIC_VOID) + return TLI->isSplatValueForTargetNode(V, DemandedElts, UndefElts, Depth); + break; +} // We don't support other cases than those above for scalable vectors at // the moment. @@ -2548,7 +2554,7 @@ bool SelectionDAG::isSplatValue(SDValue V, const APInt &DemandedElts, assert(NumElts == DemandedElts.getBitWidth() && "Vector size mismatch"); UndefElts = APInt::getZero(NumElts); - switch (V.getOpcode()) { + switch (Opcode) { case ISD::BUILD_VECTOR: { SDValue Scl; for (unsigned i = 0; i != NumElts; ++i) { @@ -2600,13 +2606,30 @@ bool SelectionDAG::isSplatValue(SDValue V, const APInt &DemandedElts, } break; } + case ISD::ANY_EXTEND_VECTOR_INREG: + case ISD::SIGN_EXTEND_VECTOR_INREG: + case ISD::ZERO_EXTEND_VECTOR_INREG: { + // Widen the demanded elts by the src element count. + SDValue Src = V.getOperand(0); + // We don't support scalable vectors at the moment. + if (Src.getValueType().isScalableVector()) + return false; + unsigned NumSrcElts = Src.getValueType().getVectorNumElements(); + APInt UndefSrcElts; + APInt DemandedSrcElts = DemandedElts.zextOrSelf(NumSrcElts); + if (isSplatValue(Src, DemandedSrcElts, UndefSrcElts, Depth + 1)) { + UndefElts = UndefSrcElts.truncOrSelf(NumElts); + return true; + } + break; + } } return false; } /// Helper wrapper to main isSplatValue function. 
-bool SelectionDAG::isSplatValue(SDValue V, bool AllowUndefs) { +bool SelectionDAG::isSplatValue(SDValue V, bool AllowUndefs) const { EVT VT = V.getValueType(); assert(VT.isVector() && "Vector type expected"); @@ -5291,9 +5314,10 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, if (isUndef(Opcode, Ops)) return getUNDEF(VT); - // Handle the case of two scalars. + // Handle binops special cases. if (NumOps == 2) { - // TODO: Move foldConstantFPMath here? + if (SDValue CFP = foldConstantFPMath(Opcode, DL, VT, Ops[0], Ops[1])) + return CFP; if (auto *C1 = dyn_cast<ConstantSDNode>(Ops[0])) { if (auto *C2 = dyn_cast<ConstantSDNode>(Ops[1])) { @@ -5463,10 +5487,11 @@ SDValue SelectionDAG::foldConstantFPMath(unsigned Opcode, const SDLoc &DL, // should. That will require dealing with a potentially non-default // rounding mode, checking the "opStatus" return value from the APFloat // math calculations, and possibly other variations. - auto *N1CFP = dyn_cast<ConstantFPSDNode>(N1.getNode()); - auto *N2CFP = dyn_cast<ConstantFPSDNode>(N2.getNode()); + ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1, /*AllowUndefs*/ false); + ConstantFPSDNode *N2CFP = isConstOrConstSplatFP(N2, /*AllowUndefs*/ false); if (N1CFP && N2CFP) { - APFloat C1 = N1CFP->getValueAPF(), C2 = N2CFP->getValueAPF(); + APFloat C1 = N1CFP->getValueAPF(); // make copy + const APFloat &C2 = N2CFP->getValueAPF(); switch (Opcode) { case ISD::FADD: C1.add(C2, APFloat::rmNearestTiesToEven); @@ -5486,6 +5511,14 @@ SDValue SelectionDAG::foldConstantFPMath(unsigned Opcode, const SDLoc &DL, case ISD::FCOPYSIGN: C1.copySign(C2); return getConstantFP(C1, DL, VT); + case ISD::FMINNUM: + return getConstantFP(minnum(C1, C2), DL, VT); + case ISD::FMAXNUM: + return getConstantFP(maxnum(C1, C2), DL, VT); + case ISD::FMINIMUM: + return getConstantFP(minimum(C1, C2), DL, VT); + case ISD::FMAXIMUM: + return getConstantFP(maximum(C1, C2), DL, VT); default: break; } } @@ -5502,8 +5535,9 @@ SDValue 
SelectionDAG::foldConstantFPMath(unsigned Opcode, const SDLoc &DL, switch (Opcode) { case ISD::FSUB: // -0.0 - undef --> undef (consistent with "fneg undef") - if (N1CFP && N1CFP->getValueAPF().isNegZero() && N2.isUndef()) - return getUNDEF(VT); + if (ConstantFPSDNode *N1C = isConstOrConstSplatFP(N1, /*AllowUndefs*/ true)) + if (N1C && N1C->getValueAPF().isNegZero() && N2.isUndef()) + return getUNDEF(VT); LLVM_FALLTHROUGH; case ISD::FADD: @@ -5962,9 +5996,6 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, if (SDValue SV = FoldConstantArithmetic(Opcode, DL, VT, {N1, N2})) return SV; - if (SDValue V = foldConstantFPMath(Opcode, DL, VT, N1, N2)) - return V; - // Canonicalize an UNDEF to the RHS, even over a constant. if (N1.isUndef()) { if (TLI->isCommutativeBinOp(Opcode)) { diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 7726a0007e44..63cd723cf6da 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -1036,7 +1036,6 @@ void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis *aa, AA = aa; GFI = gfi; LibInfo = li; - DL = &DAG.getDataLayout(); Context = DAG.getContext(); LPadToCallSiteMap.clear(); SL->init(DAG.getTargetLoweringInfo(), TM, DAG.getDataLayout()); @@ -1626,6 +1625,9 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { if (const auto *Equiv = dyn_cast<DSOLocalEquivalent>(C)) return getValue(Equiv->getGlobalValue()); + if (const auto *NC = dyn_cast<NoCFIValue>(C)) + return getValue(NC->getGlobalValue()); + VectorType *VecTy = cast<VectorType>(V->getType()); // Now that we know the number and type of the elements, get that number of @@ -1921,8 +1923,8 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) { DAG.getDataLayout().getAllocaAddrSpace()), PtrValueVTs); - SDValue RetPtr = DAG.getCopyFromReg(DAG.getEntryNode(), getCurSDLoc(), - DemoteReg, 
PtrValueVTs[0]); + SDValue RetPtr = + DAG.getCopyFromReg(Chain, getCurSDLoc(), DemoteReg, PtrValueVTs[0]); SDValue RetOp = getValue(I.getOperand(0)); SmallVector<EVT, 4> ValueVTs, MemVTs; @@ -2657,7 +2659,8 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD, SDLoc dl = getCurSDLoc(); SDValue StackSlotPtr = DAG.getFrameIndex(FI, PtrTy); const Module &M = *ParentBB->getParent()->getFunction().getParent(); - Align Align = DL->getPrefTypeAlign(Type::getInt8PtrTy(M.getContext())); + Align Align = + DAG.getDataLayout().getPrefTypeAlign(Type::getInt8PtrTy(M.getContext())); // Generate code to load the content of the guard slot. SDValue GuardVal = DAG.getLoad( @@ -3058,14 +3061,14 @@ void SelectionDAGBuilder::visitLandingPad(const LandingPadInst &LP) { void SelectionDAGBuilder::UpdateSplitBlock(MachineBasicBlock *First, MachineBasicBlock *Last) { // Update JTCases. - for (unsigned i = 0, e = SL->JTCases.size(); i != e; ++i) - if (SL->JTCases[i].first.HeaderBB == First) - SL->JTCases[i].first.HeaderBB = Last; + for (JumpTableBlock &JTB : SL->JTCases) + if (JTB.first.HeaderBB == First) + JTB.first.HeaderBB = Last; // Update BitTestCases. 
- for (unsigned i = 0, e = SL->BitTestCases.size(); i != e; ++i) - if (SL->BitTestCases[i].Parent == First) - SL->BitTestCases[i].Parent = Last; + for (BitTestBlock &BTB : SL->BitTestCases) + if (BTB.Parent == First) + BTB.Parent = Last; } void SelectionDAGBuilder::visitIndirectBr(const IndirectBrInst &I) { @@ -3111,6 +3114,8 @@ void SelectionDAGBuilder::visitUnreachable(const UnreachableInst &I) { void SelectionDAGBuilder::visitUnary(const User &I, unsigned Opcode) { SDNodeFlags Flags; + if (auto *FPOp = dyn_cast<FPMathOperator>(&I)) + Flags.copyFMF(*FPOp); SDValue Op = getValue(I.getOperand(0)); SDValue UnNodeValue = DAG.getNode(Opcode, getCurSDLoc(), Op.getValueType(), @@ -3881,7 +3886,8 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) { unsigned Field = cast<Constant>(Idx)->getUniqueInteger().getZExtValue(); if (Field) { // N = N + Offset - uint64_t Offset = DL->getStructLayout(StTy)->getElementOffset(Field); + uint64_t Offset = + DAG.getDataLayout().getStructLayout(StTy)->getElementOffset(Field); // In an inbounds GEP with an offset that is nonnegative even when // interpreted as signed, assume there is no unsigned overflow. @@ -3898,7 +3904,8 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) { // (and fix up the result later). unsigned IdxSize = DAG.getDataLayout().getIndexSizeInBits(AS); MVT IdxTy = MVT::getIntegerVT(IdxSize); - TypeSize ElementSize = DL->getTypeAllocSize(GTI.getIndexedType()); + TypeSize ElementSize = + DAG.getDataLayout().getTypeAllocSize(GTI.getIndexedType()); // We intentionally mask away the high bits here; ElementSize may not // fit in IdxTy. APInt ElementMul(IdxSize, ElementSize.getKnownMinSize()); @@ -4788,7 +4795,7 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, } // Use TargetConstant instead of a regular constant for immarg. 
- EVT VT = TLI.getValueType(*DL, Arg->getType(), true); + EVT VT = TLI.getValueType(DAG.getDataLayout(), Arg->getType(), true); if (const ConstantInt *CI = dyn_cast<ConstantInt>(Arg)) { assert(CI->getBitWidth() <= 64 && "large intrinsic immediates not handled"); @@ -6571,7 +6578,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, } else { EVT PtrTy = TLI.getValueType(DAG.getDataLayout(), I.getType()); const Value *Global = TLI.getSDagStackGuard(M); - Align Align = DL->getPrefTypeAlign(Global->getType()); + Align Align = DAG.getDataLayout().getPrefTypeAlign(Global->getType()); Res = DAG.getLoad(PtrTy, sdl, Chain, getValue(Global), MachinePointerInfo(Global, 0), Align, MachineMemOperand::MOVolatile); @@ -7127,12 +7134,10 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, } SDValue VectorStep = DAG.getStepVector(sdl, VecTy); SDValue VectorInduction = DAG.getNode( - ISD::UADDO, sdl, DAG.getVTList(VecTy, CCVT), VectorIndex, VectorStep); - SDValue SetCC = DAG.getSetCC(sdl, CCVT, VectorInduction.getValue(0), + ISD::UADDSAT, sdl, VecTy, VectorIndex, VectorStep); + SDValue SetCC = DAG.getSetCC(sdl, CCVT, VectorInduction, VectorTripCount, ISD::CondCode::SETULT); - setValue(&I, DAG.getNode(ISD::AND, sdl, CCVT, - DAG.getNOT(sdl, VectorInduction.getValue(1), CCVT), - SetCC)); + setValue(&I, SetCC); return; } case Intrinsic::experimental_vector_insert: { @@ -7317,32 +7322,26 @@ static unsigned getISDForVPIntrinsic(const VPIntrinsic &VPIntrin) { void SelectionDAGBuilder::visitVPLoadGather(const VPIntrinsic &VPIntrin, EVT VT, SmallVector<SDValue, 7> &OpValues, - bool isGather) { + bool IsGather) { SDLoc DL = getCurSDLoc(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); Value *PtrOperand = VPIntrin.getArgOperand(0); - MaybeAlign Alignment = DAG.getEVTAlign(VT); + MaybeAlign Alignment = VPIntrin.getPointerAlignment(); + if (!Alignment) + Alignment = DAG.getEVTAlign(VT); AAMDNodes AAInfo = VPIntrin.getAAMetadata(); const MDNode *Ranges = 
VPIntrin.getMetadata(LLVMContext::MD_range); SDValue LD; bool AddToChain = true; - if (!isGather) { + if (!IsGather) { // Do not serialize variable-length loads of constant memory with // anything. - MemoryLocation ML; - if (VT.isScalableVector()) - ML = MemoryLocation::getAfter(PtrOperand); - else - ML = MemoryLocation( - PtrOperand, - LocationSize::precise( - DAG.getDataLayout().getTypeStoreSize(VPIntrin.getType())), - AAInfo); + MemoryLocation ML = MemoryLocation::getAfter(PtrOperand, AAInfo); AddToChain = !AA || !AA->pointsToConstantMemory(ML); SDValue InChain = AddToChain ? DAG.getRoot() : DAG.getEntryNode(); MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( MachinePointerInfo(PtrOperand), MachineMemOperand::MOLoad, - VT.getStoreSize().getKnownMinSize(), *Alignment, AAInfo, Ranges); + MemoryLocation::UnknownSize, *Alignment, AAInfo, Ranges); LD = DAG.getLoadVP(VT, DL, InChain, OpValues[0], OpValues[1], OpValues[2], MMO, false /*IsExpanding */); } else { @@ -7380,18 +7379,20 @@ void SelectionDAGBuilder::visitVPLoadGather(const VPIntrinsic &VPIntrin, EVT VT, void SelectionDAGBuilder::visitVPStoreScatter(const VPIntrinsic &VPIntrin, SmallVector<SDValue, 7> &OpValues, - bool isScatter) { + bool IsScatter) { SDLoc DL = getCurSDLoc(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); Value *PtrOperand = VPIntrin.getArgOperand(1); EVT VT = OpValues[0].getValueType(); - MaybeAlign Alignment = DAG.getEVTAlign(VT); + MaybeAlign Alignment = VPIntrin.getPointerAlignment(); + if (!Alignment) + Alignment = DAG.getEVTAlign(VT); AAMDNodes AAInfo = VPIntrin.getAAMetadata(); SDValue ST; - if (!isScatter) { + if (!IsScatter) { MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( MachinePointerInfo(PtrOperand), MachineMemOperand::MOStore, - VT.getStoreSize().getKnownMinSize(), *Alignment, AAInfo); + MemoryLocation::UnknownSize, *Alignment, AAInfo); ST = DAG.getStoreVP(getMemoryRoot(), DL, OpValues[0], OpValues[1], OpValues[2], 
OpValues[3], MMO, false /* IsTruncating */); @@ -7690,8 +7691,9 @@ static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT, LoadInput = ConstantExpr::getBitCast(const_cast<Constant *>(LoadInput), PointerType::getUnqual(LoadTy)); - if (const Constant *LoadCst = ConstantFoldLoadFromConstPtr( - const_cast<Constant *>(LoadInput), LoadTy, *Builder.DL)) + if (const Constant *LoadCst = + ConstantFoldLoadFromConstPtr(const_cast<Constant *>(LoadInput), + LoadTy, Builder.DAG.getDataLayout())) return Builder.getValue(LoadCst); } @@ -9646,8 +9648,8 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { // We push in swifterror return as the last element of CLI.Ins. ArgListTy &Args = CLI.getArgs(); if (supportSwiftError()) { - for (unsigned i = 0, e = Args.size(); i != e; ++i) { - if (Args[i].IsSwiftError) { + for (const ArgListEntry &Arg : Args) { + if (Arg.IsSwiftError) { ISD::InputArg MyFlags; MyFlags.VT = getPointerTy(DL); MyFlags.ArgVT = EVT(getPointerTy(DL)); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index d6122aa0a739..ea48042a5dcf 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -190,7 +190,6 @@ public: static const unsigned LowestSDNodeOrder = 1; SelectionDAG &DAG; - const DataLayout *DL = nullptr; AAResults *AA = nullptr; const TargetLibraryInfo *LibInfo; @@ -568,9 +567,9 @@ private: void visitTargetIntrinsic(const CallInst &I, unsigned Intrinsic); void visitConstrainedFPIntrinsic(const ConstrainedFPIntrinsic &FPI); void visitVPLoadGather(const VPIntrinsic &VPIntrin, EVT VT, - SmallVector<SDValue, 7> &OpValues, bool isGather); + SmallVector<SDValue, 7> &OpValues, bool IsGather); void visitVPStoreScatter(const VPIntrinsic &VPIntrin, - SmallVector<SDValue, 7> &OpValues, bool isScatter); + SmallVector<SDValue, 7> &OpValues, bool IsScatter); void visitVectorPredicationIntrinsic(const 
VPIntrinsic &VPIntrin); void visitVAStart(const CallInst &I); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index c7e37cf8ca14..77e11b364588 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -297,7 +297,7 @@ TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, #ifndef NDEBUG dbgs() << "If a target marks an instruction with " "'usesCustomInserter', it must implement " - "TargetLowering::EmitInstrWithCustomInserter!"; + "TargetLowering::EmitInstrWithCustomInserter!\n"; #endif llvm_unreachable(nullptr); } @@ -1784,27 +1784,25 @@ SelectionDAGISel::FinishBasicBlock() { } // Update PHI Nodes - for (unsigned pi = 0, pe = FuncInfo->PHINodesToUpdate.size(); - pi != pe; ++pi) { - MachineInstrBuilder PHI(*MF, FuncInfo->PHINodesToUpdate[pi].first); + for (const std::pair<MachineInstr *, unsigned> &P : + FuncInfo->PHINodesToUpdate) { + MachineInstrBuilder PHI(*MF, P.first); MachineBasicBlock *PHIBB = PHI->getParent(); assert(PHI->isPHI() && "This is not a machine PHI node that we are updating!"); // This is "default" BB. We have two jumps to it. From "header" BB and // from last "case" BB, unless the latter was skipped. if (PHIBB == BTB.Default) { - PHI.addReg(FuncInfo->PHINodesToUpdate[pi].second).addMBB(BTB.Parent); + PHI.addReg(P.second).addMBB(BTB.Parent); if (!BTB.ContiguousRange) { - PHI.addReg(FuncInfo->PHINodesToUpdate[pi].second) - .addMBB(BTB.Cases.back().ThisBB); + PHI.addReg(P.second).addMBB(BTB.Cases.back().ThisBB); } } // One of "cases" BB. 
- for (unsigned j = 0, ej = BTB.Cases.size(); - j != ej; ++j) { - MachineBasicBlock* cBB = BTB.Cases[j].ThisBB; + for (const SwitchCG::BitTestCase &BT : BTB.Cases) { + MachineBasicBlock* cBB = BT.ThisBB; if (cBB->isSuccessor(PHIBB)) - PHI.addReg(FuncInfo->PHINodesToUpdate[pi].second).addMBB(cBB); + PHI.addReg(P.second).addMBB(cBB); } } } diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 737695b5eabe..e6b06ab93d6b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -3136,6 +3136,19 @@ bool TargetLowering::isKnownNeverNaNForTargetNode(SDValue Op, return false; } +bool TargetLowering::isSplatValueForTargetNode(SDValue Op, + const APInt &DemandedElts, + APInt &UndefElts, + unsigned Depth) const { + assert((Op.getOpcode() >= ISD::BUILTIN_OP_END || + Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN || + Op.getOpcode() == ISD::INTRINSIC_W_CHAIN || + Op.getOpcode() == ISD::INTRINSIC_VOID) && + "Should use isSplatValue if you don't know whether Op" + " is a target node!"); + return false; +} + // FIXME: Ideally, this would use ISD::isConstantSplatVector(), but that must // work with truncating build vectors and vectors with elements of less than // 8 bits. @@ -4853,13 +4866,9 @@ TargetLowering::ParseConstraints(const DataLayout &DL, } // Now select chosen alternative in each constraint. - for (unsigned cIndex = 0, eIndex = ConstraintOperands.size(); - cIndex != eIndex; ++cIndex) { - AsmOperandInfo &cInfo = ConstraintOperands[cIndex]; - if (cInfo.Type == InlineAsm::isClobber) - continue; - cInfo.selectAlternative(bestMAIndex); - } + for (AsmOperandInfo &cInfo : ConstraintOperands) + if (cInfo.Type != InlineAsm::isClobber) + cInfo.selectAlternative(bestMAIndex); } } @@ -4927,9 +4936,9 @@ TargetLowering::ConstraintWeight ConstraintWeight BestWeight = CW_Invalid; // Loop over the options, keeping track of the most general one. 
- for (unsigned i = 0, e = rCodes->size(); i != e; ++i) { + for (const std::string &rCode : *rCodes) { ConstraintWeight weight = - getSingleConstraintMatchWeight(info, (*rCodes)[i].c_str()); + getSingleConstraintMatchWeight(info, rCode.c_str()); if (weight > BestWeight) BestWeight = weight; } @@ -6550,15 +6559,15 @@ static bool isNonZeroModBitWidthOrUndef(SDValue Z, unsigned BW) { true); } -bool TargetLowering::expandFunnelShift(SDNode *Node, SDValue &Result, - SelectionDAG &DAG) const { +SDValue TargetLowering::expandFunnelShift(SDNode *Node, + SelectionDAG &DAG) const { EVT VT = Node->getValueType(0); if (VT.isVector() && (!isOperationLegalOrCustom(ISD::SHL, VT) || !isOperationLegalOrCustom(ISD::SRL, VT) || !isOperationLegalOrCustom(ISD::SUB, VT) || !isOperationLegalOrCustomOrPromote(ISD::OR, VT))) - return false; + return SDValue(); SDValue X = Node->getOperand(0); SDValue Y = Node->getOperand(1); @@ -6592,8 +6601,7 @@ bool TargetLowering::expandFunnelShift(SDNode *Node, SDValue &Result, } Z = DAG.getNOT(DL, Z, ShVT); } - Result = DAG.getNode(RevOpcode, DL, VT, X, Y, Z); - return true; + return DAG.getNode(RevOpcode, DL, VT, X, Y, Z); } SDValue ShX, ShY; @@ -6633,13 +6641,12 @@ bool TargetLowering::expandFunnelShift(SDNode *Node, SDValue &Result, ShY = DAG.getNode(ISD::SRL, DL, VT, Y, ShAmt); } } - Result = DAG.getNode(ISD::OR, DL, VT, ShX, ShY); - return true; + return DAG.getNode(ISD::OR, DL, VT, ShX, ShY); } // TODO: Merge with expandFunnelShift. 
-bool TargetLowering::expandROT(SDNode *Node, bool AllowVectorOps, - SDValue &Result, SelectionDAG &DAG) const { +SDValue TargetLowering::expandROT(SDNode *Node, bool AllowVectorOps, + SelectionDAG &DAG) const { EVT VT = Node->getValueType(0); unsigned EltSizeInBits = VT.getScalarSizeInBits(); bool IsLeft = Node->getOpcode() == ISD::ROTL; @@ -6650,12 +6657,12 @@ bool TargetLowering::expandROT(SDNode *Node, bool AllowVectorOps, EVT ShVT = Op1.getValueType(); SDValue Zero = DAG.getConstant(0, DL, ShVT); - // If a rotate in the other direction is supported, use it. + // If a rotate in the other direction is more supported, use it. unsigned RevRot = IsLeft ? ISD::ROTR : ISD::ROTL; - if (isOperationLegalOrCustom(RevRot, VT) && isPowerOf2_32(EltSizeInBits)) { + if (!isOperationLegalOrCustom(Node->getOpcode(), VT) && + isOperationLegalOrCustom(RevRot, VT) && isPowerOf2_32(EltSizeInBits)) { SDValue Sub = DAG.getNode(ISD::SUB, DL, ShVT, Zero, Op1); - Result = DAG.getNode(RevRot, DL, VT, Op0, Sub); - return true; + return DAG.getNode(RevRot, DL, VT, Op0, Sub); } if (!AllowVectorOps && VT.isVector() && @@ -6664,7 +6671,7 @@ bool TargetLowering::expandROT(SDNode *Node, bool AllowVectorOps, !isOperationLegalOrCustom(ISD::SUB, VT) || !isOperationLegalOrCustomOrPromote(ISD::OR, VT) || !isOperationLegalOrCustomOrPromote(ISD::AND, VT))) - return false; + return SDValue(); unsigned ShOpc = IsLeft ? ISD::SHL : ISD::SRL; unsigned HsOpc = IsLeft ? 
ISD::SRL : ISD::SHL; @@ -6690,8 +6697,7 @@ bool TargetLowering::expandROT(SDNode *Node, bool AllowVectorOps, HsVal = DAG.getNode(HsOpc, DL, VT, DAG.getNode(HsOpc, DL, VT, Op0, One), HsAmt); } - Result = DAG.getNode(ISD::OR, DL, VT, ShVal, HsVal); - return true; + return DAG.getNode(ISD::OR, DL, VT, ShVal, HsVal); } void TargetLowering::expandShiftParts(SDNode *Node, SDValue &Lo, SDValue &Hi, @@ -8048,7 +8054,8 @@ SDValue TargetLowering::expandIntMINMAX(SDNode *Node, SelectionDAG &DAG) const { if (VT.isVector() && !isOperationLegalOrCustom(ISD::VSELECT, VT)) return DAG.UnrollVectorOp(Node); - SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC); + EVT BoolVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); + SDValue Cond = DAG.getSetCC(DL, BoolVT, Op0, Op1, CC); return DAG.getSelect(DL, VT, Cond, Op0, Op1); } diff --git a/llvm/lib/CodeGen/ShadowStackGCLowering.cpp b/llvm/lib/CodeGen/ShadowStackGCLowering.cpp index 86b559fd6413..43a54ce33bf0 100644 --- a/llvm/lib/CodeGen/ShadowStackGCLowering.cpp +++ b/llvm/lib/CodeGen/ShadowStackGCLowering.cpp @@ -162,8 +162,8 @@ Type *ShadowStackGCLowering::GetConcreteStackEntryType(Function &F) { // doInitialization creates the generic version of this type. std::vector<Type *> EltTys; EltTys.push_back(StackEntryTy); - for (size_t I = 0; I != Roots.size(); I++) - EltTys.push_back(Roots[I].second->getAllocatedType()); + for (const std::pair<CallInst *, AllocaInst *> &Root : Roots) + EltTys.push_back(Root.second->getAllocatedType()); return StructType::create(EltTys, ("gc_stackentry." 
+ F.getName()).str()); } @@ -240,8 +240,8 @@ void ShadowStackGCLowering::CollectRoots(Function &F) { SmallVector<std::pair<CallInst *, AllocaInst *>, 16> MetaRoots; for (BasicBlock &BB : F) - for (BasicBlock::iterator II = BB.begin(), E = BB.end(); II != E;) - if (IntrinsicInst *CI = dyn_cast<IntrinsicInst>(II++)) + for (Instruction &I : BB) + if (IntrinsicInst *CI = dyn_cast<IntrinsicInst>(&I)) if (Function *F = CI->getCalledFunction()) if (F->getIntrinsicID() == Intrinsic::gcroot) { std::pair<CallInst *, AllocaInst *> Pair = std::make_pair( @@ -377,9 +377,9 @@ bool ShadowStackGCLowering::runOnFunction(Function &F) { // Delete the original allocas (which are no longer used) and the intrinsic // calls (which are no longer valid). Doing this last avoids invalidating // iterators. - for (unsigned I = 0, E = Roots.size(); I != E; ++I) { - Roots[I].first->eraseFromParent(); - Roots[I].second->eraseFromParent(); + for (std::pair<CallInst *, AllocaInst *> &Root : Roots) { + Root.first->eraseFromParent(); + Root.second->eraseFromParent(); } Roots.clear(); diff --git a/llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp b/llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp index 5ccfacfc26dc..3640296adbca 100644 --- a/llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp +++ b/llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp @@ -131,15 +131,15 @@ bool StackMapLiveness::calculateLiveness(MachineFunction &MF) { bool HasStackMap = false; // Reverse iterate over all instructions and add the current live register // set to an instruction if we encounter a patchpoint instruction. 
- for (auto I = MBB.rbegin(), E = MBB.rend(); I != E; ++I) { - if (I->getOpcode() == TargetOpcode::PATCHPOINT) { - addLiveOutSetToMI(MF, *I); + for (MachineInstr &MI : llvm::reverse(MBB)) { + if (MI.getOpcode() == TargetOpcode::PATCHPOINT) { + addLiveOutSetToMI(MF, MI); HasChanged = true; HasStackMap = true; ++NumStackMaps; } - LLVM_DEBUG(dbgs() << " " << LiveRegs << " " << *I); - LiveRegs.stepBackward(*I); + LLVM_DEBUG(dbgs() << " " << LiveRegs << " " << MI); + LiveRegs.stepBackward(MI); } ++NumBBsVisited; if (!HasStackMap) diff --git a/llvm/lib/CodeGen/StackProtector.cpp b/llvm/lib/CodeGen/StackProtector.cpp index 7445f77c955d..6765fd274686 100644 --- a/llvm/lib/CodeGen/StackProtector.cpp +++ b/llvm/lib/CodeGen/StackProtector.cpp @@ -162,7 +162,7 @@ bool StackProtector::ContainsProtectableArray(Type *Ty, bool &IsLarge, } bool StackProtector::HasAddressTaken(const Instruction *AI, - uint64_t AllocSize) { + TypeSize AllocSize) { const DataLayout &DL = M->getDataLayout(); for (const User *U : AI->users()) { const auto *I = cast<Instruction>(U); @@ -170,7 +170,8 @@ bool StackProtector::HasAddressTaken(const Instruction *AI, // the bounds of the allocated object. Optional<MemoryLocation> MemLoc = MemoryLocation::getOrNone(I); if (MemLoc.hasValue() && MemLoc->Size.hasValue() && - MemLoc->Size.getValue() > AllocSize) + !TypeSize::isKnownGE(AllocSize, + TypeSize::getFixed(MemLoc->Size.getValue()))) return true; switch (I->getOpcode()) { case Instruction::Store: @@ -203,13 +204,19 @@ bool StackProtector::HasAddressTaken(const Instruction *AI, // would use it could also be out-of-bounds meaning stack protection is // required. 
const GetElementPtrInst *GEP = cast<GetElementPtrInst>(I); - unsigned TypeSize = DL.getIndexTypeSizeInBits(I->getType()); - APInt Offset(TypeSize, 0); - APInt MaxOffset(TypeSize, AllocSize); - if (!GEP->accumulateConstantOffset(DL, Offset) || Offset.ugt(MaxOffset)) + unsigned IndexSize = DL.getIndexTypeSizeInBits(I->getType()); + APInt Offset(IndexSize, 0); + if (!GEP->accumulateConstantOffset(DL, Offset)) + return true; + TypeSize OffsetSize = TypeSize::Fixed(Offset.getLimitedValue()); + if (!TypeSize::isKnownGT(AllocSize, OffsetSize)) return true; // Adjust AllocSize to be the space remaining after this offset. - if (HasAddressTaken(I, AllocSize - Offset.getLimitedValue())) + // We can't subtract a fixed size from a scalable one, so in that case + // assume the scalable value is of minimum size. + TypeSize NewAllocSize = + TypeSize::Fixed(AllocSize.getKnownMinValue()) - OffsetSize; + if (HasAddressTaken(I, NewAllocSize)) return true; break; } diff --git a/llvm/lib/CodeGen/StackSlotColoring.cpp b/llvm/lib/CodeGen/StackSlotColoring.cpp index f49ba5ccd447..17e6f51d0899 100644 --- a/llvm/lib/CodeGen/StackSlotColoring.cpp +++ b/llvm/lib/CodeGen/StackSlotColoring.cpp @@ -325,8 +325,7 @@ bool StackSlotColoring::ColorSlots(MachineFunction &MF) { LLVM_DEBUG(dbgs() << "Color spill slot intervals:\n"); bool Changed = false; - for (unsigned i = 0, e = SSIntervals.size(); i != e; ++i) { - LiveInterval *li = SSIntervals[i]; + for (LiveInterval *li : SSIntervals) { int SS = Register::stackSlot2Index(li->reg()); int NewSS = ColorSlot(li); assert(NewSS >= 0 && "Stack coloring failed?"); @@ -338,8 +337,7 @@ bool StackSlotColoring::ColorSlots(MachineFunction &MF) { } LLVM_DEBUG(dbgs() << "\nSpill slots after coloring:\n"); - for (unsigned i = 0, e = SSIntervals.size(); i != e; ++i) { - LiveInterval *li = SSIntervals[i]; + for (LiveInterval *li : SSIntervals) { int SS = Register::stackSlot2Index(li->reg()); li->setWeight(SlotWeights[SS]); } @@ -347,8 +345,8 @@ bool 
StackSlotColoring::ColorSlots(MachineFunction &MF) { llvm::stable_sort(SSIntervals, IntervalSorter()); #ifndef NDEBUG - for (unsigned i = 0, e = SSIntervals.size(); i != e; ++i) - LLVM_DEBUG(SSIntervals[i]->dump()); + for (LiveInterval *li : SSIntervals) + LLVM_DEBUG(li->dump()); LLVM_DEBUG(dbgs() << '\n'); #endif diff --git a/llvm/lib/CodeGen/TailDuplicator.cpp b/llvm/lib/CodeGen/TailDuplicator.cpp index 54fc6ee45d00..68a7b80d6146 100644 --- a/llvm/lib/CodeGen/TailDuplicator.cpp +++ b/llvm/lib/CodeGen/TailDuplicator.cpp @@ -207,35 +207,34 @@ bool TailDuplicator::tailDuplicateAndUpdate( // Add the new vregs as available values. DenseMap<Register, AvailableValsTy>::iterator LI = SSAUpdateVals.find(VReg); - for (unsigned j = 0, ee = LI->second.size(); j != ee; ++j) { - MachineBasicBlock *SrcBB = LI->second[j].first; - Register SrcReg = LI->second[j].second; + for (std::pair<MachineBasicBlock *, Register> &J : LI->second) { + MachineBasicBlock *SrcBB = J.first; + Register SrcReg = J.second; SSAUpdate.AddAvailableValue(SrcBB, SrcReg); } + SmallVector<MachineOperand *> DebugUses; // Rewrite uses that are outside of the original def's block. - MachineRegisterInfo::use_iterator UI = MRI->use_begin(VReg); - // Only remove instructions after loop, as DBG_VALUE_LISTs with multiple - // uses of VReg may invalidate the use iterator when erased. - SmallPtrSet<MachineInstr *, 4> InstrsToRemove; - while (UI != MRI->use_end()) { - MachineOperand &UseMO = *UI; + for (MachineOperand &UseMO : + llvm::make_early_inc_range(MRI->use_operands(VReg))) { MachineInstr *UseMI = UseMO.getParent(); - ++UI; + // Rewrite debug uses last so that they can take advantage of any + // register mappings introduced by other users in its BB, since we + // cannot create new register definitions specifically for the debug + // instruction (as debug instructions should not affect CodeGen). if (UseMI->isDebugValue()) { - // SSAUpdate can replace the use with an undef. 
That creates - // a debug instruction that is a kill. - // FIXME: Should it SSAUpdate job to delete debug instructions - // instead of replacing the use with undef? - InstrsToRemove.insert(UseMI); + DebugUses.push_back(&UseMO); continue; } if (UseMI->getParent() == DefBB && !UseMI->isPHI()) continue; SSAUpdate.RewriteUse(UseMO); } - for (auto *MI : InstrsToRemove) - MI->eraseFromParent(); + for (auto *UseMO : DebugUses) { + MachineInstr *UseMI = UseMO->getParent(); + UseMO->setReg( + SSAUpdate.GetValueInMiddleOfBlock(UseMI->getParent(), true)); + } } SSAUpdateVRs.clear(); @@ -511,8 +510,8 @@ void TailDuplicator::updateSuccessorsPHIs( SSAUpdateVals.find(Reg); if (LI != SSAUpdateVals.end()) { // This register is defined in the tail block. - for (unsigned j = 0, ee = LI->second.size(); j != ee; ++j) { - MachineBasicBlock *SrcBB = LI->second[j].first; + for (const std::pair<MachineBasicBlock *, Register> &J : LI->second) { + MachineBasicBlock *SrcBB = J.first; // If we didn't duplicate a bb into a particular predecessor, we // might still have added an entry to SSAUpdateVals to correcly // recompute SSA. If that case, avoid adding a dummy extra argument @@ -520,7 +519,7 @@ void TailDuplicator::updateSuccessorsPHIs( if (!SrcBB->isSuccessor(SuccBB)) continue; - Register SrcReg = LI->second[j].second; + Register SrcReg = J.second; if (Idx != 0) { MI.getOperand(Idx).setReg(SrcReg); MI.getOperand(Idx + 1).setMBB(SrcBB); @@ -531,8 +530,7 @@ void TailDuplicator::updateSuccessorsPHIs( } } else { // Live in tail block, must also be live in predecessors. 
- for (unsigned j = 0, ee = TDBBs.size(); j != ee; ++j) { - MachineBasicBlock *SrcBB = TDBBs[j]; + for (MachineBasicBlock *SrcBB : TDBBs) { if (Idx != 0) { MI.getOperand(Idx).setReg(Reg); MI.getOperand(Idx + 1).setMBB(SrcBB); diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp index 5119dac36713..3f22cc4289f2 100644 --- a/llvm/lib/CodeGen/TargetInstrInfo.cpp +++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp @@ -436,7 +436,7 @@ MachineInstr &TargetInstrInfo::duplicate(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const MachineInstr &Orig) const { assert(!Orig.isNotDuplicable() && "Instruction cannot be duplicated"); MachineFunction &MF = *MBB.getParent(); - return MF.CloneMachineInstrBundle(MBB, InsertBefore, Orig); + return MF.cloneMachineInstrBundle(MBB, InsertBefore, Orig); } // If the COPY instruction in MI can be folded to a stack operation, return @@ -1418,3 +1418,16 @@ void TargetInstrInfo::mergeOutliningCandidateAttributes( })) F.addFnAttr(Attribute::NoUnwind); } + +bool TargetInstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB, + unsigned &Flags) const { + // Some instrumentations create special TargetOpcode at the start which + // expands to special code sequences which must be present. + auto First = MBB.getFirstNonDebugInstr(); + if (First != MBB.end() && + (First->getOpcode() == TargetOpcode::FENTRY_CALL || + First->getOpcode() == TargetOpcode::PATCHABLE_FUNCTION_ENTER)) + return false; + + return true; +} diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index c0a7efff9e98..6fc6881f8736 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -1187,7 +1187,7 @@ TargetLoweringBase::emitPatchPoint(MachineInstr &InitialMI, // all stack slots), but we need to handle the different type of stackmap // operands and memory effects here. 
- if (!llvm::any_of(MI->operands(), + if (llvm::none_of(MI->operands(), [](MachineOperand &Operand) { return Operand.isFI(); })) return MBB; diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp index f4bb71535f7f..f5cb518fce3e 100644 --- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp +++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp @@ -248,8 +248,8 @@ static void getAllocatableSetForRC(const MachineFunction &MF, const TargetRegisterClass *RC, BitVector &R){ assert(RC->isAllocatable() && "invalid for nonallocatable sets"); ArrayRef<MCPhysReg> Order = RC->getRawAllocationOrder(MF); - for (unsigned i = 0; i != Order.size(); ++i) - R.set(Order[i]); + for (MCPhysReg PR : Order) + R.set(PR); } BitVector TargetRegisterInfo::getAllocatableSet(const MachineFunction &MF, diff --git a/llvm/lib/CodeGen/UnreachableBlockElim.cpp b/llvm/lib/CodeGen/UnreachableBlockElim.cpp index c9a19948ff2f..3426a03b6083 100644 --- a/llvm/lib/CodeGen/UnreachableBlockElim.cpp +++ b/llvm/lib/CodeGen/UnreachableBlockElim.cpp @@ -144,23 +144,22 @@ bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) { } // Actually remove the blocks now. - for (unsigned i = 0, e = DeadBlocks.size(); i != e; ++i) { + for (MachineBasicBlock *BB : DeadBlocks) { // Remove any call site information for calls in the block. - for (auto &I : DeadBlocks[i]->instrs()) + for (auto &I : BB->instrs()) if (I.shouldUpdateCallSiteInfo()) - DeadBlocks[i]->getParent()->eraseCallSiteInfo(&I); + BB->getParent()->eraseCallSiteInfo(&I); - DeadBlocks[i]->eraseFromParent(); + BB->eraseFromParent(); } // Cleanup PHI nodes. - for (MachineFunction::iterator I = F.begin(), E = F.end(); I != E; ++I) { - MachineBasicBlock *BB = &*I; + for (MachineBasicBlock &BB : F) { // Prune unneeded PHI entries. 
- SmallPtrSet<MachineBasicBlock*, 8> preds(BB->pred_begin(), - BB->pred_end()); - MachineBasicBlock::iterator phi = BB->begin(); - while (phi != BB->end() && phi->isPHI()) { + SmallPtrSet<MachineBasicBlock*, 8> preds(BB.pred_begin(), + BB.pred_end()); + MachineBasicBlock::iterator phi = BB.begin(); + while (phi != BB.end() && phi->isPHI()) { for (unsigned i = phi->getNumOperands() - 1; i >= 2; i-=2) if (!preds.count(phi->getOperand(i).getMBB())) { phi->RemoveOperand(i); @@ -189,7 +188,7 @@ bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) { // insert a COPY instead of simply replacing the output // with the input. const TargetInstrInfo *TII = F.getSubtarget().getInstrInfo(); - BuildMI(*BB, BB->getFirstNonPHI(), phi->getDebugLoc(), + BuildMI(BB, BB.getFirstNonPHI(), phi->getDebugLoc(), TII->get(TargetOpcode::COPY), OutputReg) .addReg(InputReg, getRegState(Input), InputSub); } diff --git a/llvm/lib/CodeGen/VLIWMachineScheduler.cpp b/llvm/lib/CodeGen/VLIWMachineScheduler.cpp new file mode 100644 index 000000000000..cbc5d9ec169b --- /dev/null +++ b/llvm/lib/CodeGen/VLIWMachineScheduler.cpp @@ -0,0 +1,1009 @@ +//===- VLIWMachineScheduler.cpp - VLIW-Focused Scheduling Pass ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// MachineScheduler schedules machine instructions after phi elimination. It +// preserves LiveIntervals so it can be invoked before register allocation. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/VLIWMachineScheduler.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/DFAPacketizer.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/CodeGen/RegisterPressure.h" +#include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/CodeGen/ScheduleHazardRecognizer.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetOpcodes.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/CodeGen/TargetSchedule.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include <algorithm> +#include <cassert> +#include <iomanip> +#include <limits> +#include <memory> +#include <sstream> + +using namespace llvm; + +#define DEBUG_TYPE "machine-scheduler" + +static cl::opt<bool> IgnoreBBRegPressure("ignore-bb-reg-pressure", cl::Hidden, + cl::ZeroOrMore, cl::init(false)); + +static cl::opt<bool> UseNewerCandidate("use-newer-candidate", cl::Hidden, + cl::ZeroOrMore, cl::init(true)); + +static cl::opt<unsigned> SchedDebugVerboseLevel("misched-verbose-level", + cl::Hidden, cl::ZeroOrMore, + cl::init(1)); + +// Check if the scheduler should penalize instructions that are available too +// early due to a zero-latency dependence. +static cl::opt<bool> CheckEarlyAvail("check-early-avail", cl::Hidden, + cl::ZeroOrMore, cl::init(true)); + +// This value is used to determine if a register class is a high pressure set. +// We compute the maximum number of registers needed and divide it by the total +// available. Then, we compare the result to this value. 
+static cl::opt<float> RPThreshold("vliw-misched-reg-pressure", cl::Hidden, + cl::init(0.75f), + cl::desc("High register pressure threhold.")); + +VLIWResourceModel::VLIWResourceModel(const TargetSubtargetInfo &STI, + const TargetSchedModel *SM) + : TII(STI.getInstrInfo()), SchedModel(SM) { + ResourcesModel = createPacketizer(STI); + + // This hard requirement could be relaxed, + // but for now do not let it proceed. + assert(ResourcesModel && "Unimplemented CreateTargetScheduleState."); + + Packet.reserve(SchedModel->getIssueWidth()); + Packet.clear(); + ResourcesModel->clearResources(); +} + +void VLIWResourceModel::reset() { + Packet.clear(); + ResourcesModel->clearResources(); +} + +VLIWResourceModel::~VLIWResourceModel() { delete ResourcesModel; } + +/// Return true if there is a dependence between SUd and SUu. +bool VLIWResourceModel::hasDependence(const SUnit *SUd, const SUnit *SUu) { + if (SUd->Succs.size() == 0) + return false; + + for (const auto &S : SUd->Succs) { + // Since we do not add pseudos to packets, might as well + // ignore order dependencies. + if (S.isCtrl()) + continue; + + if (S.getSUnit() == SUu && S.getLatency() > 0) + return true; + } + return false; +} + +/// Check if scheduling of this SU is possible +/// in the current packet. +/// It is _not_ precise (stateful), it is more like +/// another heuristic. Many corner cases are figured +/// empirically. +bool VLIWResourceModel::isResourceAvailable(SUnit *SU, bool IsTop) { + if (!SU || !SU->getInstr()) + return false; + + // First see if the pipeline could receive this instruction + // in the current cycle. 
+ switch (SU->getInstr()->getOpcode()) { + default: + if (!ResourcesModel->canReserveResources(*SU->getInstr())) + return false; + break; + case TargetOpcode::EXTRACT_SUBREG: + case TargetOpcode::INSERT_SUBREG: + case TargetOpcode::SUBREG_TO_REG: + case TargetOpcode::REG_SEQUENCE: + case TargetOpcode::IMPLICIT_DEF: + case TargetOpcode::COPY: + case TargetOpcode::INLINEASM: + case TargetOpcode::INLINEASM_BR: + break; + } + + // Now see if there are no other dependencies to instructions already + // in the packet. + if (IsTop) { + for (unsigned i = 0, e = Packet.size(); i != e; ++i) + if (hasDependence(Packet[i], SU)) + return false; + } else { + for (unsigned i = 0, e = Packet.size(); i != e; ++i) + if (hasDependence(SU, Packet[i])) + return false; + } + return true; +} + +/// Keep track of available resources. +bool VLIWResourceModel::reserveResources(SUnit *SU, bool IsTop) { + bool startNewCycle = false; + // Artificially reset state. + if (!SU) { + reset(); + TotalPackets++; + return false; + } + // If this SU does not fit in the packet or the packet is now full + // start a new one. 
+ if (!isResourceAvailable(SU, IsTop) || + Packet.size() >= SchedModel->getIssueWidth()) { + reset(); + TotalPackets++; + startNewCycle = true; + } + + switch (SU->getInstr()->getOpcode()) { + default: + ResourcesModel->reserveResources(*SU->getInstr()); + break; + case TargetOpcode::EXTRACT_SUBREG: + case TargetOpcode::INSERT_SUBREG: + case TargetOpcode::SUBREG_TO_REG: + case TargetOpcode::REG_SEQUENCE: + case TargetOpcode::IMPLICIT_DEF: + case TargetOpcode::KILL: + case TargetOpcode::CFI_INSTRUCTION: + case TargetOpcode::EH_LABEL: + case TargetOpcode::COPY: + case TargetOpcode::INLINEASM: + case TargetOpcode::INLINEASM_BR: + break; + } + Packet.push_back(SU); + +#ifndef NDEBUG + LLVM_DEBUG(dbgs() << "Packet[" << TotalPackets << "]:\n"); + for (unsigned i = 0, e = Packet.size(); i != e; ++i) { + LLVM_DEBUG(dbgs() << "\t[" << i << "] SU("); + LLVM_DEBUG(dbgs() << Packet[i]->NodeNum << ")\t"); + LLVM_DEBUG(Packet[i]->getInstr()->dump()); + } +#endif + + return startNewCycle; +} + +DFAPacketizer * +VLIWResourceModel::createPacketizer(const TargetSubtargetInfo &STI) const { + return STI.getInstrInfo()->CreateTargetScheduleState(STI); +} + +/// schedule - Called back from MachineScheduler::runOnMachineFunction +/// after setting up the current scheduling region. [RegionBegin, RegionEnd) +/// only includes instructions that have DAG nodes, not scheduling boundaries. +void VLIWMachineScheduler::schedule() { + LLVM_DEBUG(dbgs() << "********** MI Converging Scheduling VLIW " + << printMBBReference(*BB) << " " << BB->getName() + << " in_func " << BB->getParent()->getName() + << " at loop depth " << MLI->getLoopDepth(BB) << " \n"); + + buildDAGWithRegPressure(); + + Topo.InitDAGTopologicalSorting(); + + // Postprocess the DAG to add platform-specific artificial dependencies. + postprocessDAG(); + + SmallVector<SUnit *, 8> TopRoots, BotRoots; + findRootsAndBiasEdges(TopRoots, BotRoots); + + // Initialize the strategy before modifying the DAG. 
+ SchedImpl->initialize(this); + + LLVM_DEBUG({ + unsigned maxH = 0; + for (const SUnit &SU : SUnits) + if (SU.getHeight() > maxH) + maxH = SU.getHeight(); + dbgs() << "Max Height " << maxH << "\n"; + }); + LLVM_DEBUG({ + unsigned maxD = 0; + for (const SUnit &SU : SUnits) + if (SU.getDepth() > maxD) + maxD = SU.getDepth(); + dbgs() << "Max Depth " << maxD << "\n"; + }); + LLVM_DEBUG(dump()); + if (ViewMISchedDAGs) + viewGraph(); + + initQueues(TopRoots, BotRoots); + + bool IsTopNode = false; + while (true) { + LLVM_DEBUG( + dbgs() << "** VLIWMachineScheduler::schedule picking next node\n"); + SUnit *SU = SchedImpl->pickNode(IsTopNode); + if (!SU) + break; + + if (!checkSchedLimit()) + break; + + scheduleMI(SU, IsTopNode); + + // Notify the scheduling strategy after updating the DAG. + SchedImpl->schedNode(SU, IsTopNode); + + updateQueues(SU, IsTopNode); + } + assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone."); + + placeDebugValues(); + + LLVM_DEBUG({ + dbgs() << "*** Final schedule for " + << printMBBReference(*begin()->getParent()) << " ***\n"; + dumpSchedule(); + dbgs() << '\n'; + }); +} + +void ConvergingVLIWScheduler::initialize(ScheduleDAGMI *dag) { + DAG = static_cast<VLIWMachineScheduler *>(dag); + SchedModel = DAG->getSchedModel(); + + Top.init(DAG, SchedModel); + Bot.init(DAG, SchedModel); + + // Initialize the HazardRecognizers. If itineraries don't exist, are empty, or + // are disabled, then these HazardRecs will be disabled. 
+ const InstrItineraryData *Itin = DAG->getSchedModel()->getInstrItineraries(); + const TargetSubtargetInfo &STI = DAG->MF.getSubtarget(); + const TargetInstrInfo *TII = STI.getInstrInfo(); + delete Top.HazardRec; + delete Bot.HazardRec; + Top.HazardRec = TII->CreateTargetMIHazardRecognizer(Itin, DAG); + Bot.HazardRec = TII->CreateTargetMIHazardRecognizer(Itin, DAG); + + delete Top.ResourceModel; + delete Bot.ResourceModel; + Top.ResourceModel = createVLIWResourceModel(STI, DAG->getSchedModel()); + Bot.ResourceModel = createVLIWResourceModel(STI, DAG->getSchedModel()); + + const std::vector<unsigned> &MaxPressure = + DAG->getRegPressure().MaxSetPressure; + HighPressureSets.assign(MaxPressure.size(), 0); + for (unsigned i = 0, e = MaxPressure.size(); i < e; ++i) { + unsigned Limit = DAG->getRegClassInfo()->getRegPressureSetLimit(i); + HighPressureSets[i] = + ((float)MaxPressure[i] > ((float)Limit * RPThreshold)); + } + + assert((!ForceTopDown || !ForceBottomUp) && + "-misched-topdown incompatible with -misched-bottomup"); +} + +VLIWResourceModel *ConvergingVLIWScheduler::createVLIWResourceModel( + const TargetSubtargetInfo &STI, const TargetSchedModel *SchedModel) const { + return new VLIWResourceModel(STI, SchedModel); +} + +void ConvergingVLIWScheduler::releaseTopNode(SUnit *SU) { + for (const SDep &PI : SU->Preds) { + unsigned PredReadyCycle = PI.getSUnit()->TopReadyCycle; + unsigned MinLatency = PI.getLatency(); +#ifndef NDEBUG + Top.MaxMinLatency = std::max(MinLatency, Top.MaxMinLatency); +#endif + if (SU->TopReadyCycle < PredReadyCycle + MinLatency) + SU->TopReadyCycle = PredReadyCycle + MinLatency; + } + + if (!SU->isScheduled) + Top.releaseNode(SU, SU->TopReadyCycle); +} + +void ConvergingVLIWScheduler::releaseBottomNode(SUnit *SU) { + assert(SU->getInstr() && "Scheduled SUnit must have instr"); + + for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); I != E; + ++I) { + unsigned SuccReadyCycle = I->getSUnit()->BotReadyCycle; + unsigned 
MinLatency = I->getLatency(); +#ifndef NDEBUG + Bot.MaxMinLatency = std::max(MinLatency, Bot.MaxMinLatency); +#endif + if (SU->BotReadyCycle < SuccReadyCycle + MinLatency) + SU->BotReadyCycle = SuccReadyCycle + MinLatency; + } + + if (!SU->isScheduled) + Bot.releaseNode(SU, SU->BotReadyCycle); +} + +ConvergingVLIWScheduler::VLIWSchedBoundary::~VLIWSchedBoundary() { + delete ResourceModel; + delete HazardRec; +} + +/// Does this SU have a hazard within the current instruction group. +/// +/// The scheduler supports two modes of hazard recognition. The first is the +/// ScheduleHazardRecognizer API. It is a fully general hazard recognizer that +/// supports highly complicated in-order reservation tables +/// (ScoreboardHazardRecognizer) and arbitrary target-specific logic. +/// +/// The second is a streamlined mechanism that checks for hazards based on +/// simple counters that the scheduler itself maintains. It explicitly checks +/// for instruction dispatch limitations, including the number of micro-ops that +/// can dispatch per cycle. +/// +/// TODO: Also check whether the SU must start a new group. +bool ConvergingVLIWScheduler::VLIWSchedBoundary::checkHazard(SUnit *SU) { + if (HazardRec->isEnabled()) + return HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard; + + unsigned uops = SchedModel->getNumMicroOps(SU->getInstr()); + if (IssueCount + uops > SchedModel->getIssueWidth()) + return true; + + return false; +} + +void ConvergingVLIWScheduler::VLIWSchedBoundary::releaseNode( + SUnit *SU, unsigned ReadyCycle) { + if (ReadyCycle < MinReadyCycle) + MinReadyCycle = ReadyCycle; + + // Check for interlocks first. For the purpose of other heuristics, an + // instruction that cannot issue appears as if it's not in the ReadyQueue. + if (ReadyCycle > CurrCycle || checkHazard(SU)) + + Pending.push(SU); + else + Available.push(SU); +} + +/// Move the boundary of scheduled code by one cycle. 
+void ConvergingVLIWScheduler::VLIWSchedBoundary::bumpCycle() { + unsigned Width = SchedModel->getIssueWidth(); + IssueCount = (IssueCount <= Width) ? 0 : IssueCount - Width; + + assert(MinReadyCycle < std::numeric_limits<unsigned>::max() && + "MinReadyCycle uninitialized"); + unsigned NextCycle = std::max(CurrCycle + 1, MinReadyCycle); + + if (!HazardRec->isEnabled()) { + // Bypass HazardRec virtual calls. + CurrCycle = NextCycle; + } else { + // Bypass getHazardType calls in case of long latency. + for (; CurrCycle != NextCycle; ++CurrCycle) { + if (isTop()) + HazardRec->AdvanceCycle(); + else + HazardRec->RecedeCycle(); + } + } + CheckPending = true; + + LLVM_DEBUG(dbgs() << "*** Next cycle " << Available.getName() << " cycle " + << CurrCycle << '\n'); +} + +/// Move the boundary of scheduled code by one SUnit. +void ConvergingVLIWScheduler::VLIWSchedBoundary::bumpNode(SUnit *SU) { + bool startNewCycle = false; + + // Update the reservation table. + if (HazardRec->isEnabled()) { + if (!isTop() && SU->isCall) { + // Calls are scheduled with their preceding instructions. For bottom-up + // scheduling, clear the pipeline state before emitting. + HazardRec->Reset(); + } + HazardRec->EmitInstruction(SU); + } + + // Update DFA model. + startNewCycle = ResourceModel->reserveResources(SU, isTop()); + + // Check the instruction group dispatch limit. + // TODO: Check if this SU must end a dispatch group. + IssueCount += SchedModel->getNumMicroOps(SU->getInstr()); + if (startNewCycle) { + LLVM_DEBUG(dbgs() << "*** Max instrs at cycle " << CurrCycle << '\n'); + bumpCycle(); + } else + LLVM_DEBUG(dbgs() << "*** IssueCount " << IssueCount << " at cycle " + << CurrCycle << '\n'); +} + +/// Release pending ready nodes in to the available queue. This makes them +/// visible to heuristics. +void ConvergingVLIWScheduler::VLIWSchedBoundary::releasePending() { + // If the available queue is empty, it is safe to reset MinReadyCycle. 
+ if (Available.empty()) + MinReadyCycle = std::numeric_limits<unsigned>::max(); + + // Check to see if any of the pending instructions are ready to issue. If + // so, add them to the available queue. + for (unsigned i = 0, e = Pending.size(); i != e; ++i) { + SUnit *SU = *(Pending.begin() + i); + unsigned ReadyCycle = isTop() ? SU->TopReadyCycle : SU->BotReadyCycle; + + if (ReadyCycle < MinReadyCycle) + MinReadyCycle = ReadyCycle; + + if (ReadyCycle > CurrCycle) + continue; + + if (checkHazard(SU)) + continue; + + Available.push(SU); + Pending.remove(Pending.begin() + i); + --i; + --e; + } + CheckPending = false; +} + +/// Remove SU from the ready set for this boundary. +void ConvergingVLIWScheduler::VLIWSchedBoundary::removeReady(SUnit *SU) { + if (Available.isInQueue(SU)) + Available.remove(Available.find(SU)); + else { + assert(Pending.isInQueue(SU) && "bad ready count"); + Pending.remove(Pending.find(SU)); + } +} + +/// If this queue only has one ready candidate, return it. As a side effect, +/// advance the cycle until at least one node is ready. If multiple instructions +/// are ready, return NULL. 
+SUnit *ConvergingVLIWScheduler::VLIWSchedBoundary::pickOnlyChoice() { + if (CheckPending) + releasePending(); + + auto AdvanceCycle = [this]() { + if (Available.empty()) + return true; + if (Available.size() == 1 && Pending.size() > 0) + return !ResourceModel->isResourceAvailable(*Available.begin(), isTop()) || + getWeakLeft(*Available.begin(), isTop()) != 0; + return false; + }; + for (unsigned i = 0; AdvanceCycle(); ++i) { + assert(i <= (HazardRec->getMaxLookAhead() + MaxMinLatency) && + "permanent hazard"); + (void)i; + ResourceModel->reserveResources(nullptr, isTop()); + bumpCycle(); + releasePending(); + } + if (Available.size() == 1) + return *Available.begin(); + return nullptr; +} + +#ifndef NDEBUG +void ConvergingVLIWScheduler::traceCandidate(const char *Label, + const ReadyQueue &Q, SUnit *SU, + int Cost, PressureChange P) { + dbgs() << Label << " " << Q.getName() << " "; + if (P.isValid()) + dbgs() << DAG->TRI->getRegPressureSetName(P.getPSet()) << ":" + << P.getUnitInc() << " "; + else + dbgs() << " "; + dbgs() << "cost(" << Cost << ")\t"; + DAG->dumpNode(*SU); +} + +// Very detailed queue dump, to be used with higher verbosity levels. 
+void ConvergingVLIWScheduler::readyQueueVerboseDump( + const RegPressureTracker &RPTracker, SchedCandidate &Candidate, + ReadyQueue &Q) { + RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker); + + dbgs() << ">>> " << Q.getName() << "\n"; + for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) { + RegPressureDelta RPDelta; + TempTracker.getMaxPressureDelta((*I)->getInstr(), RPDelta, + DAG->getRegionCriticalPSets(), + DAG->getRegPressure().MaxSetPressure); + std::stringstream dbgstr; + dbgstr << "SU(" << std::setw(3) << (*I)->NodeNum << ")"; + dbgs() << dbgstr.str(); + SchedulingCost(Q, *I, Candidate, RPDelta, true); + dbgs() << "\t"; + (*I)->getInstr()->dump(); + } + dbgs() << "\n"; +} +#endif + +/// isSingleUnscheduledPred - If SU2 is the only unscheduled predecessor +/// of SU, return true (we may have duplicates) +static inline bool isSingleUnscheduledPred(SUnit *SU, SUnit *SU2) { + if (SU->NumPredsLeft == 0) + return false; + + for (auto &Pred : SU->Preds) { + // We found an available, but not scheduled, predecessor. + if (!Pred.getSUnit()->isScheduled && (Pred.getSUnit() != SU2)) + return false; + } + + return true; +} + +/// isSingleUnscheduledSucc - If SU2 is the only unscheduled successor +/// of SU, return true (we may have duplicates) +static inline bool isSingleUnscheduledSucc(SUnit *SU, SUnit *SU2) { + if (SU->NumSuccsLeft == 0) + return false; + + for (auto &Succ : SU->Succs) { + // We found an available, but not scheduled, successor. + if (!Succ.getSUnit()->isScheduled && (Succ.getSUnit() != SU2)) + return false; + } + return true; +} + +/// Check if the instruction changes the register pressure of a register in the +/// high pressure set. The function returns a negative value if the pressure +/// decreases and a positive value is the pressure increases. If the instruction +/// doesn't use a high pressure register or doesn't change the register +/// pressure, then return 0. 
+int ConvergingVLIWScheduler::pressureChange(const SUnit *SU, bool isBotUp) { + PressureDiff &PD = DAG->getPressureDiff(SU); + for (auto &P : PD) { + if (!P.isValid()) + continue; + // The pressure differences are computed bottom-up, so the comparison for + // an increase is positive in the bottom direction, but negative in the + // top-down direction. + if (HighPressureSets[P.getPSet()]) + return (isBotUp ? P.getUnitInc() : -P.getUnitInc()); + } + return 0; +} + +/// Single point to compute overall scheduling cost. +/// TODO: More heuristics will be used soon. +int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU, + SchedCandidate &Candidate, + RegPressureDelta &Delta, + bool verbose) { + // Initial trivial priority. + int ResCount = 1; + + // Do not waste time on a node that is already scheduled. + if (!SU || SU->isScheduled) + return ResCount; + + LLVM_DEBUG(if (verbose) dbgs() + << ((Q.getID() == TopQID) ? "(top|" : "(bot|")); + // Forced priority is high. + if (SU->isScheduleHigh) { + ResCount += PriorityOne; + LLVM_DEBUG(dbgs() << "H|"); + } + + unsigned IsAvailableAmt = 0; + // Critical path first. + if (Q.getID() == TopQID) { + if (Top.isLatencyBound(SU)) { + LLVM_DEBUG(if (verbose) dbgs() << "LB|"); + ResCount += (SU->getHeight() * ScaleTwo); + } + + LLVM_DEBUG(if (verbose) { + std::stringstream dbgstr; + dbgstr << "h" << std::setw(3) << SU->getHeight() << "|"; + dbgs() << dbgstr.str(); + }); + + // If resources are available for it, multiply the + // chance of scheduling. 
+ if (Top.ResourceModel->isResourceAvailable(SU, true)) { + IsAvailableAmt = (PriorityTwo + PriorityThree); + ResCount += IsAvailableAmt; + LLVM_DEBUG(if (verbose) dbgs() << "A|"); + } else + LLVM_DEBUG(if (verbose) dbgs() << " |"); + } else { + if (Bot.isLatencyBound(SU)) { + LLVM_DEBUG(if (verbose) dbgs() << "LB|"); + ResCount += (SU->getDepth() * ScaleTwo); + } + + LLVM_DEBUG(if (verbose) { + std::stringstream dbgstr; + dbgstr << "d" << std::setw(3) << SU->getDepth() << "|"; + dbgs() << dbgstr.str(); + }); + + // If resources are available for it, multiply the + // chance of scheduling. + if (Bot.ResourceModel->isResourceAvailable(SU, false)) { + IsAvailableAmt = (PriorityTwo + PriorityThree); + ResCount += IsAvailableAmt; + LLVM_DEBUG(if (verbose) dbgs() << "A|"); + } else + LLVM_DEBUG(if (verbose) dbgs() << " |"); + } + + unsigned NumNodesBlocking = 0; + if (Q.getID() == TopQID) { + // How many SUs does it block from scheduling? + // Look at all of the successors of this node. + // Count the number of nodes that + // this node is the sole unscheduled node for. + if (Top.isLatencyBound(SU)) + for (const SDep &SI : SU->Succs) + if (isSingleUnscheduledPred(SI.getSUnit(), SU)) + ++NumNodesBlocking; + } else { + // How many unscheduled predecessors block this node? + if (Bot.isLatencyBound(SU)) + for (const SDep &PI : SU->Preds) + if (isSingleUnscheduledSucc(PI.getSUnit(), SU)) + ++NumNodesBlocking; + } + ResCount += (NumNodesBlocking * ScaleTwo); + + LLVM_DEBUG(if (verbose) { + std::stringstream dbgstr; + dbgstr << "blk " << std::setw(2) << NumNodesBlocking << ")|"; + dbgs() << dbgstr.str(); + }); + + // Factor in reg pressure as a heuristic. + if (!IgnoreBBRegPressure) { + // Decrease priority by the amount that register pressure exceeds the limit. + ResCount -= (Delta.Excess.getUnitInc() * PriorityOne); + // Decrease priority if register pressure exceeds the limit. 
+ ResCount -= (Delta.CriticalMax.getUnitInc() * PriorityOne); + // Decrease priority slightly if register pressure would increase over the + // current maximum. + ResCount -= (Delta.CurrentMax.getUnitInc() * PriorityTwo); + // If there are register pressure issues, then we remove the value added for + // the instruction being available. The rationale is that we really don't + // want to schedule an instruction that causes a spill. + if (IsAvailableAmt && pressureChange(SU, Q.getID() != TopQID) > 0 && + (Delta.Excess.getUnitInc() || Delta.CriticalMax.getUnitInc() || + Delta.CurrentMax.getUnitInc())) + ResCount -= IsAvailableAmt; + LLVM_DEBUG(if (verbose) { + dbgs() << "RP " << Delta.Excess.getUnitInc() << "/" + << Delta.CriticalMax.getUnitInc() << "/" + << Delta.CurrentMax.getUnitInc() << ")|"; + }); + } + + // Give preference to a zero latency instruction if the dependent + // instruction is in the current packet. + if (Q.getID() == TopQID && getWeakLeft(SU, true) == 0) { + for (const SDep &PI : SU->Preds) { + if (!PI.getSUnit()->getInstr()->isPseudo() && PI.isAssignedRegDep() && + PI.getLatency() == 0 && + Top.ResourceModel->isInPacket(PI.getSUnit())) { + ResCount += PriorityThree; + LLVM_DEBUG(if (verbose) dbgs() << "Z|"); + } + } + } else if (Q.getID() == BotQID && getWeakLeft(SU, false) == 0) { + for (const SDep &SI : SU->Succs) { + if (!SI.getSUnit()->getInstr()->isPseudo() && SI.isAssignedRegDep() && + SI.getLatency() == 0 && + Bot.ResourceModel->isInPacket(SI.getSUnit())) { + ResCount += PriorityThree; + LLVM_DEBUG(if (verbose) dbgs() << "Z|"); + } + } + } + + // If the instruction has a non-zero latency dependence with an instruction in + // the current packet, then it should not be scheduled yet. The case occurs + // when the dependent instruction is scheduled in a new packet, so the + // scheduler updates the current cycle and pending instructions become + // available. 
+ if (CheckEarlyAvail) { + if (Q.getID() == TopQID) { + for (const auto &PI : SU->Preds) { + if (PI.getLatency() > 0 && + Top.ResourceModel->isInPacket(PI.getSUnit())) { + ResCount -= PriorityOne; + LLVM_DEBUG(if (verbose) dbgs() << "D|"); + } + } + } else { + for (const auto &SI : SU->Succs) { + if (SI.getLatency() > 0 && + Bot.ResourceModel->isInPacket(SI.getSUnit())) { + ResCount -= PriorityOne; + LLVM_DEBUG(if (verbose) dbgs() << "D|"); + } + } + } + } + + LLVM_DEBUG(if (verbose) { + std::stringstream dbgstr; + dbgstr << "Total " << std::setw(4) << ResCount << ")"; + dbgs() << dbgstr.str(); + }); + + return ResCount; +} + +/// Pick the best candidate from the top queue. +/// +/// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during +/// DAG building. To adjust for the current scheduling location we need to +/// maintain the number of vreg uses remaining to be top-scheduled. +ConvergingVLIWScheduler::CandResult +ConvergingVLIWScheduler::pickNodeFromQueue(VLIWSchedBoundary &Zone, + const RegPressureTracker &RPTracker, + SchedCandidate &Candidate) { + ReadyQueue &Q = Zone.Available; + LLVM_DEBUG(if (SchedDebugVerboseLevel > 1) + readyQueueVerboseDump(RPTracker, Candidate, Q); + else Q.dump();); + + // getMaxPressureDelta temporarily modifies the tracker. + RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker); + + // BestSU remains NULL if no top candidates beat the best existing candidate. + CandResult FoundCandidate = NoCand; + for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) { + RegPressureDelta RPDelta; + TempTracker.getMaxPressureDelta((*I)->getInstr(), RPDelta, + DAG->getRegionCriticalPSets(), + DAG->getRegPressure().MaxSetPressure); + + int CurrentCost = SchedulingCost(Q, *I, Candidate, RPDelta, false); + + // Initialize the candidate if needed. 
+ if (!Candidate.SU) { + LLVM_DEBUG(traceCandidate("DCAND", Q, *I, CurrentCost)); + Candidate.SU = *I; + Candidate.RPDelta = RPDelta; + Candidate.SCost = CurrentCost; + FoundCandidate = NodeOrder; + continue; + } + + // Choose node order for negative cost candidates. There is no good + // candidate in this case. + if (CurrentCost < 0 && Candidate.SCost < 0) { + if ((Q.getID() == TopQID && (*I)->NodeNum < Candidate.SU->NodeNum) || + (Q.getID() == BotQID && (*I)->NodeNum > Candidate.SU->NodeNum)) { + LLVM_DEBUG(traceCandidate("NCAND", Q, *I, CurrentCost)); + Candidate.SU = *I; + Candidate.RPDelta = RPDelta; + Candidate.SCost = CurrentCost; + FoundCandidate = NodeOrder; + } + continue; + } + + // Best cost. + if (CurrentCost > Candidate.SCost) { + LLVM_DEBUG(traceCandidate("CCAND", Q, *I, CurrentCost)); + Candidate.SU = *I; + Candidate.RPDelta = RPDelta; + Candidate.SCost = CurrentCost; + FoundCandidate = BestCost; + continue; + } + + // Choose an instruction that does not depend on an artificial edge. + unsigned CurrWeak = getWeakLeft(*I, (Q.getID() == TopQID)); + unsigned CandWeak = getWeakLeft(Candidate.SU, (Q.getID() == TopQID)); + if (CurrWeak != CandWeak) { + if (CurrWeak < CandWeak) { + LLVM_DEBUG(traceCandidate("WCAND", Q, *I, CurrentCost)); + Candidate.SU = *I; + Candidate.RPDelta = RPDelta; + Candidate.SCost = CurrentCost; + FoundCandidate = Weak; + } + continue; + } + + if (CurrentCost == Candidate.SCost && Zone.isLatencyBound(*I)) { + unsigned CurrSize, CandSize; + if (Q.getID() == TopQID) { + CurrSize = (*I)->Succs.size(); + CandSize = Candidate.SU->Succs.size(); + } else { + CurrSize = (*I)->Preds.size(); + CandSize = Candidate.SU->Preds.size(); + } + if (CurrSize > CandSize) { + LLVM_DEBUG(traceCandidate("SPCAND", Q, *I, CurrentCost)); + Candidate.SU = *I; + Candidate.RPDelta = RPDelta; + Candidate.SCost = CurrentCost; + FoundCandidate = BestCost; + } + // Keep the old candidate if it's a better candidate. 
That is, don't use + // the subsequent tie breaker. + if (CurrSize != CandSize) + continue; + } + + // Tie breaker. + // To avoid scheduling indeterminism, we need a tie breaker + // for the case when cost is identical for two nodes. + if (UseNewerCandidate && CurrentCost == Candidate.SCost) { + if ((Q.getID() == TopQID && (*I)->NodeNum < Candidate.SU->NodeNum) || + (Q.getID() == BotQID && (*I)->NodeNum > Candidate.SU->NodeNum)) { + LLVM_DEBUG(traceCandidate("TCAND", Q, *I, CurrentCost)); + Candidate.SU = *I; + Candidate.RPDelta = RPDelta; + Candidate.SCost = CurrentCost; + FoundCandidate = NodeOrder; + continue; + } + } + + // Fall through to original instruction order. + // Only consider node order if Candidate was chosen from this Q. + if (FoundCandidate == NoCand) + continue; + } + return FoundCandidate; +} + +/// Pick the best candidate node from either the top or bottom queue. +SUnit *ConvergingVLIWScheduler::pickNodeBidrectional(bool &IsTopNode) { + // Schedule as far as possible in the direction of no choice. This is most + // efficient, but also provides the best heuristics for CriticalPSets. + if (SUnit *SU = Bot.pickOnlyChoice()) { + LLVM_DEBUG(dbgs() << "Picked only Bottom\n"); + IsTopNode = false; + return SU; + } + if (SUnit *SU = Top.pickOnlyChoice()) { + LLVM_DEBUG(dbgs() << "Picked only Top\n"); + IsTopNode = true; + return SU; + } + SchedCandidate BotCand; + // Prefer bottom scheduling when heuristics are silent. + CandResult BotResult = + pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand); + assert(BotResult != NoCand && "failed to find the first candidate"); + + // If either Q has a single candidate that provides the least increase in + // Excess pressure, we can immediately schedule from that Q. + // + // RegionCriticalPSets summarizes the pressure within the scheduled region and + // affects picking from either Q. 
If scheduling in one direction must + // increase pressure for one of the excess PSets, then schedule in that + // direction first to provide more freedom in the other direction. + if (BotResult == SingleExcess || BotResult == SingleCritical) { + LLVM_DEBUG(dbgs() << "Prefered Bottom Node\n"); + IsTopNode = false; + return BotCand.SU; + } + // Check if the top Q has a better candidate. + SchedCandidate TopCand; + CandResult TopResult = + pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand); + assert(TopResult != NoCand && "failed to find the first candidate"); + + if (TopResult == SingleExcess || TopResult == SingleCritical) { + LLVM_DEBUG(dbgs() << "Prefered Top Node\n"); + IsTopNode = true; + return TopCand.SU; + } + // If either Q has a single candidate that minimizes pressure above the + // original region's pressure pick it. + if (BotResult == SingleMax) { + LLVM_DEBUG(dbgs() << "Prefered Bottom Node SingleMax\n"); + IsTopNode = false; + return BotCand.SU; + } + if (TopResult == SingleMax) { + LLVM_DEBUG(dbgs() << "Prefered Top Node SingleMax\n"); + IsTopNode = true; + return TopCand.SU; + } + if (TopCand.SCost > BotCand.SCost) { + LLVM_DEBUG(dbgs() << "Prefered Top Node Cost\n"); + IsTopNode = true; + return TopCand.SU; + } + // Otherwise prefer the bottom candidate in node order. + LLVM_DEBUG(dbgs() << "Prefered Bottom in Node order\n"); + IsTopNode = false; + return BotCand.SU; +} + +/// Pick the best node to balance the schedule. Implements MachineSchedStrategy. 
+SUnit *ConvergingVLIWScheduler::pickNode(bool &IsTopNode) { + if (DAG->top() == DAG->bottom()) { + assert(Top.Available.empty() && Top.Pending.empty() && + Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage"); + return nullptr; + } + SUnit *SU; + if (ForceTopDown) { + SU = Top.pickOnlyChoice(); + if (!SU) { + SchedCandidate TopCand; + CandResult TopResult = + pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand); + assert(TopResult != NoCand && "failed to find the first candidate"); + (void)TopResult; + SU = TopCand.SU; + } + IsTopNode = true; + } else if (ForceBottomUp) { + SU = Bot.pickOnlyChoice(); + if (!SU) { + SchedCandidate BotCand; + CandResult BotResult = + pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand); + assert(BotResult != NoCand && "failed to find the first candidate"); + (void)BotResult; + SU = BotCand.SU; + } + IsTopNode = false; + } else { + SU = pickNodeBidrectional(IsTopNode); + } + if (SU->isTopReady()) + Top.removeReady(SU); + if (SU->isBottomReady()) + Bot.removeReady(SU); + + LLVM_DEBUG(dbgs() << "*** " << (IsTopNode ? "Top" : "Bottom") + << " Scheduling instruction in cycle " + << (IsTopNode ? Top.CurrCycle : Bot.CurrCycle) << " (" + << reportPackets() << ")\n"; + DAG->dumpNode(*SU)); + return SU; +} + +/// Update the scheduler's state after scheduling a node. This is the same node +/// that was just returned by pickNode(). However, VLIWMachineScheduler needs +/// to update it's state based on the current cycle before MachineSchedStrategy +/// does. 
+void ConvergingVLIWScheduler::schedNode(SUnit *SU, bool IsTopNode) { + if (IsTopNode) { + Top.bumpNode(SU); + SU->TopReadyCycle = Top.CurrCycle; + } else { + Bot.bumpNode(SU); + SU->BotReadyCycle = Bot.CurrCycle; + } +} diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp index 4876b9e23717..0c42bef82005 100644 --- a/llvm/lib/CodeGen/ValueTypes.cpp +++ b/llvm/lib/CodeGen/ValueTypes.cpp @@ -201,9 +201,11 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const { case MVT::x86amx: return Type::getX86_AMXTy(Context); case MVT::i64x8: return IntegerType::get(Context, 512); case MVT::externref: + // pointer to opaque struct in addrspace(10) return PointerType::get(StructType::create(Context), 10); case MVT::funcref: - return PointerType::get(StructType::create(Context), 20); + // pointer to i8 addrspace(20) + return PointerType::get(Type::getInt8Ty(Context), 20); case MVT::v1i1: return FixedVectorType::get(Type::getInt1Ty(Context), 1); case MVT::v2i1: diff --git a/llvm/lib/CodeGen/WinEHPrepare.cpp b/llvm/lib/CodeGen/WinEHPrepare.cpp index 4564aa1c1278..d31183e46d65 100644 --- a/llvm/lib/CodeGen/WinEHPrepare.cpp +++ b/llvm/lib/CodeGen/WinEHPrepare.cpp @@ -573,9 +573,7 @@ void llvm::calculateClrEHStateNumbers(const Function *Fn, const auto *CatchSwitch = cast<CatchSwitchInst>(Pad); int CatchState = -1, FollowerState = -1; SmallVector<const BasicBlock *, 4> CatchBlocks(CatchSwitch->handlers()); - for (auto CBI = CatchBlocks.rbegin(), CBE = CatchBlocks.rend(); - CBI != CBE; ++CBI, FollowerState = CatchState) { - const BasicBlock *CatchBlock = *CBI; + for (const BasicBlock *CatchBlock : llvm::reverse(CatchBlocks)) { // Create the entry for this catch with the appropriate handler // properties. const auto *Catch = cast<CatchPadInst>(CatchBlock->getFirstNonPHI()); @@ -591,6 +589,7 @@ void llvm::calculateClrEHStateNumbers(const Function *Fn, Worklist.emplace_back(I, CatchState); // Remember this catch's state. 
FuncInfo.EHPadStateMap[Catch] = CatchState; + FollowerState = CatchState; } // Associate the catchswitch with the state of its first catch. assert(CatchSwitch->getNumHandlers()); @@ -601,11 +600,9 @@ void llvm::calculateClrEHStateNumbers(const Function *Fn, // Step two: record the TryParentState of each state. For cleanuppads that // don't have cleanuprets, we may need to infer this from their child pads, // so visit pads in descendant-most to ancestor-most order. - for (auto Entry = FuncInfo.ClrEHUnwindMap.rbegin(), - End = FuncInfo.ClrEHUnwindMap.rend(); - Entry != End; ++Entry) { + for (ClrEHUnwindMapEntry &Entry : llvm::reverse(FuncInfo.ClrEHUnwindMap)) { const Instruction *Pad = - Entry->Handler.get<const BasicBlock *>()->getFirstNonPHI(); + Entry.Handler.get<const BasicBlock *>()->getFirstNonPHI(); // For most pads, the TryParentState is the state associated with the // unwind dest of exceptional exits from it. const BasicBlock *UnwindDest; @@ -615,7 +612,7 @@ void llvm::calculateClrEHStateNumbers(const Function *Fn, // that's not the unwind dest of exceptions escaping the catch. Those // cases were already assigned a TryParentState in the first pass, so // skip them. - if (Entry->TryParentState != -1) + if (Entry.TryParentState != -1) continue; // Otherwise, get the unwind dest from the catchswitch. UnwindDest = Catch->getCatchSwitch()->getUnwindDest(); @@ -692,7 +689,7 @@ void llvm::calculateClrEHStateNumbers(const Function *Fn, UnwindDestState = FuncInfo.EHPadStateMap[UnwindDest->getFirstNonPHI()]; } - Entry->TryParentState = UnwindDestState; + Entry.TryParentState = UnwindDestState; } // Step three: transfer information from pads to invokes. 
diff --git a/llvm/lib/CodeGen/XRayInstrumentation.cpp b/llvm/lib/CodeGen/XRayInstrumentation.cpp index 11d1b309aa64..b66429d8a5bf 100644 --- a/llvm/lib/CodeGen/XRayInstrumentation.cpp +++ b/llvm/lib/CodeGen/XRayInstrumentation.cpp @@ -226,6 +226,7 @@ bool XRayInstrumentation::runOnMachineFunction(MachineFunction &MF) { case Triple::ArchType::arm: case Triple::ArchType::thumb: case Triple::ArchType::aarch64: + case Triple::ArchType::hexagon: case Triple::ArchType::mips: case Triple::ArchType::mipsel: case Triple::ArchType::mips64: diff --git a/llvm/lib/DWARFLinker/DWARFLinker.cpp b/llvm/lib/DWARFLinker/DWARFLinker.cpp index a3dec6c25e44..ae0859e1ecfd 100644 --- a/llvm/lib/DWARFLinker/DWARFLinker.cpp +++ b/llvm/lib/DWARFLinker/DWARFLinker.cpp @@ -223,22 +223,21 @@ static void analyzeImportedModule( SysRoot = CU.getSysRoot(); if (!SysRoot.empty() && Path.startswith(SysRoot)) return; - if (Optional<DWARFFormValue> Val = DIE.find(dwarf::DW_AT_name)) - if (Optional<const char *> Name = Val->getAsCString()) { - auto &Entry = (*ParseableSwiftInterfaces)[*Name]; - // The prepend path is applied later when copying. - DWARFDie CUDie = CU.getOrigUnit().getUnitDIE(); - SmallString<128> ResolvedPath; - if (sys::path::is_relative(Path)) - resolveRelativeObjectPath(ResolvedPath, CUDie); - sys::path::append(ResolvedPath, Path); - if (!Entry.empty() && Entry != ResolvedPath) - ReportWarning( - Twine("Conflicting parseable interfaces for Swift Module ") + - *Name + ": " + Entry + " and " + Path, - DIE); - Entry = std::string(ResolvedPath.str()); - } + Optional<const char*> Name = dwarf::toString(DIE.find(dwarf::DW_AT_name)); + if (!Name) + return; + auto &Entry = (*ParseableSwiftInterfaces)[*Name]; + // The prepend path is applied later when copying. 
+ DWARFDie CUDie = CU.getOrigUnit().getUnitDIE(); + SmallString<128> ResolvedPath; + if (sys::path::is_relative(Path)) + resolveRelativeObjectPath(ResolvedPath, CUDie); + sys::path::append(ResolvedPath, Path); + if (!Entry.empty() && Entry != ResolvedPath) + ReportWarning(Twine("Conflicting parseable interfaces for Swift Module ") + + *Name + ": " + Entry + " and " + Path, + DIE); + Entry = std::string(ResolvedPath.str()); } /// The distinct types of work performed by the work loop in @@ -409,10 +408,10 @@ static bool dieNeedsChildrenToBeMeaningful(uint32_t Tag) { void DWARFLinker::cleanupAuxiliarryData(LinkContext &Context) { Context.clear(); - for (auto I = DIEBlocks.begin(), E = DIEBlocks.end(); I != E; ++I) - (*I)->~DIEBlock(); - for (auto I = DIELocs.begin(), E = DIELocs.end(); I != E; ++I) - (*I)->~DIELoc(); + for (DIEBlock *I : DIEBlocks) + I->~DIEBlock(); + for (DIELoc *I : DIELocs) + I->~DIELoc(); DIEBlocks.clear(); DIELocs.clear(); @@ -846,7 +845,7 @@ void DWARFLinker::assignAbbrev(DIEAbbrev &Abbrev) { unsigned DWARFLinker::DIECloner::cloneStringAttribute( DIE &Die, AttributeSpec AttrSpec, const DWARFFormValue &Val, const DWARFUnit &U, OffsetsStringPool &StringPool, AttributesInfo &Info) { - Optional<const char *> String = Val.getAsCString(); + Optional<const char *> String = dwarf::toString(Val); if (!String) return 0; @@ -1423,6 +1422,11 @@ DIE *DWARFLinker::DIECloner::cloneDIE(const DWARFDie &InputDIE, Flags |= TF_InFunctionScope; if (!Info.InDebugMap && LLVM_LIKELY(!Update)) Flags |= TF_SkipPC; + } else if (Abbrev->getTag() == dwarf::DW_TAG_variable) { + // Function-local globals could be in the debug map even when the function + // is not, e.g., inlined functions. 
+ if ((Flags & TF_InFunctionScope) && Info.InDebugMap) + Flags &= ~TF_SkipPC; } for (const auto &AttrSpec : Abbrev->attributes()) { diff --git a/llvm/lib/DWARFLinker/DWARFStreamer.cpp b/llvm/lib/DWARFLinker/DWARFStreamer.cpp index 46e7457f2368..1ab6ead3b5f6 100644 --- a/llvm/lib/DWARFLinker/DWARFStreamer.cpp +++ b/llvm/lib/DWARFLinker/DWARFStreamer.cpp @@ -531,9 +531,7 @@ void DwarfStreamer::emitLineTableForUnit(MCDwarfLineTableParams Params, unsigned RowsSinceLastSequence = 0; - for (unsigned Idx = 0; Idx < Rows.size(); ++Idx) { - auto &Row = Rows[Idx]; - + for (DWARFDebugLine::Row &Row : Rows) { int64_t AddressDelta; if (Address == -1ULL) { MS->emitIntValue(dwarf::DW_LNS_extended_op, 1); diff --git a/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp b/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp index c8331487f282..95135c95e8d2 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp @@ -1195,7 +1195,7 @@ void DWARFContext::addLocalsForDie(DWARFCompileUnit *CU, DWARFDie Subprogram, Die.getAttributeValueAsReferencedDie(DW_AT_abstract_origin)) Die = Origin; if (auto NameAttr = Die.find(DW_AT_name)) - if (Optional<const char *> Name = NameAttr->getAsCString()) + if (Optional<const char *> Name = dwarf::toString(*NameAttr)) Local.Name = *Name; if (auto Type = Die.getAttributeValueAsReferencedDie(DW_AT_type)) Local.Size = getTypeSize(Type, getCUAddrSize()); diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp index bda41b1f34e9..f36d3f87257a 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp @@ -1331,8 +1331,8 @@ Optional<StringRef> DWARFDebugLine::LineTable::getSourceByIndex(uint64_t FileInd if (Kind == FileLineInfoKind::None || !Prologue.hasFileAtIndex(FileIndex)) return None; const FileNameEntry &Entry = Prologue.getFileNameEntry(FileIndex); - if (Optional<const char *> source = Entry.Source.getAsCString()) - return 
StringRef(*source); + if (auto E = dwarf::toString(Entry.Source)) + return StringRef(*E); return None; } @@ -1350,10 +1350,10 @@ bool DWARFDebugLine::Prologue::getFileNameByIndex( if (Kind == FileLineInfoKind::None || !hasFileAtIndex(FileIndex)) return false; const FileNameEntry &Entry = getFileNameEntry(FileIndex); - Optional<const char *> Name = Entry.Name.getAsCString(); - if (!Name) + auto E = dwarf::toString(Entry.Name); + if (!E) return false; - StringRef FileName = *Name; + StringRef FileName = *E; if (Kind == FileLineInfoKind::RawValue || isPathAbsoluteOnWindowsOrPosix(FileName)) { Result = std::string(FileName); @@ -1372,11 +1372,10 @@ bool DWARFDebugLine::Prologue::getFileNameByIndex( // relative names. if ((Entry.DirIdx != 0 || Kind != FileLineInfoKind::RelativeFilePath) && Entry.DirIdx < IncludeDirectories.size()) - IncludeDir = IncludeDirectories[Entry.DirIdx].getAsCString().getValue(); + IncludeDir = dwarf::toStringRef(IncludeDirectories[Entry.DirIdx]); } else { if (0 < Entry.DirIdx && Entry.DirIdx <= IncludeDirectories.size()) - IncludeDir = - IncludeDirectories[Entry.DirIdx - 1].getAsCString().getValue(); + IncludeDir = dwarf::toStringRef(IncludeDirectories[Entry.DirIdx - 1]); } // For absolute paths only, include the compilation directory of compile unit. 
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp index cdffb36741c8..f39c7871d603 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp @@ -41,9 +41,7 @@ public: } // namespace static Error createResolverError(uint32_t Index, unsigned Kind) { - return createStringError(errc::invalid_argument, - "Unable to resolve indirect address %u for: %s", - Index, dwarf::LocListEncodingString(Kind).data()); + return make_error<ResolverError>(Index, (dwarf::LoclistEntries)Kind); } Expected<Optional<DWARFLocationExpression>> @@ -404,3 +402,10 @@ void DWARFDebugLoclists::dumpRange(uint64_t StartOffset, uint64_t Size, OS << '\n'; } } + +void llvm::ResolverError::log(raw_ostream &OS) const { + OS << format("unable to resolve indirect address %u for: %s", Index, + dwarf::LocListEncodingString(Kind).data()); +} + +char llvm::ResolverError::ID; diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp index 80ffd81b3403..7a81d7ff064b 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp @@ -194,13 +194,11 @@ Error DWARFDebugMacro::parseImpl( if (MacroContributionOffset == MacroToUnits.end()) return createStringError(errc::invalid_argument, "Macro contribution of the unit not found"); - Optional<uint64_t> StrOffset = + Expected<uint64_t> StrOffset = MacroContributionOffset->second->getStringOffsetSectionItem( Data.getULEB128(&Offset)); if (!StrOffset) - return createStringError( - errc::invalid_argument, - "String offsets contribution of the unit not found"); + return StrOffset.takeError(); E.MacroStr = MacroContributionOffset->second->getStringExtractor().getCStr( &*StrOffset); diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp index ed50f2635738..5421b2d59a1b 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp +++ 
b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -89,7 +89,6 @@ static void dumpLocationList(raw_ostream &OS, const DWARFFormValue &FormValue, U->getLocationTable().dumpLocationList(&Offset, OS, U->getBaseAddress(), MRI, Ctx.getDWARFObj(), U, DumpOpts, Indent); - return; } static void dumpLocationExpr(raw_ostream &OS, const DWARFFormValue &FormValue, @@ -105,7 +104,6 @@ static void dumpLocationExpr(raw_ostream &OS, const DWARFFormValue &FormValue, Ctx.isLittleEndian(), 0); DWARFExpression(Data, U->getAddressByteSize(), U->getFormParams().Format) .print(OS, DumpOpts, MRI, U); - return; } static DWARFDie resolveReferencedType(DWARFDie D, @@ -672,6 +670,8 @@ struct DWARFTypePrinter { return; if (D.getTag() == DW_TAG_subprogram) return; + if (D.getTag() == DW_TAG_lexical_block) + return; D = D.resolveTypeUnitReference(); if (DWARFDie P = D.getParent()) appendScopes(P); diff --git a/llvm/lib/DebugInfo/DWARF/DWARFExpression.cpp b/llvm/lib/DebugInfo/DWARF/DWARFExpression.cpp index d0fbd702e831..e19f5b8138fa 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFExpression.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFExpression.cpp @@ -217,8 +217,8 @@ static void prettyPrintBaseTypeRef(DWARFUnit *U, raw_ostream &OS, if (DumpOpts.Verbose) OS << format("0x%08" PRIx64 " -> ", Operands[Operand]); OS << format("0x%08" PRIx64 ")", U->getOffset() + Operands[Operand]); - if (auto Name = Die.find(dwarf::DW_AT_name)) - OS << " \"" << Name->getAsCString() << "\""; + if (auto Name = dwarf::toString(Die.find(dwarf::DW_AT_name))) + OS << " \"" << *Name << "\""; } else { OS << format(" <invalid base_type ref: 0x%" PRIx64 ">", Operands[Operand]); diff --git a/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp b/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp index cea0f63bbf81..86991a3949dd 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp @@ -613,50 +613,53 @@ void DWARFFormValue::dump(raw_ostream &OS, DIDumpOptions DumpOpts) const { } void 
DWARFFormValue::dumpString(raw_ostream &OS) const { - Optional<const char *> DbgStr = getAsCString(); - if (DbgStr.hasValue()) { + if (auto DbgStr = dwarf::toString(*this)) { auto COS = WithColor(OS, HighlightColor::String); COS.get() << '"'; - COS.get().write_escaped(DbgStr.getValue()); + COS.get().write_escaped(*DbgStr); COS.get() << '"'; } } -Optional<const char *> DWARFFormValue::getAsCString() const { +Expected<const char *> DWARFFormValue::getAsCString() const { if (!isFormClass(FC_String)) - return None; + return make_error<StringError>("Invalid form for string attribute", + inconvertibleErrorCode()); if (Form == DW_FORM_string) return Value.cstr; // FIXME: Add support for DW_FORM_GNU_strp_alt if (Form == DW_FORM_GNU_strp_alt || C == nullptr) - return None; + return make_error<StringError>("Unsupported form for string attribute", + inconvertibleErrorCode()); uint64_t Offset = Value.uval; - if (Form == DW_FORM_line_strp) { - // .debug_line_str is tracked in the Context. - if (const char *Str = C->getLineStringExtractor().getCStr(&Offset)) - return Str; - return None; - } + Optional<uint32_t> Index; if (Form == DW_FORM_GNU_str_index || Form == DW_FORM_strx || Form == DW_FORM_strx1 || Form == DW_FORM_strx2 || Form == DW_FORM_strx3 || Form == DW_FORM_strx4) { if (!U) - return None; - Optional<uint64_t> StrOffset = U->getStringOffsetSectionItem(Offset); + return make_error<StringError>("API limitation - string extraction not " + "available without a DWARFUnit", + inconvertibleErrorCode()); + Expected<uint64_t> StrOffset = U->getStringOffsetSectionItem(Offset); + Index = Offset; if (!StrOffset) - return None; + return StrOffset.takeError(); Offset = *StrOffset; } // Prefer the Unit's string extractor, because for .dwo it will point to // .debug_str.dwo, while the Context's extractor always uses .debug_str. 
- if (U) { - if (const char *Str = U->getStringExtractor().getCStr(&Offset)) - return Str; - return None; - } - if (const char *Str = C->getStringExtractor().getCStr(&Offset)) + DataExtractor StrData = Form == DW_FORM_line_strp + ? C->getLineStringExtractor() + : U ? U->getStringExtractor() + : C->getStringExtractor(); + if (const char *Str = StrData.getCStr(&Offset)) return Str; - return None; + std::string Msg = FormEncodingString(Form).str(); + if (Index) + Msg += (" uses index " + Twine(*Index) + ", but the referenced string").str(); + Msg += (" offset " + Twine(Offset) + " is beyond .debug_str bounds").str(); + return make_error<StringError>(Msg, + inconvertibleErrorCode()); } Optional<uint64_t> DWARFFormValue::getAsAddress() const { diff --git a/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp b/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp index 82c34f537036..eed0a60ec75e 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp @@ -214,13 +214,17 @@ DWARFUnit::getAddrOffsetSectionItem(uint32_t Index) const { return {{Address, Section}}; } -Optional<uint64_t> DWARFUnit::getStringOffsetSectionItem(uint32_t Index) const { +Expected<uint64_t> DWARFUnit::getStringOffsetSectionItem(uint32_t Index) const { if (!StringOffsetsTableContribution) - return None; + return make_error<StringError>( + "DW_FORM_strx used without a valid string offsets table", + inconvertibleErrorCode()); unsigned ItemSize = getDwarfStringOffsetsByteSize(); uint64_t Offset = getStringOffsetsBase() + Index * ItemSize; if (StringOffsetSection.Data.size() < Offset + ItemSize) - return None; + return make_error<StringError>("DW_FORM_strx uses index " + Twine(Index) + + ", which is too large", + inconvertibleErrorCode()); DWARFDataExtractor DA(Context.getDWARFObj(), StringOffsetSection, isLittleEndian, 0); return DA.getRelocatedValue(ItemSize, &Offset); @@ -603,7 +607,7 @@ bool DWARFUnit::parseDWO() { DWO->setAddrOffsetSection(AddrOffsetSection, *AddrOffsetSectionBase); if 
(getVersion() == 4) { auto DWORangesBase = UnitDie.getRangesBaseAttribute(); - DWO->setRangesSection(RangeSection, DWORangesBase ? *DWORangesBase : 0); + DWO->setRangesSection(RangeSection, DWORangesBase.getValueOr(0)); } return true; diff --git a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp index 7673a721c4ea..6424c2f59844 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp @@ -390,6 +390,9 @@ bool DWARFVerifier::handleDebugInfo() { OS << "Verifying non-dwo Units...\n"; NumErrors += verifyUnits(DCtx.getNormalUnitsVector()); + + OS << "Verifying dwo Units...\n"; + NumErrors += verifyUnits(DCtx.getDWOUnitsVector()); return NumErrors == 0; } @@ -400,10 +403,13 @@ unsigned DWARFVerifier::verifyDieRanges(const DWARFDie &Die, if (!Die.isValid()) return NumErrors; + DWARFUnit *Unit = Die.getDwarfUnit(); + auto RangesOrError = Die.getAddressRanges(); if (!RangesOrError) { // FIXME: Report the error. - ++NumErrors; + if (!Unit->isDWOUnit()) + ++NumErrors; llvm::consumeError(RangesOrError.takeError()); return NumErrors; } @@ -496,15 +502,18 @@ unsigned DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die, }; const DWARFObject &DObj = DCtx.getDWARFObj(); + DWARFUnit *U = Die.getDwarfUnit(); const auto Attr = AttrValue.Attr; switch (Attr) { case DW_AT_ranges: // Make sure the offset in the DW_AT_ranges attribute is valid. if (auto SectionOffset = AttrValue.Value.getAsSectionOffset()) { - unsigned DwarfVersion = Die.getDwarfUnit()->getVersion(); + unsigned DwarfVersion = U->getVersion(); const DWARFSection &RangeSection = DwarfVersion < 5 ? 
DObj.getRangesSection() : DObj.getRnglistsSection(); + if (U->isDWOUnit() && RangeSection.Data.empty()) + break; if (*SectionOffset >= RangeSection.Data.size()) ReportError( "DW_AT_ranges offset is beyond " + @@ -517,7 +526,7 @@ unsigned DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die, case DW_AT_stmt_list: // Make sure the offset in the DW_AT_stmt_list attribute is valid. if (auto SectionOffset = AttrValue.Value.getAsSectionOffset()) { - if (*SectionOffset >= DObj.getLineSection().Data.size()) + if (*SectionOffset >= U->getLineSection().Data.size()) ReportError("DW_AT_stmt_list offset is beyond .debug_line bounds: " + llvm::formatv("{0:x8}", *SectionOffset)); break; @@ -525,9 +534,18 @@ unsigned DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die, ReportError("DIE has invalid DW_AT_stmt_list encoding:"); break; case DW_AT_location: { + // FIXME: It might be nice if there's a way to walk location expressions + // without trying to resolve the address ranges - it'd be a more efficient + // API (since the API is currently unnecessarily resolving addresses for + // this use case which only wants to validate the expressions themselves) & + // then the expressions could be validated even if the addresses can't be + // resolved. + // That sort of API would probably look like a callback "for each + // expression" with some way to lazily resolve the address ranges when + // needed (& then the existing API used here could be built on top of that - + // using the callback API to build the data structure and return it). 
if (Expected<std::vector<DWARFLocationExpression>> Loc = Die.getLocations(DW_AT_location)) { - DWARFUnit *U = Die.getDwarfUnit(); for (const auto &Entry : *Loc) { DataExtractor Data(toStringRef(Entry.Expr), DCtx.isLittleEndian(), 0); DWARFExpression Expression(Data, U->getAddressByteSize(), @@ -539,8 +557,12 @@ unsigned DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die, if (Error || !Expression.verify(U)) ReportError("DIE contains invalid DWARF expression:"); } - } else - ReportError(toString(Loc.takeError())); + } else if (Error Err = handleErrors( + Loc.takeError(), [&](std::unique_ptr<ResolverError> E) { + return U->isDWOUnit() ? Error::success() + : Error(std::move(E)); + })) + ReportError(toString(std::move(Err))); break; } case DW_AT_specification: @@ -576,7 +598,8 @@ unsigned DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die, case DW_AT_call_file: case DW_AT_decl_file: { if (auto FileIdx = AttrValue.Value.getAsUnsignedConstant()) { - DWARFUnit *U = Die.getDwarfUnit(); + if (U->isDWOUnit() && !U->isTypeUnit()) + break; const auto *LT = U->getContext().getLineTableForUnit(U); if (LT) { if (!LT->hasFileAtIndex(*FileIdx)) { @@ -616,7 +639,6 @@ unsigned DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die, DWARFAttribute &AttrValue, ReferenceMap &LocalReferences, ReferenceMap &CrossUnitReferences) { - const DWARFObject &DObj = DCtx.getDWARFObj(); auto DieCU = Die.getDwarfUnit(); unsigned NumErrors = 0; const auto Form = AttrValue.Value.getForm(); @@ -667,51 +689,15 @@ unsigned DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die, } break; } - case DW_FORM_strp: { - auto SecOffset = AttrValue.Value.getAsSectionOffset(); - assert(SecOffset); // DW_FORM_strp is a section offset. 
- if (SecOffset && *SecOffset >= DObj.getStrSection().size()) { - ++NumErrors; - error() << "DW_FORM_strp offset beyond .debug_str bounds:\n"; - dump(Die) << '\n'; - } - break; - } + case DW_FORM_strp: case DW_FORM_strx: case DW_FORM_strx1: case DW_FORM_strx2: case DW_FORM_strx3: case DW_FORM_strx4: { - auto Index = AttrValue.Value.getRawUValue(); - auto DieCU = Die.getDwarfUnit(); - // Check that we have a valid DWARF v5 string offsets table. - if (!DieCU->getStringOffsetsTableContribution()) { - ++NumErrors; - error() << FormEncodingString(Form) - << " used without a valid string offsets table:\n"; - dump(Die) << '\n'; - break; - } - // Check that the index is within the bounds of the section. - unsigned ItemSize = DieCU->getDwarfStringOffsetsByteSize(); - // Use a 64-bit type to calculate the offset to guard against overflow. - uint64_t Offset = - (uint64_t)DieCU->getStringOffsetsBase() + Index * ItemSize; - if (DObj.getStrOffsetsSection().Data.size() < Offset + ItemSize) { - ++NumErrors; - error() << FormEncodingString(Form) << " uses index " - << format("%" PRIu64, Index) << ", which is too large:\n"; - dump(Die) << '\n'; - break; - } - // Check that the string offset is valid. 
- uint64_t StringOffset = *DieCU->getStringOffsetSectionItem(Index); - if (StringOffset >= DObj.getStrSection().size()) { + if (Error E = AttrValue.Value.getAsCString().takeError()) { ++NumErrors; - error() << FormEncodingString(Form) << " uses index " - << format("%" PRIu64, Index) - << ", but the referenced string" - " offset is beyond .debug_str bounds:\n"; + error() << toString(std::move(E)) << ":\n"; dump(Die) << '\n'; } break; diff --git a/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp b/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp index b2c43b893cd3..6eef6f84ab40 100644 --- a/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp +++ b/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp @@ -531,7 +531,7 @@ llvm::Error DwarfTransformer::verify(StringRef GsymPath) { << LR->Locations.size() << "\n"; Log << " " << NumDwarfInlineInfos << " DWARF frames:\n"; for (size_t Idx = 0; Idx < NumDwarfInlineInfos; ++Idx) { - const auto dii = DwarfInlineInfos.getFrame(Idx); + const auto &dii = DwarfInlineInfos.getFrame(Idx); Log << " [" << Idx << "]: " << dii.FunctionName << " @ " << dii.FileName << ':' << dii.Line << '\n'; } @@ -551,7 +551,7 @@ llvm::Error DwarfTransformer::verify(StringRef GsymPath) { ++Idx) { const auto &gii = LR->Locations[Idx]; if (Idx < NumDwarfInlineInfos) { - const auto dii = DwarfInlineInfos.getFrame(Idx); + const auto &dii = DwarfInlineInfos.getFrame(Idx); gsymFilename = LR->getSourceFile(Idx); // Verify function name if (dii.FunctionName.find(gii.Name.str()) != 0) diff --git a/llvm/lib/DebugInfo/MSF/MSFBuilder.cpp b/llvm/lib/DebugInfo/MSF/MSFBuilder.cpp index 1a92e2cb7754..f9a763d724a8 100644 --- a/llvm/lib/DebugInfo/MSF/MSFBuilder.cpp +++ b/llvm/lib/DebugInfo/MSF/MSFBuilder.cpp @@ -343,15 +343,25 @@ Expected<FileBufferByteStream> MSFBuilder::commit(StringRef Path, Layout = std::move(*L); uint64_t FileSize = uint64_t(Layout.SB->BlockSize) * Layout.SB->NumBlocks; - if (FileSize > UINT32_MAX) { - // FIXME: Changing the BinaryStream classes to use 64-bit numbers lets 
- // us create PDBs larger than 4 GiB successfully. The file format is - // block-based and as long as each stream is small enough, PDBs larger than - // 4 GiB might work. Check if tools can handle these large PDBs, and if so - // add support for writing them. + // Ensure that the file size is under the limit for the specified block size. + if (FileSize > getMaxFileSizeFromBlockSize(Layout.SB->BlockSize)) { + msf_error_code error_code = [](uint32_t BlockSize) { + switch (BlockSize) { + case 8192: + return msf_error_code::size_overflow_8192; + case 16384: + return msf_error_code::size_overflow_16384; + case 32768: + return msf_error_code::size_overflow_32768; + default: + return msf_error_code::size_overflow_4096; + } + }(Layout.SB->BlockSize); + return make_error<MSFError>( - msf_error_code::size_overflow, - formatv("File size would have been {0,1:N}", FileSize)); + error_code, + formatv("File size {0,1:N} too large for current PDB page size {1}", + FileSize, Layout.SB->BlockSize)); } auto OutFileOrError = FileOutputBuffer::create(Path, FileSize); diff --git a/llvm/lib/DebugInfo/MSF/MSFError.cpp b/llvm/lib/DebugInfo/MSF/MSFError.cpp index e42157e9d48e..9df2158423a4 100644 --- a/llvm/lib/DebugInfo/MSF/MSFError.cpp +++ b/llvm/lib/DebugInfo/MSF/MSFError.cpp @@ -28,8 +28,14 @@ public: case msf_error_code::insufficient_buffer: return "The buffer is not large enough to read the requested number of " "bytes."; - case msf_error_code::size_overflow: + case msf_error_code::size_overflow_4096: return "Output data is larger than 4 GiB."; + case msf_error_code::size_overflow_8192: + return "Output data is larger than 8 GiB."; + case msf_error_code::size_overflow_16384: + return "Output data is larger than 16 GiB."; + case msf_error_code::size_overflow_32768: + return "Output data is larger than 32 GiB."; case msf_error_code::not_writable: return "The specified stream is not writable."; case msf_error_code::no_stream: diff --git a/llvm/lib/DebugInfo/PDB/Native/PDBFile.cpp 
b/llvm/lib/DebugInfo/PDB/Native/PDBFile.cpp index cde645236851..5c61530c470d 100644 --- a/llvm/lib/DebugInfo/PDB/Native/PDBFile.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/PDBFile.cpp @@ -100,7 +100,7 @@ PDBFile::getStreamBlockList(uint32_t StreamIndex) const { return ContainerLayout.StreamMap[StreamIndex]; } -uint32_t PDBFile::getFileSize() const { return Buffer->getLength(); } +uint64_t PDBFile::getFileSize() const { return Buffer->getLength(); } Expected<ArrayRef<uint8_t>> PDBFile::getBlockData(uint32_t BlockIndex, uint32_t NumBytes) const { diff --git a/llvm/lib/DebugInfo/PDB/Native/SymbolCache.cpp b/llvm/lib/DebugInfo/PDB/Native/SymbolCache.cpp index fd9a0deb54d6..f9e67014477e 100644 --- a/llvm/lib/DebugInfo/PDB/Native/SymbolCache.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/SymbolCache.cpp @@ -518,8 +518,8 @@ SymbolCache::findLineTable(uint16_t Modi) const { const std::vector<LineTableEntry> &RHS) { return LHS[0].Addr < RHS[0].Addr; }); - for (size_t I = 0; I < EntryList.size(); ++I) - llvm::append_range(ModuleLineTable, EntryList[I]); + for (std::vector<LineTableEntry> &I : EntryList) + llvm::append_range(ModuleLineTable, I); return ModuleLineTable; } diff --git a/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp b/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp index f3f09584fdc9..5ec79df17fed 100644 --- a/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp +++ b/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp @@ -20,6 +20,7 @@ #include "llvm/DebugInfo/DWARF/DWARFContext.h" #include "llvm/DebugInfo/PDB/PDB.h" #include "llvm/DebugInfo/PDB/PDBContext.h" +#include "llvm/Debuginfod/Debuginfod.h" #include "llvm/Demangle/Demangle.h" #include "llvm/Object/COFF.h" #include "llvm/Object/MachO.h" @@ -384,7 +385,14 @@ bool findDebugBinary(const std::vector<std::string> &DebugFileDirectory, } } } - return false; + // Try debuginfod client cache and known servers. 
+ Expected<std::string> PathOrErr = getCachedOrDownloadDebuginfo(BuildID); + if (!PathOrErr) { + consumeError(PathOrErr.takeError()); + return false; + } + Result = *PathOrErr; + return true; } } // end anonymous namespace diff --git a/llvm/lib/Debuginfod/Debuginfod.cpp b/llvm/lib/Debuginfod/Debuginfod.cpp new file mode 100644 index 000000000000..389b18fd62ac --- /dev/null +++ b/llvm/lib/Debuginfod/Debuginfod.cpp @@ -0,0 +1,183 @@ +//===-- llvm/Debuginfod/Debuginfod.cpp - Debuginfod client library --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// +/// This file defines the fetchInfo function, which retrieves +/// any of the three supported artifact types: (executable, debuginfo, source +/// file) associated with a build-id from debuginfod servers. If a source file +/// is to be fetched, its absolute path must be specified in the Description +/// argument to fetchInfo. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/Debuginfod/Debuginfod.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Debuginfod/HTTPClient.h" +#include "llvm/Support/CachePruning.h" +#include "llvm/Support/Caching.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/FileUtilities.h" +#include "llvm/Support/xxhash.h" + +namespace llvm { +static std::string uniqueKey(llvm::StringRef S) { return utostr(xxHash64(S)); } + +// Returns a binary BuildID as a normalized hex string. +// Uses lowercase for compatibility with common debuginfod servers. 
+static std::string buildIDToString(BuildIDRef ID) { + return llvm::toHex(ID, /*LowerCase=*/true); +} + +Expected<SmallVector<StringRef>> getDefaultDebuginfodUrls() { + const char *DebuginfodUrlsEnv = std::getenv("DEBUGINFOD_URLS"); + if (DebuginfodUrlsEnv == NULL) + return SmallVector<StringRef>(); + + SmallVector<StringRef> DebuginfodUrls; + StringRef(DebuginfodUrlsEnv).split(DebuginfodUrls, " "); + return DebuginfodUrls; +} + +Expected<std::string> getDefaultDebuginfodCacheDirectory() { + if (const char *CacheDirectoryEnv = std::getenv("DEBUGINFOD_CACHE_PATH")) + return CacheDirectoryEnv; + + SmallString<64> CacheDirectory; + if (!sys::path::cache_directory(CacheDirectory)) + return createStringError( + errc::io_error, "Unable to determine appropriate cache directory."); + return std::string(CacheDirectory); +} + +std::chrono::milliseconds getDefaultDebuginfodTimeout() { + long Timeout; + const char *DebuginfodTimeoutEnv = std::getenv("DEBUGINFOD_TIMEOUT"); + if (DebuginfodTimeoutEnv && + to_integer(StringRef(DebuginfodTimeoutEnv).trim(), Timeout, 10)) + return std::chrono::milliseconds(Timeout * 1000); + + return std::chrono::milliseconds(90 * 1000); +} + +/// The following functions fetch a debuginfod artifact to a file in a local +/// cache and return the cached file path. They first search the local cache, +/// followed by the debuginfod servers. 
+ +Expected<std::string> getCachedOrDownloadSource(BuildIDRef ID, + StringRef SourceFilePath) { + SmallString<64> UrlPath; + sys::path::append(UrlPath, sys::path::Style::posix, "buildid", + buildIDToString(ID), "source", + sys::path::convert_to_slash(SourceFilePath)); + return getCachedOrDownloadArtifact(uniqueKey(UrlPath), UrlPath); +} + +Expected<std::string> getCachedOrDownloadExecutable(BuildIDRef ID) { + SmallString<64> UrlPath; + sys::path::append(UrlPath, sys::path::Style::posix, "buildid", + buildIDToString(ID), "executable"); + return getCachedOrDownloadArtifact(uniqueKey(UrlPath), UrlPath); +} + +Expected<std::string> getCachedOrDownloadDebuginfo(BuildIDRef ID) { + SmallString<64> UrlPath; + sys::path::append(UrlPath, sys::path::Style::posix, "buildid", + buildIDToString(ID), "debuginfo"); + return getCachedOrDownloadArtifact(uniqueKey(UrlPath), UrlPath); +} + +// General fetching function. +Expected<std::string> getCachedOrDownloadArtifact(StringRef UniqueKey, + StringRef UrlPath) { + SmallString<10> CacheDir; + + Expected<std::string> CacheDirOrErr = getDefaultDebuginfodCacheDirectory(); + if (!CacheDirOrErr) + return CacheDirOrErr.takeError(); + CacheDir = *CacheDirOrErr; + + Expected<SmallVector<StringRef>> DebuginfodUrlsOrErr = + getDefaultDebuginfodUrls(); + if (!DebuginfodUrlsOrErr) + return DebuginfodUrlsOrErr.takeError(); + SmallVector<StringRef> &DebuginfodUrls = *DebuginfodUrlsOrErr; + return getCachedOrDownloadArtifact(UniqueKey, UrlPath, CacheDir, + DebuginfodUrls, + getDefaultDebuginfodTimeout()); +} + +Expected<std::string> getCachedOrDownloadArtifact( + StringRef UniqueKey, StringRef UrlPath, StringRef CacheDirectoryPath, + ArrayRef<StringRef> DebuginfodUrls, std::chrono::milliseconds Timeout) { + SmallString<64> AbsCachedArtifactPath; + sys::path::append(AbsCachedArtifactPath, CacheDirectoryPath, + "llvmcache-" + UniqueKey); + + Expected<FileCache> CacheOrErr = + localCache("Debuginfod-client", ".debuginfod-client", CacheDirectoryPath); + 
if (!CacheOrErr) + return CacheOrErr.takeError(); + + FileCache Cache = *CacheOrErr; + // We choose an arbitrary Task parameter as we do not make use of it. + unsigned Task = 0; + Expected<AddStreamFn> CacheAddStreamOrErr = Cache(Task, UniqueKey); + if (!CacheAddStreamOrErr) + return CacheAddStreamOrErr.takeError(); + AddStreamFn &CacheAddStream = *CacheAddStreamOrErr; + if (!CacheAddStream) + return std::string(AbsCachedArtifactPath); + // The artifact was not found in the local cache, query the debuginfod + // servers. + if (!HTTPClient::isAvailable()) + return createStringError(errc::io_error, + "No working HTTP client is available."); + + if (!HTTPClient::IsInitialized) + return createStringError( + errc::io_error, + "A working HTTP client is available, but it is not initialized. To " + "allow Debuginfod to make HTTP requests, call HTTPClient::initialize() " + "at the beginning of main."); + + HTTPClient Client; + Client.setTimeout(Timeout); + for (StringRef ServerUrl : DebuginfodUrls) { + SmallString<64> ArtifactUrl; + sys::path::append(ArtifactUrl, sys::path::Style::posix, ServerUrl, UrlPath); + + Expected<HTTPResponseBuffer> ResponseOrErr = Client.get(ArtifactUrl); + if (!ResponseOrErr) + return ResponseOrErr.takeError(); + + HTTPResponseBuffer &Response = *ResponseOrErr; + if (Response.Code != 200) + continue; + + // We have retrieved the artifact from this server, and now add it to the + // file cache. + Expected<std::unique_ptr<CachedFileStream>> FileStreamOrErr = + CacheAddStream(Task); + if (!FileStreamOrErr) + return FileStreamOrErr.takeError(); + std::unique_ptr<CachedFileStream> &FileStream = *FileStreamOrErr; + if (!Response.Body) + return createStringError( + errc::io_error, "Unallocated MemoryBuffer in HTTPResponseBuffer."); + + *FileStream->OS << StringRef(Response.Body->getBufferStart(), + Response.Body->getBufferSize()); + + // Return the path to the artifact on disk. 
+ return std::string(AbsCachedArtifactPath); + } + + return createStringError(errc::argument_out_of_domain, "build id not found"); +} +} // namespace llvm diff --git a/llvm/lib/Debuginfod/HTTPClient.cpp b/llvm/lib/Debuginfod/HTTPClient.cpp new file mode 100644 index 000000000000..65f457933b92 --- /dev/null +++ b/llvm/lib/Debuginfod/HTTPClient.cpp @@ -0,0 +1,216 @@ +//===-- llvm/Debuginfod/HTTPClient.cpp - HTTP client library ----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// +/// This file defines the methods of the HTTPRequest, HTTPClient, and +/// BufferedHTTPResponseHandler classes. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/Debuginfod/HTTPClient.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/MemoryBuffer.h" +#ifdef LLVM_ENABLE_CURL +#include <curl/curl.h> +#endif + +using namespace llvm; + +HTTPRequest::HTTPRequest(StringRef Url) { this->Url = Url.str(); } + +bool operator==(const HTTPRequest &A, const HTTPRequest &B) { + return A.Url == B.Url && A.Method == B.Method && + A.FollowRedirects == B.FollowRedirects; +} + +HTTPResponseHandler::~HTTPResponseHandler() = default; + +static inline bool parseContentLengthHeader(StringRef LineRef, + size_t &ContentLength) { + // Content-Length is a mandatory header, and the only one we handle. 
+ return LineRef.consume_front("Content-Length: ") && + to_integer(LineRef.trim(), ContentLength, 10); +} + +Error BufferedHTTPResponseHandler::handleHeaderLine(StringRef HeaderLine) { + if (ResponseBuffer.Body) + return Error::success(); + + size_t ContentLength; + if (parseContentLengthHeader(HeaderLine, ContentLength)) + ResponseBuffer.Body = + WritableMemoryBuffer::getNewUninitMemBuffer(ContentLength); + + return Error::success(); +} + +Error BufferedHTTPResponseHandler::handleBodyChunk(StringRef BodyChunk) { + if (!ResponseBuffer.Body) + return createStringError(errc::io_error, + "Unallocated response buffer. HTTP Body data " + "received before Content-Length header."); + if (Offset + BodyChunk.size() > ResponseBuffer.Body->getBufferSize()) + return createStringError(errc::io_error, + "Content size exceeds buffer size."); + memcpy(ResponseBuffer.Body->getBufferStart() + Offset, BodyChunk.data(), + BodyChunk.size()); + Offset += BodyChunk.size(); + return Error::success(); +} + +Error BufferedHTTPResponseHandler::handleStatusCode(unsigned Code) { + ResponseBuffer.Code = Code; + return Error::success(); +} + +bool HTTPClient::IsInitialized = false; + +class HTTPClientCleanup { +public: + ~HTTPClientCleanup() { HTTPClient::cleanup(); } +}; +static const HTTPClientCleanup Cleanup; + +Expected<HTTPResponseBuffer> HTTPClient::perform(const HTTPRequest &Request) { + BufferedHTTPResponseHandler Handler; + if (Error Err = perform(Request, Handler)) + return std::move(Err); + return std::move(Handler.ResponseBuffer); +} + +Expected<HTTPResponseBuffer> HTTPClient::get(StringRef Url) { + HTTPRequest Request(Url); + return perform(Request); +} + +#ifdef LLVM_ENABLE_CURL + +bool HTTPClient::isAvailable() { return true; } + +void HTTPClient::initialize() { + if (!IsInitialized) { + curl_global_init(CURL_GLOBAL_ALL); + IsInitialized = true; + } +} + +void HTTPClient::cleanup() { + if (IsInitialized) { + curl_global_cleanup(); + IsInitialized = false; + } +} + +void 
HTTPClient::setTimeout(std::chrono::milliseconds Timeout) { + if (Timeout < std::chrono::milliseconds(0)) + Timeout = std::chrono::milliseconds(0); + curl_easy_setopt(Curl, CURLOPT_TIMEOUT_MS, Timeout.count()); +} + +/// CurlHTTPRequest and the curl{Header,Write}Function are implementation +/// details used to work with Curl. Curl makes callbacks with a single +/// customizable pointer parameter. +struct CurlHTTPRequest { + CurlHTTPRequest(HTTPResponseHandler &Handler) : Handler(Handler) {} + void storeError(Error Err) { + ErrorState = joinErrors(std::move(Err), std::move(ErrorState)); + } + HTTPResponseHandler &Handler; + llvm::Error ErrorState = Error::success(); +}; + +static size_t curlHeaderFunction(char *Contents, size_t Size, size_t NMemb, + CurlHTTPRequest *CurlRequest) { + assert(Size == 1 && "The Size passed by libCURL to CURLOPT_HEADERFUNCTION " + "should always be 1."); + if (Error Err = + CurlRequest->Handler.handleHeaderLine(StringRef(Contents, NMemb))) { + CurlRequest->storeError(std::move(Err)); + return 0; + } + return NMemb; +} + +static size_t curlWriteFunction(char *Contents, size_t Size, size_t NMemb, + CurlHTTPRequest *CurlRequest) { + Size *= NMemb; + if (Error Err = + CurlRequest->Handler.handleBodyChunk(StringRef(Contents, Size))) { + CurlRequest->storeError(std::move(Err)); + return 0; + } + return Size; +} + +HTTPClient::HTTPClient() { + assert(IsInitialized && + "Must call HTTPClient::initialize() at the beginning of main()."); + if (Curl) + return; + assert((Curl = curl_easy_init()) && "Curl could not be initialized."); + // Set the callback hooks. 
+ curl_easy_setopt(Curl, CURLOPT_WRITEFUNCTION, curlWriteFunction); + curl_easy_setopt(Curl, CURLOPT_HEADERFUNCTION, curlHeaderFunction); +} + +HTTPClient::~HTTPClient() { curl_easy_cleanup(Curl); } + +Error HTTPClient::perform(const HTTPRequest &Request, + HTTPResponseHandler &Handler) { + if (Request.Method != HTTPMethod::GET) + return createStringError(errc::invalid_argument, + "Unsupported CURL request method."); + + SmallString<128> Url = Request.Url; + curl_easy_setopt(Curl, CURLOPT_URL, Url.c_str()); + curl_easy_setopt(Curl, CURLOPT_FOLLOWLOCATION, Request.FollowRedirects); + + CurlHTTPRequest CurlRequest(Handler); + curl_easy_setopt(Curl, CURLOPT_WRITEDATA, &CurlRequest); + curl_easy_setopt(Curl, CURLOPT_HEADERDATA, &CurlRequest); + CURLcode CurlRes = curl_easy_perform(Curl); + if (CurlRes != CURLE_OK) + return joinErrors(std::move(CurlRequest.ErrorState), + createStringError(errc::io_error, + "curl_easy_perform() failed: %s\n", + curl_easy_strerror(CurlRes))); + if (CurlRequest.ErrorState) + return std::move(CurlRequest.ErrorState); + + unsigned Code; + curl_easy_getinfo(Curl, CURLINFO_RESPONSE_CODE, &Code); + if (Error Err = Handler.handleStatusCode(Code)) + return joinErrors(std::move(CurlRequest.ErrorState), std::move(Err)); + + return std::move(CurlRequest.ErrorState); +} + +#else + +HTTPClient::HTTPClient() = default; + +HTTPClient::~HTTPClient() = default; + +bool HTTPClient::isAvailable() { return false; } + +void HTTPClient::initialize() {} + +void HTTPClient::cleanup() {} + +void HTTPClient::setTimeout(std::chrono::milliseconds Timeout) {} + +Error HTTPClient::perform(const HTTPRequest &Request, + HTTPResponseHandler &Handler) { + llvm_unreachable("No HTTP Client implementation available."); +} + +#endif diff --git a/llvm/lib/Demangle/DLangDemangle.cpp b/llvm/lib/Demangle/DLangDemangle.cpp index f380aa90035e..0cefbd63a7ae 100644 --- a/llvm/lib/Demangle/DLangDemangle.cpp +++ b/llvm/lib/Demangle/DLangDemangle.cpp @@ -242,11 +242,77 @@ const char 
*Demangler::parseIdentifier(OutputBuffer *Demangled, // TODO: Parse template instances with a length prefix. + // There can be multiple different declarations in the same function that + // have the same mangled name. To make the mangled names unique, a fake + // parent in the form `__Sddd' is added to the symbol. + if (Len >= 4 && Mangled[0] == '_' && Mangled[1] == '_' && Mangled[2] == 'S') { + const char *NumPtr = Mangled + 3; + while (NumPtr < (Mangled + Len) && std::isdigit(*NumPtr)) + ++NumPtr; + + if (Mangled + Len == NumPtr) { + // Skip over the fake parent. + Mangled += Len; + return parseIdentifier(Demangled, Mangled); + } + + // Else demangle it as a plain identifier. + } + return parseLName(Demangled, Mangled, Len); } const char *Demangler::parseLName(OutputBuffer *Demangled, const char *Mangled, unsigned long Len) { + switch (Len) { + case 6: + if (strncmp(Mangled, "__initZ", Len + 1) == 0) { + // The static initializer for a given symbol. + Demangled->prepend("initializer for "); + Demangled->setCurrentPosition(Demangled->getCurrentPosition() - 1); + Mangled += Len; + return Mangled; + } + if (strncmp(Mangled, "__vtblZ", Len + 1) == 0) { + // The vtable symbol for a given class. + Demangled->prepend("vtable for "); + Demangled->setCurrentPosition(Demangled->getCurrentPosition() - 1); + Mangled += Len; + return Mangled; + } + break; + + case 7: + if (strncmp(Mangled, "__ClassZ", Len + 1) == 0) { + // The classinfo symbol for a given class. + Demangled->prepend("ClassInfo for "); + Demangled->setCurrentPosition(Demangled->getCurrentPosition() - 1); + Mangled += Len; + return Mangled; + } + break; + + case 11: + if (strncmp(Mangled, "__InterfaceZ", Len + 1) == 0) { + // The interface symbol for a given class. 
+ Demangled->prepend("Interface for "); + Demangled->setCurrentPosition(Demangled->getCurrentPosition() - 1); + Mangled += Len; + return Mangled; + } + break; + + case 12: + if (strncmp(Mangled, "__ModuleInfoZ", Len + 1) == 0) { + // The ModuleInfo symbol for a given module. + Demangled->prepend("ModuleInfo for "); + Demangled->setCurrentPosition(Demangled->getCurrentPosition() - 1); + Mangled += Len; + return Mangled; + } + break; + } + *Demangled << StringView(Mangled, Len); Mangled += Len; diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp index 3ea9ffee6554..27d8833ae19e 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp @@ -241,7 +241,9 @@ private: } case Branch32: { Kind = x86_64::BranchPCRel32; - Addend = 0; + // BranchPCRel32 implicitly handles the '-4' PC adjustment, so we have to + // adjust the addend by '+4' to compensate. + Addend += 4; break; } } @@ -252,7 +254,7 @@ private: Edge GE(Kind, Offset, *GraphSymbol, Addend); LLVM_DEBUG({ dbgs() << " "; - printEdge(dbgs(), *BlockToFix, GE, getELFX86RelocationKindName(Kind)); + printEdge(dbgs(), *BlockToFix, GE, x86_64::getEdgeKindName(Kind)); dbgs() << "\n"; }); diff --git a/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp index 200f42aec067..ed912280ac82 100644 --- a/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp +++ b/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp @@ -170,8 +170,8 @@ std::unique_ptr<MemoryBuffer> MCJIT::emitObject(Module *M) { PM.run(*M); // Flush the output buffer to get the generated code into memory - std::unique_ptr<MemoryBuffer> CompiledObjBuffer( - new SmallVectorMemoryBuffer(std::move(ObjBufferSV))); + auto CompiledObjBuffer = std::make_unique<SmallVectorMemoryBuffer>( + std::move(ObjBufferSV), /*RequiresNullTerminator=*/false); // If we have an object cache, tell it about the new object. 
// Note that we're using the compiled image, not the loaded image (as below). diff --git a/llvm/lib/ExecutionEngine/MCJIT/MCJIT.h b/llvm/lib/ExecutionEngine/MCJIT/MCJIT.h index 52e7eda90310..a5dd420c9132 100644 --- a/llvm/lib/ExecutionEngine/MCJIT/MCJIT.h +++ b/llvm/lib/ExecutionEngine/MCJIT/MCJIT.h @@ -151,12 +151,8 @@ class MCJIT : public ExecutionEngine { } void markAllLoadedModulesAsFinalized() { - for (ModulePtrSet::iterator I = LoadedModules.begin(), - E = LoadedModules.end(); - I != E; ++I) { - Module *M = *I; + for (Module *M : LoadedModules) FinalizedModules.insert(M); - } LoadedModules.clear(); } @@ -167,10 +163,8 @@ class MCJIT : public ExecutionEngine { void freeModulePtrSet(ModulePtrSet& MPS) { // Go through the module set and delete everything. - for (ModulePtrSet::iterator I = MPS.begin(), E = MPS.end(); I != E; ++I) { - Module *M = *I; + for (Module *M : MPS) delete M; - } MPS.clear(); } }; diff --git a/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp b/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp index 9ff6cec8c6c5..e2a0cadb6348 100644 --- a/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp +++ b/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp @@ -78,11 +78,10 @@ public: : IRMaterializationUnit(ES, MO, std::move(TSM)), Parent(Parent) {} PartitioningIRMaterializationUnit( - ThreadSafeModule TSM, SymbolFlagsMap SymbolFlags, - SymbolStringPtr InitSymbol, SymbolNameToDefinitionMap SymbolToDefinition, + ThreadSafeModule TSM, Interface I, + SymbolNameToDefinitionMap SymbolToDefinition, CompileOnDemandLayer &Parent) - : IRMaterializationUnit(std::move(TSM), std::move(SymbolFlags), - std::move(InitSymbol), + : IRMaterializationUnit(std::move(TSM), std::move(I), std::move(SymbolToDefinition)), Parent(Parent) {} @@ -298,7 +297,9 @@ void CompileOnDemandLayer::emitPartition( if (GVsToExtract->empty()) { if (auto Err = R->replace(std::make_unique<PartitioningIRMaterializationUnit>( - std::move(TSM), R->getSymbols(), 
R->getInitializerSymbol(), + std::move(TSM), + MaterializationUnit::Interface(R->getSymbols(), + R->getInitializerSymbol()), std::move(Defs), *this))) { getExecutionSession().reportError(std::move(Err)); R->failMaterialization(); diff --git a/llvm/lib/ExecutionEngine/Orc/CompileUtils.cpp b/llvm/lib/ExecutionEngine/Orc/CompileUtils.cpp index f8efed15edea..f34247005258 100644 --- a/llvm/lib/ExecutionEngine/Orc/CompileUtils.cpp +++ b/llvm/lib/ExecutionEngine/Orc/CompileUtils.cpp @@ -53,7 +53,8 @@ Expected<SimpleCompiler::CompileResult> SimpleCompiler::operator()(Module &M) { } auto ObjBuffer = std::make_unique<SmallVectorMemoryBuffer>( - std::move(ObjBufferSV), M.getModuleIdentifier() + "-jitted-objectbuffer"); + std::move(ObjBufferSV), M.getModuleIdentifier() + "-jitted-objectbuffer", + /*RequiresNullTerminator=*/false); auto Obj = object::ObjectFile::createObjectFile(ObjBuffer->getMemBufferRef()); diff --git a/llvm/lib/ExecutionEngine/Orc/Core.cpp b/llvm/lib/ExecutionEngine/Orc/Core.cpp index 56a97f83d915..aa82cf38c45d 100644 --- a/llvm/lib/ExecutionEngine/Orc/Core.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Core.cpp @@ -243,8 +243,7 @@ void AsynchronousSymbolQuery::detach() { AbsoluteSymbolsMaterializationUnit::AbsoluteSymbolsMaterializationUnit( SymbolMap Symbols) - : MaterializationUnit(extractFlags(Symbols), nullptr), - Symbols(std::move(Symbols)) {} + : MaterializationUnit(extractFlags(Symbols)), Symbols(std::move(Symbols)) {} StringRef AbsoluteSymbolsMaterializationUnit::getName() const { return "<Absolute Symbols>"; @@ -263,18 +262,18 @@ void AbsoluteSymbolsMaterializationUnit::discard(const JITDylib &JD, Symbols.erase(Name); } -SymbolFlagsMap +MaterializationUnit::Interface AbsoluteSymbolsMaterializationUnit::extractFlags(const SymbolMap &Symbols) { SymbolFlagsMap Flags; for (const auto &KV : Symbols) Flags[KV.first] = KV.second.getFlags(); - return Flags; + return MaterializationUnit::Interface(std::move(Flags), nullptr); } 
ReExportsMaterializationUnit::ReExportsMaterializationUnit( JITDylib *SourceJD, JITDylibLookupFlags SourceJDLookupFlags, SymbolAliasMap Aliases) - : MaterializationUnit(extractFlags(Aliases), nullptr), SourceJD(SourceJD), + : MaterializationUnit(extractFlags(Aliases)), SourceJD(SourceJD), SourceJDLookupFlags(SourceJDLookupFlags), Aliases(std::move(Aliases)) {} StringRef ReExportsMaterializationUnit::getName() const { @@ -456,13 +455,13 @@ void ReExportsMaterializationUnit::discard(const JITDylib &JD, Aliases.erase(Name); } -SymbolFlagsMap +MaterializationUnit::Interface ReExportsMaterializationUnit::extractFlags(const SymbolAliasMap &Aliases) { SymbolFlagsMap SymbolFlags; for (auto &KV : Aliases) SymbolFlags[KV.first] = KV.second.AliasFlags; - return SymbolFlags; + return MaterializationUnit::Interface(std::move(SymbolFlags), nullptr); } Expected<SymbolAliasMap> buildSimpleReexportsAliasMap(JITDylib &SourceJD, @@ -2492,10 +2491,19 @@ void ExecutionSession::OL_applyQueryPhase1( } } - // If we get here then we've moved on to the next JITDylib. - LLVM_DEBUG(dbgs() << "Phase 1 moving to next JITDylib.\n"); - ++IPLS->CurSearchOrderIndex; - IPLS->NewJITDylib = true; + if (IPLS->DefGeneratorCandidates.empty() && + IPLS->DefGeneratorNonCandidates.empty()) { + // Early out if there are no remaining symbols. + LLVM_DEBUG(dbgs() << "All symbols matched.\n"); + IPLS->CurSearchOrderIndex = IPLS->SearchOrder.size(); + break; + } else { + // If we get here then we've moved on to the next JITDylib with candidates + // remaining. + LLVM_DEBUG(dbgs() << "Phase 1 moving to next JITDylib.\n"); + ++IPLS->CurSearchOrderIndex; + IPLS->NewJITDylib = true; + } } // Remove any weakly referenced candidates that could not be found/generated. 
diff --git a/llvm/lib/ExecutionEngine/Orc/DebuggerSupportPlugin.cpp b/llvm/lib/ExecutionEngine/Orc/DebuggerSupportPlugin.cpp index 8479495623b8..fe62138c790c 100644 --- a/llvm/lib/ExecutionEngine/Orc/DebuggerSupportPlugin.cpp +++ b/llvm/lib/ExecutionEngine/Orc/DebuggerSupportPlugin.cpp @@ -154,8 +154,24 @@ public: } DebugSecInfos.push_back({&Sec, Sec.getName().substr(0, SepPos), Sec.getName().substr(SepPos + 1), 0, 0}); - } else + } else { NonDebugSections.push_back(&Sec); + + // If the first block in the section has a non-zero alignment offset + // then we need to add a padding block, since the section command in + // the header doesn't allow for aligment offsets. + SectionRange R(Sec); + if (!R.empty()) { + auto &FB = *R.getFirstBlock(); + if (FB.getAlignmentOffset() != 0) { + auto Padding = G.allocateBuffer(FB.getAlignmentOffset()); + memset(Padding.data(), 0, Padding.size()); + G.createContentBlock(Sec, Padding, + FB.getAddress() - FB.getAlignmentOffset(), + FB.getAlignment(), 0); + } + } + } } // Create container block. 
diff --git a/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp index b17d196f01b6..eded54f4bfb3 100644 --- a/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp @@ -28,8 +28,8 @@ class DSOHandleMaterializationUnit : public MaterializationUnit { public: DSOHandleMaterializationUnit(ELFNixPlatform &ENP, const SymbolStringPtr &DSOHandleSymbol) - : MaterializationUnit(createDSOHandleSectionSymbols(ENP, DSOHandleSymbol), - DSOHandleSymbol), + : MaterializationUnit( + createDSOHandleSectionInterface(ENP, DSOHandleSymbol)), ENP(ENP) {} StringRef getName() const override { return "DSOHandleMU"; } @@ -70,12 +70,13 @@ public: void discard(const JITDylib &JD, const SymbolStringPtr &Sym) override {} private: - static SymbolFlagsMap - createDSOHandleSectionSymbols(ELFNixPlatform &ENP, - const SymbolStringPtr &DSOHandleSymbol) { + static MaterializationUnit::Interface + createDSOHandleSectionInterface(ELFNixPlatform &ENP, + const SymbolStringPtr &DSOHandleSymbol) { SymbolFlagsMap SymbolFlags; SymbolFlags[DSOHandleSymbol] = JITSymbolFlags::Exported; - return SymbolFlags; + return MaterializationUnit::Interface(std::move(SymbolFlags), + DSOHandleSymbol); } ArrayRef<char> getDSOHandleContent(size_t PointerSize) { diff --git a/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp b/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp index 2ab9ed4f856b..ae2d47fb8c5e 100644 --- a/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp @@ -8,6 +8,7 @@ #include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" #include "llvm/ExecutionEngine/Orc/Layer.h" +#include "llvm/ExecutionEngine/Orc/ObjectFileInterface.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalVariable.h" @@ -269,25 +270,30 @@ Error DynamicLibrarySearchGenerator::tryToGenerate( } Expected<std::unique_ptr<StaticLibraryDefinitionGenerator>> 
-StaticLibraryDefinitionGenerator::Load(ObjectLayer &L, const char *FileName) { +StaticLibraryDefinitionGenerator::Load( + ObjectLayer &L, const char *FileName, + GetObjectFileInterface GetObjFileInterface) { auto ArchiveBuffer = errorOrToExpected(MemoryBuffer::getFile(FileName)); if (!ArchiveBuffer) return ArchiveBuffer.takeError(); - return Create(L, std::move(*ArchiveBuffer)); + return Create(L, std::move(*ArchiveBuffer), std::move(GetObjFileInterface)); } Expected<std::unique_ptr<StaticLibraryDefinitionGenerator>> -StaticLibraryDefinitionGenerator::Load(ObjectLayer &L, const char *FileName, - const Triple &TT) { +StaticLibraryDefinitionGenerator::Load( + ObjectLayer &L, const char *FileName, const Triple &TT, + GetObjectFileInterface GetObjFileInterface) { + auto B = object::createBinary(FileName); if (!B) return B.takeError(); // If this is a regular archive then create an instance from it. if (isa<object::Archive>(B->getBinary())) - return Create(L, std::move(B->takeBinary().second)); + return Create(L, std::move(B->takeBinary().second), + std::move(GetObjFileInterface)); // If this is a universal binary then search for a slice matching the given // Triple. @@ -309,7 +315,8 @@ StaticLibraryDefinitionGenerator::Load(ObjectLayer &L, const char *FileName, " .. 
" + formatv("{0:x}", Obj.getOffset() + Obj.getSize()) + ": " + SliceBuffer.getError().message(), SliceBuffer.getError()); - return Create(L, std::move(*SliceBuffer)); + return Create(L, std::move(*SliceBuffer), + std::move(GetObjFileInterface)); } } @@ -326,11 +333,13 @@ StaticLibraryDefinitionGenerator::Load(ObjectLayer &L, const char *FileName, Expected<std::unique_ptr<StaticLibraryDefinitionGenerator>> StaticLibraryDefinitionGenerator::Create( - ObjectLayer &L, std::unique_ptr<MemoryBuffer> ArchiveBuffer) { + ObjectLayer &L, std::unique_ptr<MemoryBuffer> ArchiveBuffer, + GetObjectFileInterface GetObjFileInterface) { Error Err = Error::success(); std::unique_ptr<StaticLibraryDefinitionGenerator> ADG( - new StaticLibraryDefinitionGenerator(L, std::move(ArchiveBuffer), Err)); + new StaticLibraryDefinitionGenerator( + L, std::move(ArchiveBuffer), std::move(GetObjFileInterface), Err)); if (Err) return std::move(Err); @@ -371,7 +380,12 @@ Error StaticLibraryDefinitionGenerator::tryToGenerate( MemoryBufferRef ChildBufferRef(ChildBufferInfo.first, ChildBufferInfo.second); - if (auto Err = L.add(JD, MemoryBuffer::getMemBuffer(ChildBufferRef, false))) + auto I = GetObjFileInterface(L.getExecutionSession(), ChildBufferRef); + if (!I) + return I.takeError(); + + if (auto Err = L.add(JD, MemoryBuffer::getMemBuffer(ChildBufferRef, false), + std::move(*I))) return Err; } @@ -379,9 +393,15 @@ Error StaticLibraryDefinitionGenerator::tryToGenerate( } StaticLibraryDefinitionGenerator::StaticLibraryDefinitionGenerator( - ObjectLayer &L, std::unique_ptr<MemoryBuffer> ArchiveBuffer, Error &Err) - : L(L), ArchiveBuffer(std::move(ArchiveBuffer)), - Archive(std::make_unique<object::Archive>(*this->ArchiveBuffer, Err)) {} + ObjectLayer &L, std::unique_ptr<MemoryBuffer> ArchiveBuffer, + GetObjectFileInterface GetObjFileInterface, Error &Err) + : L(L), GetObjFileInterface(std::move(GetObjFileInterface)), + ArchiveBuffer(std::move(ArchiveBuffer)), + 
Archive(std::make_unique<object::Archive>(*this->ArchiveBuffer, Err)) { + + if (!this->GetObjFileInterface) + this->GetObjFileInterface = getObjectFileInterface; +} } // End namespace orc. } // End namespace llvm. diff --git a/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp b/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp index ee1630a2ffa8..f427271bb45d 100644 --- a/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp +++ b/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp @@ -31,8 +31,8 @@ public: CompileCallbackMaterializationUnit(SymbolStringPtr Name, CompileFunction Compile) - : MaterializationUnit(SymbolFlagsMap({{Name, JITSymbolFlags::Exported}}), - nullptr), + : MaterializationUnit(Interface( + SymbolFlagsMap({{Name, JITSymbolFlags::Exported}}), nullptr)), Name(std::move(Name)), Compile(std::move(Compile)) {} StringRef getName() const override { return "<Compile Callbacks>"; } diff --git a/llvm/lib/ExecutionEngine/Orc/Layer.cpp b/llvm/lib/ExecutionEngine/Orc/Layer.cpp index 20dfba23bf10..adb8861793b1 100644 --- a/llvm/lib/ExecutionEngine/Orc/Layer.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Layer.cpp @@ -10,9 +10,8 @@ #include "llvm/ExecutionEngine/Orc/DebugUtils.h" #include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" +#include "llvm/ExecutionEngine/Orc/ObjectFileInterface.h" #include "llvm/IR/Constants.h" -#include "llvm/Object/MachO.h" -#include "llvm/Object/ObjectFile.h" #include "llvm/Support/Debug.h" #define DEBUG_TYPE "orc" @@ -33,7 +32,7 @@ Error IRLayer::add(ResourceTrackerSP RT, ThreadSafeModule TSM) { IRMaterializationUnit::IRMaterializationUnit( ExecutionSession &ES, const IRSymbolMapper::ManglingOptions &MO, ThreadSafeModule TSM) - : MaterializationUnit(SymbolFlagsMap(), nullptr), TSM(std::move(TSM)) { + : MaterializationUnit(Interface()), TSM(std::move(TSM)) { assert(this->TSM && "Module must not be null"); @@ -98,10 +97,10 @@ IRMaterializationUnit::IRMaterializationUnit( } IRMaterializationUnit::IRMaterializationUnit( - ThreadSafeModule TSM, 
SymbolFlagsMap SymbolFlags, - SymbolStringPtr InitSymbol, SymbolNameToDefinitionMap SymbolToDefinition) - : MaterializationUnit(std::move(SymbolFlags), std::move(InitSymbol)), - TSM(std::move(TSM)), SymbolToDefinition(std::move(SymbolToDefinition)) {} + ThreadSafeModule TSM, Interface I, + SymbolNameToDefinitionMap SymbolToDefinition) + : MaterializationUnit(std::move(I)), TSM(std::move(TSM)), + SymbolToDefinition(std::move(SymbolToDefinition)) {} StringRef IRMaterializationUnit::getName() const { if (TSM) @@ -161,37 +160,47 @@ ObjectLayer::ObjectLayer(ExecutionSession &ES) : ES(ES) {} ObjectLayer::~ObjectLayer() {} -Error ObjectLayer::add(ResourceTrackerSP RT, std::unique_ptr<MemoryBuffer> O) { +Error ObjectLayer::add(ResourceTrackerSP RT, std::unique_ptr<MemoryBuffer> O, + MaterializationUnit::Interface I) { assert(RT && "RT can not be null"); - auto ObjMU = BasicObjectLayerMaterializationUnit::Create(*this, std::move(O)); - if (!ObjMU) - return ObjMU.takeError(); auto &JD = RT->getJITDylib(); - return JD.define(std::move(*ObjMU), std::move(RT)); + return JD.define(std::make_unique<BasicObjectLayerMaterializationUnit>( + *this, std::move(O), std::move(I)), + std::move(RT)); +} + +Error ObjectLayer::add(ResourceTrackerSP RT, std::unique_ptr<MemoryBuffer> O) { + auto I = getObjectFileInterface(getExecutionSession(), O->getMemBufferRef()); + if (!I) + return I.takeError(); + return add(std::move(RT), std::move(O), std::move(*I)); +} + +Error ObjectLayer::add(JITDylib &JD, std::unique_ptr<MemoryBuffer> O) { + auto I = getObjectFileInterface(getExecutionSession(), O->getMemBufferRef()); + if (!I) + return I.takeError(); + return add(JD, std::move(O), std::move(*I)); } Expected<std::unique_ptr<BasicObjectLayerMaterializationUnit>> BasicObjectLayerMaterializationUnit::Create(ObjectLayer &L, std::unique_ptr<MemoryBuffer> O) { - auto ObjSymInfo = - getObjectSymbolInfo(L.getExecutionSession(), O->getMemBufferRef()); - if (!ObjSymInfo) - return ObjSymInfo.takeError(); + 
auto ObjInterface = + getObjectFileInterface(L.getExecutionSession(), O->getMemBufferRef()); - auto &SymbolFlags = ObjSymInfo->first; - auto &InitSymbol = ObjSymInfo->second; + if (!ObjInterface) + return ObjInterface.takeError(); return std::unique_ptr<BasicObjectLayerMaterializationUnit>( - new BasicObjectLayerMaterializationUnit( - L, std::move(O), std::move(SymbolFlags), std::move(InitSymbol))); + new BasicObjectLayerMaterializationUnit(L, std::move(O), + std::move(*ObjInterface))); } BasicObjectLayerMaterializationUnit::BasicObjectLayerMaterializationUnit( - ObjectLayer &L, std::unique_ptr<MemoryBuffer> O, SymbolFlagsMap SymbolFlags, - SymbolStringPtr InitSymbol) - : MaterializationUnit(std::move(SymbolFlags), std::move(InitSymbol)), L(L), - O(std::move(O)) {} + ObjectLayer &L, std::unique_ptr<MemoryBuffer> O, Interface I) + : MaterializationUnit(std::move(I)), L(L), O(std::move(O)) {} StringRef BasicObjectLayerMaterializationUnit::getName() const { if (O) diff --git a/llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp b/llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp index e1f494415e86..66453e6a632f 100644 --- a/llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp +++ b/llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp @@ -144,7 +144,7 @@ createLocalLazyCallThroughManager(const Triple &T, ExecutionSession &ES, LazyReexportsMaterializationUnit::LazyReexportsMaterializationUnit( LazyCallThroughManager &LCTManager, IndirectStubsManager &ISManager, JITDylib &SourceJD, SymbolAliasMap CallableAliases, ImplSymbolMap *SrcJDLoc) - : MaterializationUnit(extractFlags(CallableAliases), nullptr), + : MaterializationUnit(extractFlags(CallableAliases)), LCTManager(LCTManager), ISManager(ISManager), SourceJD(SourceJD), CallableAliases(std::move(CallableAliases)), AliaseeTable(SrcJDLoc) {} @@ -219,7 +219,7 @@ void LazyReexportsMaterializationUnit::discard(const JITDylib &JD, CallableAliases.erase(Name); } -SymbolFlagsMap +MaterializationUnit::Interface 
LazyReexportsMaterializationUnit::extractFlags(const SymbolAliasMap &Aliases) { SymbolFlagsMap SymbolFlags; for (auto &KV : Aliases) { @@ -227,7 +227,7 @@ LazyReexportsMaterializationUnit::extractFlags(const SymbolAliasMap &Aliases) { "Lazy re-exports must be callable symbols"); SymbolFlags[KV.first] = KV.second.AliasFlags; } - return SymbolFlags; + return MaterializationUnit::Interface(std::move(SymbolFlags), nullptr); } } // End namespace orc. diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp index 46c915dfea9e..fb2e90e1c9c5 100644 --- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp @@ -28,8 +28,7 @@ class MachOHeaderMaterializationUnit : public MaterializationUnit { public: MachOHeaderMaterializationUnit(MachOPlatform &MOP, const SymbolStringPtr &HeaderStartSymbol) - : MaterializationUnit(createHeaderSymbols(MOP, HeaderStartSymbol), - HeaderStartSymbol), + : MaterializationUnit(createHeaderInterface(MOP, HeaderStartSymbol)), MOP(MOP) {} StringRef getName() const override { return "MachOHeaderMU"; } @@ -110,9 +109,9 @@ private: return G.createContentBlock(HeaderSection, HeaderContent, 0, 8, 0); } - static SymbolFlagsMap - createHeaderSymbols(MachOPlatform &MOP, - const SymbolStringPtr &HeaderStartSymbol) { + static MaterializationUnit::Interface + createHeaderInterface(MachOPlatform &MOP, + const SymbolStringPtr &HeaderStartSymbol) { SymbolFlagsMap HeaderSymbolFlags; HeaderSymbolFlags[HeaderStartSymbol] = JITSymbolFlags::Exported; @@ -120,7 +119,8 @@ private: HeaderSymbolFlags[MOP.getExecutionSession().intern(HS.Name)] = JITSymbolFlags::Exported; - return HeaderSymbolFlags; + return MaterializationUnit::Interface(std::move(HeaderSymbolFlags), + HeaderStartSymbol); } MachOPlatform &MOP; diff --git a/llvm/lib/ExecutionEngine/Orc/Mangling.cpp b/llvm/lib/ExecutionEngine/Orc/Mangling.cpp index 7b21e6a684ca..9c243c9bf1d2 100644 --- 
a/llvm/lib/ExecutionEngine/Orc/Mangling.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Mangling.cpp @@ -7,13 +7,8 @@ //===----------------------------------------------------------------------===// #include "llvm/ExecutionEngine/Orc/Mangling.h" -#include "llvm/ExecutionEngine/Orc/ELFNixPlatform.h" -#include "llvm/ExecutionEngine/Orc/MachOPlatform.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Mangler.h" -#include "llvm/Object/ELFObjectFile.h" -#include "llvm/Object/MachO.h" -#include "llvm/Object/ObjectFile.h" #include "llvm/Support/Debug.h" #define DEBUG_TYPE "orc" @@ -85,188 +80,5 @@ void IRSymbolMapper::add(ExecutionSession &ES, const ManglingOptions &MO, } } -static SymbolStringPtr addInitSymbol(SymbolFlagsMap &SymbolFlags, - ExecutionSession &ES, - StringRef ObjFileName) { - SymbolStringPtr InitSymbol; - size_t Counter = 0; - - do { - std::string InitSymString; - raw_string_ostream(InitSymString) - << "$." << ObjFileName << ".__inits." << Counter++; - InitSymbol = ES.intern(InitSymString); - } while (SymbolFlags.count(InitSymbol)); - - SymbolFlags[InitSymbol] = JITSymbolFlags::MaterializationSideEffectsOnly; - return InitSymbol; -} - -static Expected<std::pair<SymbolFlagsMap, SymbolStringPtr>> -getMachOObjectFileSymbolInfo(ExecutionSession &ES, - const object::MachOObjectFile &Obj) { - SymbolFlagsMap SymbolFlags; - - for (auto &Sym : Obj.symbols()) { - Expected<uint32_t> SymFlagsOrErr = Sym.getFlags(); - if (!SymFlagsOrErr) - // TODO: Test this error. - return SymFlagsOrErr.takeError(); - - // Skip symbols not defined in this object file. - if (*SymFlagsOrErr & object::BasicSymbolRef::SF_Undefined) - continue; - - // Skip symbols that are not global. - if (!(*SymFlagsOrErr & object::BasicSymbolRef::SF_Global)) - continue; - - // Skip symbols that have type SF_File. 
- if (auto SymType = Sym.getType()) { - if (*SymType == object::SymbolRef::ST_File) - continue; - } else - return SymType.takeError(); - - auto Name = Sym.getName(); - if (!Name) - return Name.takeError(); - auto InternedName = ES.intern(*Name); - auto SymFlags = JITSymbolFlags::fromObjectSymbol(Sym); - if (!SymFlags) - return SymFlags.takeError(); - - // Strip the 'exported' flag from MachO linker-private symbols. - if (Name->startswith("l")) - *SymFlags &= ~JITSymbolFlags::Exported; - - SymbolFlags[InternedName] = std::move(*SymFlags); - } - - SymbolStringPtr InitSymbol; - for (auto &Sec : Obj.sections()) { - auto SecType = Obj.getSectionType(Sec); - if ((SecType & MachO::SECTION_TYPE) == MachO::S_MOD_INIT_FUNC_POINTERS) { - InitSymbol = addInitSymbol(SymbolFlags, ES, Obj.getFileName()); - break; - } - auto SegName = Obj.getSectionFinalSegmentName(Sec.getRawDataRefImpl()); - auto SecName = cantFail(Obj.getSectionName(Sec.getRawDataRefImpl())); - if (MachOPlatform::isInitializerSection(SegName, SecName)) { - InitSymbol = addInitSymbol(SymbolFlags, ES, Obj.getFileName()); - break; - } - } - - return std::make_pair(std::move(SymbolFlags), std::move(InitSymbol)); -} - -static Expected<std::pair<SymbolFlagsMap, SymbolStringPtr>> -getELFObjectFileSymbolInfo(ExecutionSession &ES, - const object::ELFObjectFileBase &Obj) { - SymbolFlagsMap SymbolFlags; - for (auto &Sym : Obj.symbols()) { - Expected<uint32_t> SymFlagsOrErr = Sym.getFlags(); - if (!SymFlagsOrErr) - // TODO: Test this error. - return SymFlagsOrErr.takeError(); - - // Skip symbols not defined in this object file. - if (*SymFlagsOrErr & object::BasicSymbolRef::SF_Undefined) - continue; - - // Skip symbols that are not global. - if (!(*SymFlagsOrErr & object::BasicSymbolRef::SF_Global)) - continue; - - // Skip symbols that have type SF_File. 
- if (auto SymType = Sym.getType()) { - if (*SymType == object::SymbolRef::ST_File) - continue; - } else - return SymType.takeError(); - - auto Name = Sym.getName(); - if (!Name) - return Name.takeError(); - auto InternedName = ES.intern(*Name); - auto SymFlags = JITSymbolFlags::fromObjectSymbol(Sym); - if (!SymFlags) - return SymFlags.takeError(); - - // ELF STB_GNU_UNIQUE should map to Weak for ORC. - if (Sym.getBinding() == ELF::STB_GNU_UNIQUE) - *SymFlags |= JITSymbolFlags::Weak; - - SymbolFlags[InternedName] = std::move(*SymFlags); - } - - SymbolStringPtr InitSymbol; - for (auto &Sec : Obj.sections()) { - if (auto SecName = Sec.getName()) { - if (ELFNixPlatform::isInitializerSection(*SecName)) { - InitSymbol = addInitSymbol(SymbolFlags, ES, Obj.getFileName()); - break; - } - } - } - - return std::make_pair(std::move(SymbolFlags), InitSymbol); -} - -Expected<std::pair<SymbolFlagsMap, SymbolStringPtr>> -getGenericObjectFileSymbolInfo(ExecutionSession &ES, - const object::ObjectFile &Obj) { - SymbolFlagsMap SymbolFlags; - for (auto &Sym : Obj.symbols()) { - Expected<uint32_t> SymFlagsOrErr = Sym.getFlags(); - if (!SymFlagsOrErr) - // TODO: Test this error. - return SymFlagsOrErr.takeError(); - - // Skip symbols not defined in this object file. - if (*SymFlagsOrErr & object::BasicSymbolRef::SF_Undefined) - continue; - - // Skip symbols that are not global. - if (!(*SymFlagsOrErr & object::BasicSymbolRef::SF_Global)) - continue; - - // Skip symbols that have type SF_File. 
- if (auto SymType = Sym.getType()) { - if (*SymType == object::SymbolRef::ST_File) - continue; - } else - return SymType.takeError(); - - auto Name = Sym.getName(); - if (!Name) - return Name.takeError(); - auto InternedName = ES.intern(*Name); - auto SymFlags = JITSymbolFlags::fromObjectSymbol(Sym); - if (!SymFlags) - return SymFlags.takeError(); - - SymbolFlags[InternedName] = std::move(*SymFlags); - } - - return std::make_pair(std::move(SymbolFlags), nullptr); -} - -Expected<std::pair<SymbolFlagsMap, SymbolStringPtr>> -getObjectSymbolInfo(ExecutionSession &ES, MemoryBufferRef ObjBuffer) { - auto Obj = object::ObjectFile::createObjectFile(ObjBuffer); - - if (!Obj) - return Obj.takeError(); - - if (auto *MachOObj = dyn_cast<object::MachOObjectFile>(Obj->get())) - return getMachOObjectFileSymbolInfo(ES, *MachOObj); - else if (auto *ELFObj = dyn_cast<object::ELFObjectFileBase>(Obj->get())) - return getELFObjectFileSymbolInfo(ES, *ELFObj); - - return getGenericObjectFileSymbolInfo(ES, **Obj); -} - } // End namespace orc. } // End namespace llvm. diff --git a/llvm/lib/ExecutionEngine/Orc/ObjectFileInterface.cpp b/llvm/lib/ExecutionEngine/Orc/ObjectFileInterface.cpp new file mode 100644 index 000000000000..c1ad569dd65d --- /dev/null +++ b/llvm/lib/ExecutionEngine/Orc/ObjectFileInterface.cpp @@ -0,0 +1,205 @@ +//===------ ObjectFileInterface.cpp - MU interface utils for objects ------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/Orc/ObjectFileInterface.h" +#include "llvm/ExecutionEngine/Orc/ELFNixPlatform.h" +#include "llvm/ExecutionEngine/Orc/MachOPlatform.h" +#include "llvm/Object/ELFObjectFile.h" +#include "llvm/Object/MachO.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "orc" + +namespace llvm { +namespace orc { + +void addInitSymbol(MaterializationUnit::Interface &I, ExecutionSession &ES, + StringRef ObjFileName) { + assert(!I.InitSymbol && "I already has an init symbol"); + size_t Counter = 0; + + do { + std::string InitSymString; + raw_string_ostream(InitSymString) + << "$." << ObjFileName << ".__inits." << Counter++; + I.InitSymbol = ES.intern(InitSymString); + } while (I.SymbolFlags.count(I.InitSymbol)); + + I.SymbolFlags[I.InitSymbol] = JITSymbolFlags::MaterializationSideEffectsOnly; +} + +static Expected<MaterializationUnit::Interface> +getMachOObjectFileSymbolInfo(ExecutionSession &ES, + const object::MachOObjectFile &Obj) { + MaterializationUnit::Interface I; + + for (auto &Sym : Obj.symbols()) { + Expected<uint32_t> SymFlagsOrErr = Sym.getFlags(); + if (!SymFlagsOrErr) + // TODO: Test this error. + return SymFlagsOrErr.takeError(); + + // Skip symbols not defined in this object file. + if (*SymFlagsOrErr & object::BasicSymbolRef::SF_Undefined) + continue; + + // Skip symbols that are not global. + if (!(*SymFlagsOrErr & object::BasicSymbolRef::SF_Global)) + continue; + + // Skip symbols that have type SF_File. 
+ if (auto SymType = Sym.getType()) { + if (*SymType == object::SymbolRef::ST_File) + continue; + } else + return SymType.takeError(); + + auto Name = Sym.getName(); + if (!Name) + return Name.takeError(); + auto InternedName = ES.intern(*Name); + auto SymFlags = JITSymbolFlags::fromObjectSymbol(Sym); + if (!SymFlags) + return SymFlags.takeError(); + + // Strip the 'exported' flag from MachO linker-private symbols. + if (Name->startswith("l")) + *SymFlags &= ~JITSymbolFlags::Exported; + + I.SymbolFlags[InternedName] = std::move(*SymFlags); + } + + for (auto &Sec : Obj.sections()) { + auto SecType = Obj.getSectionType(Sec); + if ((SecType & MachO::SECTION_TYPE) == MachO::S_MOD_INIT_FUNC_POINTERS) { + addInitSymbol(I, ES, Obj.getFileName()); + break; + } + auto SegName = Obj.getSectionFinalSegmentName(Sec.getRawDataRefImpl()); + auto SecName = cantFail(Obj.getSectionName(Sec.getRawDataRefImpl())); + if (MachOPlatform::isInitializerSection(SegName, SecName)) { + addInitSymbol(I, ES, Obj.getFileName()); + break; + } + } + + return I; +} + +static Expected<MaterializationUnit::Interface> +getELFObjectFileSymbolInfo(ExecutionSession &ES, + const object::ELFObjectFileBase &Obj) { + MaterializationUnit::Interface I; + + for (auto &Sym : Obj.symbols()) { + Expected<uint32_t> SymFlagsOrErr = Sym.getFlags(); + if (!SymFlagsOrErr) + // TODO: Test this error. + return SymFlagsOrErr.takeError(); + + // Skip symbols not defined in this object file. + if (*SymFlagsOrErr & object::BasicSymbolRef::SF_Undefined) + continue; + + // Skip symbols that are not global. + if (!(*SymFlagsOrErr & object::BasicSymbolRef::SF_Global)) + continue; + + // Skip symbols that have type SF_File. 
+ if (auto SymType = Sym.getType()) { + if (*SymType == object::SymbolRef::ST_File) + continue; + } else + return SymType.takeError(); + + auto Name = Sym.getName(); + if (!Name) + return Name.takeError(); + auto InternedName = ES.intern(*Name); + auto SymFlags = JITSymbolFlags::fromObjectSymbol(Sym); + if (!SymFlags) + return SymFlags.takeError(); + + // ELF STB_GNU_UNIQUE should map to Weak for ORC. + if (Sym.getBinding() == ELF::STB_GNU_UNIQUE) + *SymFlags |= JITSymbolFlags::Weak; + + I.SymbolFlags[InternedName] = std::move(*SymFlags); + } + + SymbolStringPtr InitSymbol; + for (auto &Sec : Obj.sections()) { + if (auto SecName = Sec.getName()) { + if (ELFNixPlatform::isInitializerSection(*SecName)) { + addInitSymbol(I, ES, Obj.getFileName()); + break; + } + } + } + + return I; +} + +Expected<MaterializationUnit::Interface> +getGenericObjectFileSymbolInfo(ExecutionSession &ES, + const object::ObjectFile &Obj) { + MaterializationUnit::Interface I; + + for (auto &Sym : Obj.symbols()) { + Expected<uint32_t> SymFlagsOrErr = Sym.getFlags(); + if (!SymFlagsOrErr) + // TODO: Test this error. + return SymFlagsOrErr.takeError(); + + // Skip symbols not defined in this object file. + if (*SymFlagsOrErr & object::BasicSymbolRef::SF_Undefined) + continue; + + // Skip symbols that are not global. + if (!(*SymFlagsOrErr & object::BasicSymbolRef::SF_Global)) + continue; + + // Skip symbols that have type SF_File. 
+ if (auto SymType = Sym.getType()) { + if (*SymType == object::SymbolRef::ST_File) + continue; + } else + return SymType.takeError(); + + auto Name = Sym.getName(); + if (!Name) + return Name.takeError(); + auto InternedName = ES.intern(*Name); + auto SymFlags = JITSymbolFlags::fromObjectSymbol(Sym); + if (!SymFlags) + return SymFlags.takeError(); + + I.SymbolFlags[InternedName] = std::move(*SymFlags); + } + + return I; +} + +Expected<MaterializationUnit::Interface> +getObjectFileInterface(ExecutionSession &ES, MemoryBufferRef ObjBuffer) { + auto Obj = object::ObjectFile::createObjectFile(ObjBuffer); + + if (!Obj) + return Obj.takeError(); + + if (auto *MachOObj = dyn_cast<object::MachOObjectFile>(Obj->get())) + return getMachOObjectFileSymbolInfo(ES, *MachOObj); + else if (auto *ELFObj = dyn_cast<object::ELFObjectFileBase>(Obj->get())) + return getELFObjectFileSymbolInfo(ES, *ELFObj); + + return getGenericObjectFileSymbolInfo(ES, **Obj); +} + +} // End namespace orc. +} // End namespace llvm. diff --git a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp index 6f840a079dd1..0d6a33c5685e 100644 --- a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp @@ -23,12 +23,6 @@ using namespace llvm::orc; namespace { class LinkGraphMaterializationUnit : public MaterializationUnit { -private: - struct LinkGraphInterface { - SymbolFlagsMap SymbolFlags; - SymbolStringPtr InitSymbol; - }; - public: static std::unique_ptr<LinkGraphMaterializationUnit> Create(ObjectLinkingLayer &ObjLinkingLayer, std::unique_ptr<LinkGraph> G) { @@ -44,9 +38,9 @@ public: } private: - static LinkGraphInterface scanLinkGraph(ExecutionSession &ES, LinkGraph &G) { + static Interface scanLinkGraph(ExecutionSession &ES, LinkGraph &G) { - LinkGraphInterface LGI; + Interface LGI; for (auto *Sym : G.defined_symbols()) { // Skip local symbols. 
@@ -98,11 +92,9 @@ private: } LinkGraphMaterializationUnit(ObjectLinkingLayer &ObjLinkingLayer, - std::unique_ptr<LinkGraph> G, - LinkGraphInterface LGI) - : MaterializationUnit(std::move(LGI.SymbolFlags), - std::move(LGI.InitSymbol)), - ObjLinkingLayer(ObjLinkingLayer), G(std::move(G)) {} + std::unique_ptr<LinkGraph> G, Interface LGI) + : MaterializationUnit(std::move(LGI)), ObjLinkingLayer(ObjLinkingLayer), + G(std::move(G)) {} void discard(const JITDylib &JD, const SymbolStringPtr &Name) override { for (auto *Sym : G->defined_symbols()) @@ -257,7 +249,8 @@ public: { - // Check that InternedResult matches up with MR->getSymbols(). + // Check that InternedResult matches up with MR->getSymbols(), overriding + // flags if requested. // This guards against faulty transformations / compilers / object caches. // First check that there aren't any missing symbols. @@ -266,16 +259,20 @@ public: SymbolNameVector MissingSymbols; for (auto &KV : MR->getSymbols()) { + auto I = InternedResult.find(KV.first); + // If this is a materialization-side-effects only symbol then bump // the counter and make sure it's *not* defined, otherwise make // sure that it is defined. if (KV.second.hasMaterializationSideEffectsOnly()) { ++NumMaterializationSideEffectsOnlySymbols; - if (InternedResult.count(KV.first)) + if (I != InternedResult.end()) ExtraSymbols.push_back(KV.first); continue; - } else if (!InternedResult.count(KV.first)) + } else if (I == InternedResult.end()) MissingSymbols.push_back(KV.first); + else if (Layer.OverrideObjectFlags) + I->second.setFlags(KV.second); } // If there were missing symbols then report the error. 
diff --git a/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp b/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp index 673f7394450f..77a8f5af8ba0 100644 --- a/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp +++ b/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp @@ -192,8 +192,8 @@ public: LLVMOrcMaterializationUnitMaterializeFunction Materialize, LLVMOrcMaterializationUnitDiscardFunction Discard, LLVMOrcMaterializationUnitDestroyFunction Destroy) - : llvm::orc::MaterializationUnit(std::move(InitialSymbolFlags), - std::move(InitSymbol)), + : llvm::orc::MaterializationUnit( + Interface(std::move(InitialSymbolFlags), std::move(InitSymbol))), Name(std::move(Name)), Ctx(Ctx), Materialize(Materialize), Discard(Discard), Destroy(Destroy) {} diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp index f16c6bdbfa4f..3f38d26869d4 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp @@ -124,8 +124,10 @@ void RuntimeDyldImpl::resolveRelocations() { std::lock_guard<sys::Mutex> locked(lock); // Print out the sections prior to relocation. - LLVM_DEBUG(for (int i = 0, e = Sections.size(); i != e; ++i) - dumpSectionMemory(Sections[i], "before relocations");); + LLVM_DEBUG({ + for (SectionEntry &S : Sections) + dumpSectionMemory(S, "before relocations"); + }); // First, resolve relocations associated with external symbols. if (auto Err = resolveExternalSymbols()) { @@ -136,21 +138,23 @@ void RuntimeDyldImpl::resolveRelocations() { resolveLocalRelocations(); // Print out sections after relocation. 
- LLVM_DEBUG(for (int i = 0, e = Sections.size(); i != e; ++i) - dumpSectionMemory(Sections[i], "after relocations");); + LLVM_DEBUG({ + for (SectionEntry &S : Sections) + dumpSectionMemory(S, "after relocations"); + }); } void RuntimeDyldImpl::resolveLocalRelocations() { // Iterate over all outstanding relocations - for (auto it = Relocations.begin(), e = Relocations.end(); it != e; ++it) { + for (const auto &Rel : Relocations) { // The Section here (Sections[i]) refers to the section in which the // symbol for the relocation is located. The SectionID in the relocation // entry provides the section to which the relocation will be applied. - unsigned Idx = it->first; + unsigned Idx = Rel.first; uint64_t Addr = getSectionLoadAddress(Idx); LLVM_DEBUG(dbgs() << "Resolving relocations Section #" << Idx << "\t" << format("%p", (uintptr_t)Addr) << "\n"); - resolveRelocationList(it->second, Addr); + resolveRelocationList(Rel.second, Addr); } Relocations.clear(); } @@ -457,9 +461,9 @@ static uint64_t computeAllocationSizeForSections(std::vector<uint64_t> &SectionSizes, uint64_t Alignment) { uint64_t TotalSize = 0; - for (size_t Idx = 0, Cnt = SectionSizes.size(); Idx < Cnt; Idx++) { + for (uint64_t SectionSize : SectionSizes) { uint64_t AlignedSize = - (SectionSizes[Idx] + Alignment - 1) / Alignment * Alignment; + (SectionSize + Alignment - 1) / Alignment * Alignment; TotalSize += AlignedSize; } return TotalSize; diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 18f1a2314853..5157d51fd18c 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -996,7 +996,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createSections( Builder.SetInsertPoint(AllocaIP.getBlock()->getTerminator()); AllocaIP = Builder.saveIP(); InsertPointTy AfterIP = - applyStaticWorkshareLoop(Loc.DL, LoopInfo, AllocaIP, true); + applyStaticWorkshareLoop(Loc.DL, LoopInfo, AllocaIP, !IsNowait); 
BasicBlock *LoopAfterBB = AfterIP.getBlock(); Instruction *SplitPos = LoopAfterBB->getTerminator(); if (!isa_and_nonnull<BranchInst>(SplitPos)) @@ -1156,7 +1156,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions( Builder.SetInsertPoint(NonAtomicRedBlock); for (auto En : enumerate(ReductionInfos)) { const ReductionInfo &RI = En.value(); - Type *ValueType = RI.getElementType(); + Type *ValueType = RI.ElementType; Value *RedValue = Builder.CreateLoad(ValueType, RI.Variable, "red.value." + Twine(En.index())); Value *PrivateRedValue = @@ -1181,8 +1181,8 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions( Builder.SetInsertPoint(AtomicRedBlock); if (CanGenerateAtomic) { for (const ReductionInfo &RI : ReductionInfos) { - Builder.restoreIP(RI.AtomicReductionGen(Builder.saveIP(), RI.Variable, - RI.PrivateVariable)); + Builder.restoreIP(RI.AtomicReductionGen(Builder.saveIP(), RI.ElementType, + RI.Variable, RI.PrivateVariable)); if (!Builder.GetInsertBlock()) return InsertPointTy(); } @@ -1207,13 +1207,13 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions( RedArrayTy, LHSArrayPtr, 0, En.index()); Value *LHSI8Ptr = Builder.CreateLoad(Builder.getInt8PtrTy(), LHSI8PtrPtr); Value *LHSPtr = Builder.CreateBitCast(LHSI8Ptr, RI.Variable->getType()); - Value *LHS = Builder.CreateLoad(RI.getElementType(), LHSPtr); + Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr); Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64( RedArrayTy, RHSArrayPtr, 0, En.index()); Value *RHSI8Ptr = Builder.CreateLoad(Builder.getInt8PtrTy(), RHSI8PtrPtr); Value *RHSPtr = Builder.CreateBitCast(RHSI8Ptr, RI.PrivateVariable->getType()); - Value *RHS = Builder.CreateLoad(RI.getElementType(), RHSPtr); + Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr); Value *Reduced; Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced)); if (!Builder.GetInsertBlock()) @@ -1329,13 +1329,10 @@ CanonicalLoopInfo 
*OpenMPIRBuilder::createLoopSkeleton( LoopInfos.emplace_front(); CanonicalLoopInfo *CL = &LoopInfos.front(); - CL->Preheader = Preheader; CL->Header = Header; CL->Cond = Cond; - CL->Body = Body; CL->Latch = Latch; CL->Exit = Exit; - CL->After = After; #ifndef NDEBUG CL->assertOK(); @@ -1359,7 +1356,7 @@ OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc, // Split the loop at the insertion point: Branch to the preheader and move // every following instruction to after the loop (the After BB). Also, the // new successor is the loop's after block. - Builder.CreateBr(CL->Preheader); + Builder.CreateBr(CL->getPreheader()); After->getInstList().splice(After->begin(), BB->getInstList(), Builder.GetInsertPoint(), BB->end()); After->replaceSuccessorsPhiUsesWith(BB, After); @@ -1791,6 +1788,12 @@ OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops, BasicBlock *OrigAfter = Outermost->getAfter(); Function *F = OrigPreheader->getParent(); + // Loop control blocks that may become orphaned later. + SmallVector<BasicBlock *, 12> OldControlBBs; + OldControlBBs.reserve(6 * Loops.size()); + for (CanonicalLoopInfo *Loop : Loops) + Loop->collectControlBlocks(OldControlBBs); + // Setup the IRBuilder for inserting the trip count computation. Builder.SetCurrentDebugLocation(DL); if (ComputeIP.isSet()) @@ -1828,7 +1831,7 @@ OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops, Value *Leftover = Result->getIndVar(); SmallVector<Value *> NewIndVars; - NewIndVars.set_size(NumLoops); + NewIndVars.resize(NumLoops); for (int i = NumLoops - 1; i >= 1; --i) { Value *OrigTripCount = Loops[i]->getTripCount(); @@ -1886,10 +1889,6 @@ OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops, Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]); // Remove unused parts of the input loops. 
- SmallVector<BasicBlock *, 12> OldControlBBs; - OldControlBBs.reserve(6 * Loops.size()); - for (CanonicalLoopInfo *Loop : Loops) - Loop->collectControlBlocks(OldControlBBs); removeUnusedBlocksFromParent(OldControlBBs); for (CanonicalLoopInfo *L : Loops) @@ -1915,6 +1914,12 @@ OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops, BasicBlock *InnerEnter = InnermostLoop->getBody(); BasicBlock *InnerLatch = InnermostLoop->getLatch(); + // Loop control blocks that may become orphaned later. + SmallVector<BasicBlock *, 12> OldControlBBs; + OldControlBBs.reserve(6 * Loops.size()); + for (CanonicalLoopInfo *Loop : Loops) + Loop->collectControlBlocks(OldControlBBs); + // Collect original trip counts and induction variable to be accessible by // index. Also, the structure of the original loops is not preserved during // the construction of the tiled loops, so do it before we scavenge the BBs of @@ -2074,10 +2079,6 @@ OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops, } // Remove unused parts of the original loops. 
- SmallVector<BasicBlock *, 12> OldControlBBs; - OldControlBBs.reserve(6 * Loops.size()); - for (CanonicalLoopInfo *Loop : Loops) - Loop->collectControlBlocks(OldControlBBs); removeUnusedBlocksFromParent(OldControlBBs); for (CanonicalLoopInfo *L : Loops) @@ -3079,7 +3080,7 @@ OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc, OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicUpdate( const LocationDescription &Loc, Instruction *AllocIP, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, - AtomicUpdateCallbackTy &UpdateOp, bool IsXLHSInRHSPart) { + AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr) { if (!updateToLocation(Loc)) return Loc.IP; @@ -3097,7 +3098,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicUpdate( }); emitAtomicUpdate(AllocIP, X.Var, Expr, AO, RMWOp, UpdateOp, X.IsVolatile, - IsXLHSInRHSPart); + IsXBinopExpr); checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update); return Builder.saveIP(); } @@ -3134,13 +3135,13 @@ std::pair<Value *, Value *> OpenMPIRBuilder::emitAtomicUpdate(Instruction *AllocIP, Value *X, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, - bool VolatileX, bool IsXLHSInRHSPart) { + bool VolatileX, bool IsXBinopExpr) { Type *XElemTy = X->getType()->getPointerElementType(); bool DoCmpExch = ((RMWOp == AtomicRMWInst::BAD_BINOP) || (RMWOp == AtomicRMWInst::FAdd)) || (RMWOp == AtomicRMWInst::FSub) || - (RMWOp == AtomicRMWInst::Sub && !IsXLHSInRHSPart); + (RMWOp == AtomicRMWInst::Sub && !IsXBinopExpr); std::pair<Value *, Value *> Res; if (XElemTy->isIntegerTy() && !DoCmpExch) { @@ -3232,7 +3233,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCapture( const LocationDescription &Loc, Instruction *AllocIP, AtomicOpValue &X, AtomicOpValue &V, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, - bool UpdateExpr, bool IsPostfixUpdate, bool IsXLHSInRHSPart) { + bool 
UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr) { if (!updateToLocation(Loc)) return Loc.IP; @@ -3251,9 +3252,8 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCapture( // If UpdateExpr is 'x' updated with some `expr` not based on 'x', // 'x' is simply atomically rewritten with 'expr'. AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg); - std::pair<Value *, Value *> Result = - emitAtomicUpdate(AllocIP, X.Var, Expr, AO, AtomicOp, UpdateOp, - X.IsVolatile, IsXLHSInRHSPart); + std::pair<Value *, Value *> Result = emitAtomicUpdate( + AllocIP, X.Var, Expr, AO, AtomicOp, UpdateOp, X.IsVolatile, IsXBinopExpr); Value *CapturedVal = (IsPostfixUpdate ? Result.first : Result.second); Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile); @@ -3321,7 +3321,16 @@ void CanonicalLoopInfo::collectControlBlocks( // flow. For consistency, this also means we do not add the Body block, which // is just the entry to the body code. BBs.reserve(BBs.size() + 6); - BBs.append({Preheader, Header, Cond, Latch, Exit, After}); + BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()}); +} + +BasicBlock *CanonicalLoopInfo::getPreheader() const { + assert(isValid() && "Requires a valid canonical loop"); + for (BasicBlock *Pred : predecessors(Header)) { + if (Pred != Latch) + return Pred; + } + llvm_unreachable("Missing preheader"); } void CanonicalLoopInfo::assertOK() const { @@ -3330,6 +3339,10 @@ void CanonicalLoopInfo::assertOK() const { if (!isValid()) return; + BasicBlock *Preheader = getPreheader(); + BasicBlock *Body = getBody(); + BasicBlock *After = getAfter(); + // Verify standard control-flow we use for OpenMP loops. 
assert(Preheader); assert(isa<BranchInst>(Preheader->getTerminator()) && @@ -3415,11 +3428,8 @@ void CanonicalLoopInfo::assertOK() const { } void CanonicalLoopInfo::invalidate() { - Preheader = nullptr; Header = nullptr; Cond = nullptr; - Body = nullptr; Latch = nullptr; Exit = nullptr; - After = nullptr; } diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index c9748e1387eb..bbe0c97e60a2 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -512,10 +512,8 @@ void TypePrinting::incorporateTypes() { // the unnamed ones out to a numbering and remove the anonymous structs. unsigned NextNumber = 0; - std::vector<StructType*>::iterator NextToUse = NamedTypes.begin(), I, E; - for (I = NamedTypes.begin(), E = NamedTypes.end(); I != E; ++I) { - StructType *STy = *I; - + std::vector<StructType *>::iterator NextToUse = NamedTypes.begin(); + for (StructType *STy : NamedTypes) { // Ignore anonymous types. if (STy->isLiteral()) continue; @@ -1450,6 +1448,12 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, return; } + if (const auto *NC = dyn_cast<NoCFIValue>(CV)) { + Out << "no_cfi "; + WriteAsOperandInternal(Out, NC->getGlobalValue(), WriterCtx); + return; + } + if (const ConstantArray *CA = dyn_cast<ConstantArray>(CV)) { Type *ETy = CA->getType()->getElementType(); Out << '['; @@ -1583,11 +1587,9 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, Out << ", "; } - if (CE->hasIndices()) { - ArrayRef<unsigned> Indices = CE->getIndices(); - for (unsigned i = 0, e = Indices.size(); i != e; ++i) - Out << ", " << Indices[i]; - } + if (CE->hasIndices()) + for (unsigned I : CE->getIndices()) + Out << ", " << I; if (CE->isCast()) { Out << " to "; @@ -3528,8 +3530,8 @@ void AssemblyWriter::printGlobal(const GlobalVariable *GV) { } maybePrintComdat(Out, *GV); - if (GV->getAlignment()) - Out << ", align " << GV->getAlignment(); + if (MaybeAlign A = GV->getAlign()) + Out << ", align " << A->value(); 
SmallVector<std::pair<unsigned, MDNode *>, 4> MDs; GV->getAllMetadata(MDs); @@ -3637,13 +3639,13 @@ void AssemblyWriter::printTypeIdentities() { } auto &NamedTypes = TypePrinter.getNamedTypes(); - for (unsigned I = 0, E = NamedTypes.size(); I != E; ++I) { - PrintLLVMName(Out, NamedTypes[I]->getName(), LocalPrefix); + for (StructType *NamedType : NamedTypes) { + PrintLLVMName(Out, NamedType->getName(), LocalPrefix); Out << " = type "; // Make sure we print out at least one level of the type structure, so // that we do not get %FILE = type %FILE - TypePrinter.printStructBody(NamedTypes[I], Out); + TypePrinter.printStructBody(NamedType, Out); Out << '\n'; } } @@ -3757,8 +3759,8 @@ void AssemblyWriter::printFunction(const Function *F) { Out << '"'; } maybePrintComdat(Out, *F); - if (F->getAlignment()) - Out << " align " << F->getAlignment(); + if (MaybeAlign A = F->getAlign()) + Out << " align " << A->value(); if (F->hasGC()) Out << " gc \"" << F->getGC() << '"'; if (F->hasPrefixData()) { @@ -4239,8 +4241,8 @@ void AssemblyWriter::printInstruction(const Instruction &I) { Out << ", "; writeOperand(AI->getArraySize(), true); } - if (AI->getAlignment()) { - Out << ", align " << AI->getAlignment(); + if (MaybeAlign A = AI->getAlign()) { + Out << ", align " << A->value(); } unsigned AddrSpace = AI->getType()->getAddressSpace(); @@ -4310,13 +4312,13 @@ void AssemblyWriter::printInstruction(const Instruction &I) { if (const LoadInst *LI = dyn_cast<LoadInst>(&I)) { if (LI->isAtomic()) writeAtomic(LI->getContext(), LI->getOrdering(), LI->getSyncScopeID()); - if (LI->getAlignment()) - Out << ", align " << LI->getAlignment(); + if (MaybeAlign A = LI->getAlign()) + Out << ", align " << A->value(); } else if (const StoreInst *SI = dyn_cast<StoreInst>(&I)) { if (SI->isAtomic()) writeAtomic(SI->getContext(), SI->getOrdering(), SI->getSyncScopeID()); - if (SI->getAlignment()) - Out << ", align " << SI->getAlignment(); + if (MaybeAlign A = SI->getAlign()) + Out << ", align " << 
A->value(); } else if (const AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(&I)) { writeAtomicCmpXchg(CXI->getContext(), CXI->getSuccessOrdering(), CXI->getFailureOrdering(), CXI->getSyncScopeID()); diff --git a/llvm/lib/IR/AttributeImpl.h b/llvm/lib/IR/AttributeImpl.h index c5bbe6571096..1153fb827b56 100644 --- a/llvm/lib/IR/AttributeImpl.h +++ b/llvm/lib/IR/AttributeImpl.h @@ -253,7 +253,8 @@ public: uint64_t getDereferenceableBytes() const; uint64_t getDereferenceableOrNullBytes() const; std::pair<unsigned, Optional<unsigned>> getAllocSizeArgs() const; - std::pair<unsigned, unsigned> getVScaleRangeArgs() const; + unsigned getVScaleRangeMin() const; + Optional<unsigned> getVScaleRangeMax() const; std::string getAsString(bool InAttrGrp) const; Type *getAttributeType(Attribute::AttrKind Kind) const; diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp index f81a446d6e46..c899afae6cce 100644 --- a/llvm/lib/IR/Attributes.cpp +++ b/llvm/lib/IR/Attributes.cpp @@ -78,15 +78,18 @@ unpackAllocSizeArgs(uint64_t Num) { return std::make_pair(ElemSizeArg, NumElemsArg); } -static uint64_t packVScaleRangeArgs(unsigned MinValue, unsigned MaxValue) { - return uint64_t(MinValue) << 32 | MaxValue; +static uint64_t packVScaleRangeArgs(unsigned MinValue, + Optional<unsigned> MaxValue) { + return uint64_t(MinValue) << 32 | MaxValue.getValueOr(0); } -static std::pair<unsigned, unsigned> unpackVScaleRangeArgs(uint64_t Value) { +static std::pair<unsigned, Optional<unsigned>> +unpackVScaleRangeArgs(uint64_t Value) { unsigned MaxValue = Value & std::numeric_limits<unsigned>::max(); unsigned MinValue = Value >> 32; - return std::make_pair(MinValue, MaxValue); + return std::make_pair(MinValue, + MaxValue > 0 ? 
MaxValue : Optional<unsigned>()); } Attribute Attribute::get(LLVMContext &Context, Attribute::AttrKind Kind, @@ -354,10 +357,16 @@ std::pair<unsigned, Optional<unsigned>> Attribute::getAllocSizeArgs() const { return unpackAllocSizeArgs(pImpl->getValueAsInt()); } -std::pair<unsigned, unsigned> Attribute::getVScaleRangeArgs() const { +unsigned Attribute::getVScaleRangeMin() const { + assert(hasAttribute(Attribute::VScaleRange) && + "Trying to get vscale args from non-vscale attribute"); + return unpackVScaleRangeArgs(pImpl->getValueAsInt()).first; +} + +Optional<unsigned> Attribute::getVScaleRangeMax() const { assert(hasAttribute(Attribute::VScaleRange) && "Trying to get vscale args from non-vscale attribute"); - return unpackVScaleRangeArgs(pImpl->getValueAsInt()); + return unpackVScaleRangeArgs(pImpl->getValueAsInt()).second; } std::string Attribute::getAsString(bool InAttrGrp) const { @@ -428,13 +437,13 @@ std::string Attribute::getAsString(bool InAttrGrp) const { } if (hasAttribute(Attribute::VScaleRange)) { - unsigned MinValue, MaxValue; - std::tie(MinValue, MaxValue) = getVScaleRangeArgs(); + unsigned MinValue = getVScaleRangeMin(); + Optional<unsigned> MaxValue = getVScaleRangeMax(); std::string Result = "vscale_range("; Result += utostr(MinValue); Result += ','; - Result += utostr(MaxValue); + Result += utostr(MaxValue.getValueOr(0)); Result += ')'; return Result; } @@ -717,9 +726,12 @@ std::pair<unsigned, Optional<unsigned>> AttributeSet::getAllocSizeArgs() const { : std::pair<unsigned, Optional<unsigned>>(0, 0); } -std::pair<unsigned, unsigned> AttributeSet::getVScaleRangeArgs() const { - return SetNode ? SetNode->getVScaleRangeArgs() - : std::pair<unsigned, unsigned>(0, 0); +unsigned AttributeSet::getVScaleRangeMin() const { + return SetNode ? SetNode->getVScaleRangeMin() : 1; +} + +Optional<unsigned> AttributeSet::getVScaleRangeMax() const { + return SetNode ? 
SetNode->getVScaleRangeMax() : None; } std::string AttributeSet::getAsString(bool InAttrGrp) const { @@ -897,10 +909,16 @@ AttributeSetNode::getAllocSizeArgs() const { return std::make_pair(0, 0); } -std::pair<unsigned, unsigned> AttributeSetNode::getVScaleRangeArgs() const { +unsigned AttributeSetNode::getVScaleRangeMin() const { if (auto A = findEnumAttribute(Attribute::VScaleRange)) - return A->getVScaleRangeArgs(); - return std::make_pair(0, 0); + return A->getVScaleRangeMin(); + return 1; +} + +Optional<unsigned> AttributeSetNode::getVScaleRangeMax() const { + if (auto A = findEnumAttribute(Attribute::VScaleRange)) + return A->getVScaleRangeMax(); + return None; } std::string AttributeSetNode::getAsString(bool InAttrGrp) const { @@ -1118,16 +1136,21 @@ AttributeList AttributeList::get(LLVMContext &C, AttributeSet FnAttrs, } AttributeList AttributeList::get(LLVMContext &C, unsigned Index, - const AttrBuilder &B) { - if (!B.hasAttributes()) + AttributeSet Attrs) { + if (!Attrs.hasAttributes()) return {}; Index = attrIdxToArrayIdx(Index); SmallVector<AttributeSet, 8> AttrSets(Index + 1); - AttrSets[Index] = AttributeSet::get(C, B); + AttrSets[Index] = Attrs; return getImpl(C, AttrSets); } AttributeList AttributeList::get(LLVMContext &C, unsigned Index, + const AttrBuilder &B) { + return get(C, Index, AttributeSet::get(C, B)); +} + +AttributeList AttributeList::get(LLVMContext &C, unsigned Index, ArrayRef<Attribute::AttrKind> Kinds) { SmallVector<std::pair<unsigned, Attribute>, 8> Attrs; for (const auto K : Kinds) @@ -1623,8 +1646,12 @@ std::pair<unsigned, Optional<unsigned>> AttrBuilder::getAllocSizeArgs() const { return unpackAllocSizeArgs(getRawIntAttr(Attribute::AllocSize)); } -std::pair<unsigned, unsigned> AttrBuilder::getVScaleRangeArgs() const { - return unpackVScaleRangeArgs(getRawIntAttr(Attribute::VScaleRange)); +unsigned AttrBuilder::getVScaleRangeMin() const { + return unpackVScaleRangeArgs(getRawIntAttr(Attribute::VScaleRange)).first; +} + 
+Optional<unsigned> AttrBuilder::getVScaleRangeMax() const { + return unpackVScaleRangeArgs(getRawIntAttr(Attribute::VScaleRange)).second; } AttrBuilder &AttrBuilder::addAlignmentAttr(MaybeAlign Align) { @@ -1669,7 +1696,7 @@ AttrBuilder &AttrBuilder::addAllocSizeAttrFromRawRepr(uint64_t RawArgs) { } AttrBuilder &AttrBuilder::addVScaleRangeAttr(unsigned MinValue, - unsigned MaxValue) { + Optional<unsigned> MaxValue) { return addVScaleRangeAttrFromRawRepr(packVScaleRangeArgs(MinValue, MaxValue)); } diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index d73d1e9c20b3..b8ad2b294b87 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -702,6 +702,31 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { NewFn = Intrinsic::getDeclaration(F->getParent(), IID, Tys); return true; } + + if (Name == "arm.mve.vctp64" && + cast<FixedVectorType>(F->getReturnType())->getNumElements() == 4) { + // A vctp64 returning a v4i1 is converted to return a v2i1. Rename the + // function and deal with it below in UpgradeIntrinsicCall. + rename(F); + return true; + } + // These too are changed to accept a v2i1 instead of the old v4i1. 
+ if (Name == "arm.mve.mull.int.predicated.v2i64.v4i32.v4i1" || + Name == "arm.mve.vqdmull.predicated.v2i64.v4i32.v4i1" || + Name == "arm.mve.vldr.gather.base.predicated.v2i64.v2i64.v4i1" || + Name == "arm.mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v4i1" || + Name == "arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v4i1" || + Name == "arm.mve.vstr.scatter.base.predicated.v2i64.v2i64.v4i1" || + Name == "arm.mve.vstr.scatter.base.wb.predicated.v2i64.v2i64.v4i1" || + Name == "arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v4i1" || + Name == "arm.cde.vcx1q.predicated.v2i64.v4i1" || + Name == "arm.cde.vcx1qa.predicated.v2i64.v4i1" || + Name == "arm.cde.vcx2q.predicated.v2i64.v4i1" || + Name == "arm.cde.vcx2qa.predicated.v2i64.v4i1" || + Name == "arm.cde.vcx3q.predicated.v2i64.v4i1" || + Name == "arm.cde.vcx3qa.predicated.v2i64.v4i1") + return true; + break; } @@ -1803,6 +1828,96 @@ void llvm::UpgradeInlineAsmString(std::string *AsmStr) { } } +static Value *UpgradeARMIntrinsicCall(StringRef Name, CallInst *CI, Function *F, + IRBuilder<> &Builder) { + if (Name == "mve.vctp64.old") { + // Replace the old v4i1 vctp64 with a v2i1 vctp and predicate-casts to the + // correct type. 
+ Value *VCTP = Builder.CreateCall( + Intrinsic::getDeclaration(F->getParent(), Intrinsic::arm_mve_vctp64), + CI->getArgOperand(0), CI->getName()); + Value *C1 = Builder.CreateCall( + Intrinsic::getDeclaration( + F->getParent(), Intrinsic::arm_mve_pred_v2i, + {VectorType::get(Builder.getInt1Ty(), 2, false)}), + VCTP); + return Builder.CreateCall( + Intrinsic::getDeclaration( + F->getParent(), Intrinsic::arm_mve_pred_i2v, + {VectorType::get(Builder.getInt1Ty(), 4, false)}), + C1); + } else if (Name == "mve.mull.int.predicated.v2i64.v4i32.v4i1" || + Name == "mve.vqdmull.predicated.v2i64.v4i32.v4i1" || + Name == "mve.vldr.gather.base.predicated.v2i64.v2i64.v4i1" || + Name == "mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v4i1" || + Name == "mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v4i1" || + Name == "mve.vstr.scatter.base.predicated.v2i64.v2i64.v4i1" || + Name == "mve.vstr.scatter.base.wb.predicated.v2i64.v2i64.v4i1" || + Name == "mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v4i1" || + Name == "cde.vcx1q.predicated.v2i64.v4i1" || + Name == "cde.vcx1qa.predicated.v2i64.v4i1" || + Name == "cde.vcx2q.predicated.v2i64.v4i1" || + Name == "cde.vcx2qa.predicated.v2i64.v4i1" || + Name == "cde.vcx3q.predicated.v2i64.v4i1" || + Name == "cde.vcx3qa.predicated.v2i64.v4i1") { + std::vector<Type *> Tys; + unsigned ID = CI->getIntrinsicID(); + Type *V2I1Ty = FixedVectorType::get(Builder.getInt1Ty(), 2); + switch (ID) { + case Intrinsic::arm_mve_mull_int_predicated: + case Intrinsic::arm_mve_vqdmull_predicated: + case Intrinsic::arm_mve_vldr_gather_base_predicated: + Tys = {CI->getType(), CI->getOperand(0)->getType(), V2I1Ty}; + break; + case Intrinsic::arm_mve_vldr_gather_base_wb_predicated: + case Intrinsic::arm_mve_vstr_scatter_base_predicated: + case Intrinsic::arm_mve_vstr_scatter_base_wb_predicated: + Tys = {CI->getOperand(0)->getType(), CI->getOperand(0)->getType(), + V2I1Ty}; + break; + case Intrinsic::arm_mve_vldr_gather_offset_predicated: + Tys = 
{CI->getType(), CI->getOperand(0)->getType(), + CI->getOperand(1)->getType(), V2I1Ty}; + break; + case Intrinsic::arm_mve_vstr_scatter_offset_predicated: + Tys = {CI->getOperand(0)->getType(), CI->getOperand(1)->getType(), + CI->getOperand(2)->getType(), V2I1Ty}; + break; + case Intrinsic::arm_cde_vcx1q_predicated: + case Intrinsic::arm_cde_vcx1qa_predicated: + case Intrinsic::arm_cde_vcx2q_predicated: + case Intrinsic::arm_cde_vcx2qa_predicated: + case Intrinsic::arm_cde_vcx3q_predicated: + case Intrinsic::arm_cde_vcx3qa_predicated: + Tys = {CI->getOperand(1)->getType(), V2I1Ty}; + break; + default: + llvm_unreachable("Unhandled Intrinsic!"); + } + + std::vector<Value *> Ops; + for (Value *Op : CI->args()) { + Type *Ty = Op->getType(); + if (Ty->getScalarSizeInBits() == 1) { + Value *C1 = Builder.CreateCall( + Intrinsic::getDeclaration( + F->getParent(), Intrinsic::arm_mve_pred_v2i, + {VectorType::get(Builder.getInt1Ty(), 4, false)}), + Op); + Op = Builder.CreateCall( + Intrinsic::getDeclaration(F->getParent(), + Intrinsic::arm_mve_pred_i2v, {V2I1Ty}), + C1); + } + Ops.push_back(Op); + } + + Function *Fn = Intrinsic::getDeclaration(F->getParent(), ID, Tys); + return Builder.CreateCall(Fn, Ops, CI->getName()); + } + llvm_unreachable("Unknown function for ARM CallInst upgrade."); +} + /// Upgrade a call to an old intrinsic. All argument and return casting must be /// provided to seamlessly integrate with existing context. 
void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { @@ -1826,6 +1941,9 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { bool IsNVVM = Name.startswith("nvvm."); if (IsNVVM) Name = Name.substr(5); + bool IsARM = Name.startswith("arm."); + if (IsARM) + Name = Name.substr(4); if (IsX86 && Name.startswith("sse4a.movnt.")) { Module *M = F->getParent(); @@ -2289,14 +2407,12 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { if (CI->arg_size() >= 3) Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); - } else if (IsX86 && (Name.startswith("avx512.mask.loadu."))) { - Rep = UpgradeMaskedLoad(Builder, CI->getArgOperand(0), - CI->getArgOperand(1), CI->getArgOperand(2), - /*Aligned*/false); - } else if (IsX86 && (Name.startswith("avx512.mask.load."))) { - Rep = UpgradeMaskedLoad(Builder, CI->getArgOperand(0), - CI->getArgOperand(1),CI->getArgOperand(2), - /*Aligned*/true); + } else if (IsX86 && Name.startswith("avx512.mask.load")) { + // "avx512.mask.loadu." or "avx512.mask.load." + bool Aligned = Name[16] != 'u'; // "avx512.mask.loadu". 
+ Rep = + UpgradeMaskedLoad(Builder, CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), Aligned); } else if (IsX86 && Name.startswith("avx512.mask.expand.load.")) { auto *ResultTy = cast<FixedVectorType>(CI->getType()); Type *PtrTy = ResultTy->getElementType(); @@ -3649,6 +3765,8 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { F->getParent(), Intrinsic::convert_from_fp16, {Builder.getFloatTy()}), CI->getArgOperand(0), "h2f"); + } else if (IsARM) { + Rep = UpgradeARMIntrinsicCall(Name, CI, F, Builder); } else { llvm_unreachable("Unknown function for CallInst upgrade."); } diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp index ed1956e0f7e9..7beafc485d09 100644 --- a/llvm/lib/IR/BasicBlock.cpp +++ b/llvm/lib/IR/BasicBlock.cpp @@ -450,8 +450,8 @@ BasicBlock *BasicBlock::splitBasicBlockBefore(iterator I, const Twine &BBName) { void BasicBlock::replacePhiUsesWith(BasicBlock *Old, BasicBlock *New) { // N.B. This might not be a complete BasicBlock, so don't assume // that it ends with a non-phi instruction. 
- for (iterator II = begin(), IE = end(); II != IE; ++II) { - PHINode *PN = dyn_cast<PHINode>(II); + for (Instruction &I : *this) { + PHINode *PN = dyn_cast<PHINode>(&I); if (!PN) break; PN->replaceIncomingBlockWith(Old, New); diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp index 437fd0558447..8668fe82601c 100644 --- a/llvm/lib/IR/ConstantFold.cpp +++ b/llvm/lib/IR/ConstantFold.cpp @@ -1801,46 +1801,8 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred, } else if (isa<ConstantFP>(C1) && isa<ConstantFP>(C2)) { const APFloat &C1V = cast<ConstantFP>(C1)->getValueAPF(); const APFloat &C2V = cast<ConstantFP>(C2)->getValueAPF(); - APFloat::cmpResult R = C1V.compare(C2V); - switch (pred) { - default: llvm_unreachable("Invalid FCmp Predicate"); - case FCmpInst::FCMP_FALSE: return Constant::getNullValue(ResultTy); - case FCmpInst::FCMP_TRUE: return Constant::getAllOnesValue(ResultTy); - case FCmpInst::FCMP_UNO: - return ConstantInt::get(ResultTy, R==APFloat::cmpUnordered); - case FCmpInst::FCMP_ORD: - return ConstantInt::get(ResultTy, R!=APFloat::cmpUnordered); - case FCmpInst::FCMP_UEQ: - return ConstantInt::get(ResultTy, R==APFloat::cmpUnordered || - R==APFloat::cmpEqual); - case FCmpInst::FCMP_OEQ: - return ConstantInt::get(ResultTy, R==APFloat::cmpEqual); - case FCmpInst::FCMP_UNE: - return ConstantInt::get(ResultTy, R!=APFloat::cmpEqual); - case FCmpInst::FCMP_ONE: - return ConstantInt::get(ResultTy, R==APFloat::cmpLessThan || - R==APFloat::cmpGreaterThan); - case FCmpInst::FCMP_ULT: - return ConstantInt::get(ResultTy, R==APFloat::cmpUnordered || - R==APFloat::cmpLessThan); - case FCmpInst::FCMP_OLT: - return ConstantInt::get(ResultTy, R==APFloat::cmpLessThan); - case FCmpInst::FCMP_UGT: - return ConstantInt::get(ResultTy, R==APFloat::cmpUnordered || - R==APFloat::cmpGreaterThan); - case FCmpInst::FCMP_OGT: - return ConstantInt::get(ResultTy, R==APFloat::cmpGreaterThan); - case FCmpInst::FCMP_ULE: - return 
ConstantInt::get(ResultTy, R!=APFloat::cmpGreaterThan); - case FCmpInst::FCMP_OLE: - return ConstantInt::get(ResultTy, R==APFloat::cmpLessThan || - R==APFloat::cmpEqual); - case FCmpInst::FCMP_UGE: - return ConstantInt::get(ResultTy, R!=APFloat::cmpLessThan); - case FCmpInst::FCMP_OGE: - return ConstantInt::get(ResultTy, R==APFloat::cmpGreaterThan || - R==APFloat::cmpEqual); - } + CmpInst::Predicate Predicate = CmpInst::Predicate(pred); + return ConstantInt::get(ResultTy, FCmpInst::compare(C1V, C2V, Predicate)); } else if (auto *C1VTy = dyn_cast<VectorType>(C1->getType())) { // Fast path for splatted constants. @@ -2215,9 +2177,8 @@ Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C, if (C->isNullValue()) { bool isNull = true; - for (unsigned i = 0, e = Idxs.size(); i != e; ++i) - if (!isa<UndefValue>(Idxs[i]) && - !cast<Constant>(Idxs[i])->isNullValue()) { + for (Value *Idx : Idxs) + if (!isa<UndefValue>(Idx) && !cast<Constant>(Idx)->isNullValue()) { isNull = false; break; } @@ -2233,8 +2194,8 @@ Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C, // The GEP returns a vector of pointers when one of more of // its arguments is a vector. 
- for (unsigned i = 0, e = Idxs.size(); i != e; ++i) { - if (auto *VT = dyn_cast<VectorType>(Idxs[i]->getType())) { + for (Value *Idx : Idxs) { + if (auto *VT = dyn_cast<VectorType>(Idx->getType())) { assert((!isa<VectorType>(GEPTy) || isa<ScalableVectorType>(GEPTy) == isa<ScalableVectorType>(VT)) && "Mismatched GEPTy vector types"); diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp index c66cfb6e9ac1..837be910f6d8 100644 --- a/llvm/lib/IR/Constants.cpp +++ b/llvm/lib/IR/Constants.cpp @@ -535,6 +535,9 @@ void llvm::deleteConstant(Constant *C) { case Constant::DSOLocalEquivalentVal: delete static_cast<DSOLocalEquivalent *>(C); break; + case Constant::NoCFIValueVal: + delete static_cast<NoCFIValue *>(C); + break; case Constant::UndefValueVal: delete static_cast<UndefValue *>(C); break; @@ -1296,9 +1299,10 @@ Constant *ConstantArray::getImpl(ArrayType *Ty, ArrayRef<Constant*> V) { if (V.empty()) return ConstantAggregateZero::get(Ty); - for (unsigned i = 0, e = V.size(); i != e; ++i) { - assert(V[i]->getType() == Ty->getElementType() && + for (Constant *C : V) { + assert(C->getType() == Ty->getElementType() && "Wrong type in array element initializer"); + (void)C; } // If this is an all-zero array, return a ConstantAggregateZero object. If @@ -1364,12 +1368,12 @@ Constant *ConstantStruct::get(StructType *ST, ArrayRef<Constant*> V) { isZero = V[0]->isNullValue(); // PoisonValue inherits UndefValue, so its check is not necessary. 
if (isUndef || isZero) { - for (unsigned i = 0, e = V.size(); i != e; ++i) { - if (!V[i]->isNullValue()) + for (Constant *C : V) { + if (!C->isNullValue()) isZero = false; - if (!isa<PoisonValue>(V[i])) + if (!isa<PoisonValue>(C)) isPoison = false; - if (isa<PoisonValue>(V[i]) || !isa<UndefValue>(V[i])) + if (isa<PoisonValue>(C) || !isa<UndefValue>(C)) isUndef = false; } } @@ -1962,6 +1966,47 @@ Value *DSOLocalEquivalent::handleOperandChangeImpl(Value *From, Value *To) { return nullptr; } +NoCFIValue *NoCFIValue::get(GlobalValue *GV) { + NoCFIValue *&NC = GV->getContext().pImpl->NoCFIValues[GV]; + if (!NC) + NC = new NoCFIValue(GV); + + assert(NC->getGlobalValue() == GV && + "NoCFIValue does not match the expected global value"); + return NC; +} + +NoCFIValue::NoCFIValue(GlobalValue *GV) + : Constant(GV->getType(), Value::NoCFIValueVal, &Op<0>(), 1) { + setOperand(0, GV); +} + +/// Remove the constant from the constant table. +void NoCFIValue::destroyConstantImpl() { + const GlobalValue *GV = getGlobalValue(); + GV->getContext().pImpl->NoCFIValues.erase(GV); +} + +Value *NoCFIValue::handleOperandChangeImpl(Value *From, Value *To) { + assert(From == getGlobalValue() && "Changing value does not match operand."); + + GlobalValue *GV = dyn_cast<GlobalValue>(To->stripPointerCasts()); + assert(GV && "Can only replace the operands with a global value"); + + NoCFIValue *&NewNC = getContext().pImpl->NoCFIValues[GV]; + if (NewNC) + return llvm::ConstantExpr::getBitCast(NewNC, getType()); + + getContext().pImpl->NoCFIValues.erase(getGlobalValue()); + NewNC = this; + setOperand(0, GV); + + if (GV->getType() != getType()) + mutateType(GV->getType()); + + return nullptr; +} + //---- ConstantExpr::get() implementations. 
// diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp index 2c396ae97499..a263d2536541 100644 --- a/llvm/lib/IR/Core.cpp +++ b/llvm/lib/IR/Core.cpp @@ -1696,6 +1696,14 @@ LLVMValueRef LLVMConstGEP(LLVMValueRef ConstantVal, return wrap(ConstantExpr::getGetElementPtr(Ty, Val, IdxList)); } +LLVMValueRef LLVMConstGEP2(LLVMTypeRef Ty, LLVMValueRef ConstantVal, + LLVMValueRef *ConstantIndices, unsigned NumIndices) { + ArrayRef<Constant *> IdxList(unwrap<Constant>(ConstantIndices, NumIndices), + NumIndices); + Constant *Val = unwrap<Constant>(ConstantVal); + return wrap(ConstantExpr::getGetElementPtr(unwrap(Ty), Val, IdxList)); +} + LLVMValueRef LLVMConstInBoundsGEP(LLVMValueRef ConstantVal, LLVMValueRef *ConstantIndices, unsigned NumIndices) { @@ -1707,6 +1715,15 @@ LLVMValueRef LLVMConstInBoundsGEP(LLVMValueRef ConstantVal, return wrap(ConstantExpr::getInBoundsGetElementPtr(Ty, Val, IdxList)); } +LLVMValueRef LLVMConstInBoundsGEP2(LLVMTypeRef Ty, LLVMValueRef ConstantVal, + LLVMValueRef *ConstantIndices, + unsigned NumIndices) { + ArrayRef<Constant *> IdxList(unwrap<Constant>(ConstantIndices, NumIndices), + NumIndices); + Constant *Val = unwrap<Constant>(ConstantVal); + return wrap(ConstantExpr::getInBoundsGetElementPtr(unwrap(Ty), Val, IdxList)); +} + LLVMValueRef LLVMConstTrunc(LLVMValueRef ConstantVal, LLVMTypeRef ToType) { return wrap(ConstantExpr::getTrunc(unwrap<Constant>(ConstantVal), unwrap(ToType))); @@ -3007,13 +3024,17 @@ LLVMTypeRef LLVMGetAllocatedType(LLVMValueRef Alloca) { /*--.. 
Operations on gep instructions (only) ...............................--*/ LLVMBool LLVMIsInBounds(LLVMValueRef GEP) { - return unwrap<GetElementPtrInst>(GEP)->isInBounds(); + return unwrap<GEPOperator>(GEP)->isInBounds(); } void LLVMSetIsInBounds(LLVMValueRef GEP, LLVMBool InBounds) { return unwrap<GetElementPtrInst>(GEP)->setIsInBounds(InBounds); } +LLVMTypeRef LLVMGetGEPSourceElementType(LLVMValueRef GEP) { + return wrap(unwrap<GEPOperator>(GEP)->getSourceElementType()); +} + /*--.. Operations on phi nodes .............................................--*/ void LLVMAddIncoming(LLVMValueRef PhiNode, LLVMValueRef *IncomingValues, @@ -3039,7 +3060,7 @@ LLVMBasicBlockRef LLVMGetIncomingBlock(LLVMValueRef PhiNode, unsigned Index) { unsigned LLVMGetNumIndices(LLVMValueRef Inst) { auto *I = unwrap(Inst); - if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) + if (auto *GEP = dyn_cast<GEPOperator>(I)) return GEP->getNumIndices(); if (auto *EV = dyn_cast<ExtractValueInst>(I)) return EV->getNumIndices(); diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp index 548962bd6a98..35af22034a12 100644 --- a/llvm/lib/IR/DIBuilder.cpp +++ b/llvm/lib/IR/DIBuilder.cpp @@ -671,11 +671,11 @@ DIBuilder::getOrCreateMacroArray(ArrayRef<Metadata *> Elements) { DITypeRefArray DIBuilder::getOrCreateTypeArray(ArrayRef<Metadata *> Elements) { SmallVector<llvm::Metadata *, 16> Elts; - for (unsigned i = 0, e = Elements.size(); i != e; ++i) { - if (Elements[i] && isa<MDNode>(Elements[i])) - Elts.push_back(cast<DIType>(Elements[i])); + for (Metadata *E : Elements) { + if (isa_and_nonnull<MDNode>(E)) + Elts.push_back(cast<DIType>(E)); else - Elts.push_back(Elements[i]); + Elts.push_back(E); } return DITypeRefArray(MDNode::get(VMContext, Elts)); } diff --git a/llvm/lib/IR/DataLayout.cpp b/llvm/lib/IR/DataLayout.cpp index 2ace18048262..61b2b13bfd03 100644 --- a/llvm/lib/IR/DataLayout.cpp +++ b/llvm/lib/IR/DataLayout.cpp @@ -124,26 +124,25 @@ LayoutAlignElem::operator==(const LayoutAlignElem 
&rhs) const { // PointerAlignElem, PointerAlign support //===----------------------------------------------------------------------===// -PointerAlignElem PointerAlignElem::get(uint32_t AddressSpace, Align ABIAlign, - Align PrefAlign, uint32_t TypeByteWidth, - uint32_t IndexWidth) { +PointerAlignElem PointerAlignElem::getInBits(uint32_t AddressSpace, + Align ABIAlign, Align PrefAlign, + uint32_t TypeBitWidth, + uint32_t IndexBitWidth) { assert(ABIAlign <= PrefAlign && "Preferred alignment worse than ABI!"); PointerAlignElem retval; retval.AddressSpace = AddressSpace; retval.ABIAlign = ABIAlign; retval.PrefAlign = PrefAlign; - retval.TypeByteWidth = TypeByteWidth; - retval.IndexWidth = IndexWidth; + retval.TypeBitWidth = TypeBitWidth; + retval.IndexBitWidth = IndexBitWidth; return retval; } bool PointerAlignElem::operator==(const PointerAlignElem &rhs) const { - return (ABIAlign == rhs.ABIAlign - && AddressSpace == rhs.AddressSpace - && PrefAlign == rhs.PrefAlign - && TypeByteWidth == rhs.TypeByteWidth - && IndexWidth == rhs.IndexWidth); + return (ABIAlign == rhs.ABIAlign && AddressSpace == rhs.AddressSpace && + PrefAlign == rhs.PrefAlign && TypeBitWidth == rhs.TypeBitWidth && + IndexBitWidth == rhs.IndexBitWidth); } //===----------------------------------------------------------------------===// @@ -197,7 +196,7 @@ void DataLayout::reset(StringRef Desc) { E.PrefAlign, E.TypeBitWidth)) return report_fatal_error(std::move(Err)); } - if (Error Err = setPointerAlignment(0, Align(8), Align(8), 8, 8)) + if (Error Err = setPointerAlignmentInBits(0, Align(8), Align(8), 64, 64)) return report_fatal_error(std::move(Err)); if (Error Err = parseSpecifier(Desc)) @@ -318,7 +317,7 @@ Error DataLayout::parseSpecifier(StringRef Desc) { if (Error Err = ::split(Rest, ':', Split)) return Err; unsigned PointerMemSize; - if (Error Err = getIntInBytes(Tok, PointerMemSize)) + if (Error Err = getInt(Tok, PointerMemSize)) return Err; if (!PointerMemSize) return reportError("Invalid pointer 
size of 0 bytes"); @@ -354,13 +353,13 @@ Error DataLayout::parseSpecifier(StringRef Desc) { if (!Rest.empty()) { if (Error Err = ::split(Rest, ':', Split)) return Err; - if (Error Err = getIntInBytes(Tok, IndexSize)) + if (Error Err = getInt(Tok, IndexSize)) return Err; if (!IndexSize) return reportError("Invalid index size of 0 bytes"); } } - if (Error Err = setPointerAlignment( + if (Error Err = setPointerAlignmentInBits( AddrSpace, assumeAligned(PointerABIAlign), assumeAligned(PointerPrefAlign), PointerMemSize, IndexSize)) return Err; @@ -603,9 +602,10 @@ DataLayout::getPointerAlignElem(uint32_t AddressSpace) const { return Pointers[0]; } -Error DataLayout::setPointerAlignment(uint32_t AddrSpace, Align ABIAlign, - Align PrefAlign, uint32_t TypeByteWidth, - uint32_t IndexWidth) { +Error DataLayout::setPointerAlignmentInBits(uint32_t AddrSpace, Align ABIAlign, + Align PrefAlign, + uint32_t TypeBitWidth, + uint32_t IndexBitWidth) { if (PrefAlign < ABIAlign) return reportError( "Preferred alignment cannot be less than the ABI alignment"); @@ -615,13 +615,14 @@ Error DataLayout::setPointerAlignment(uint32_t AddrSpace, Align ABIAlign, return A.AddressSpace < AddressSpace; }); if (I == Pointers.end() || I->AddressSpace != AddrSpace) { - Pointers.insert(I, PointerAlignElem::get(AddrSpace, ABIAlign, PrefAlign, - TypeByteWidth, IndexWidth)); + Pointers.insert(I, + PointerAlignElem::getInBits(AddrSpace, ABIAlign, PrefAlign, + TypeBitWidth, IndexBitWidth)); } else { I->ABIAlign = ABIAlign; I->PrefAlign = PrefAlign; - I->TypeByteWidth = TypeByteWidth; - I->IndexWidth = IndexWidth; + I->TypeBitWidth = TypeBitWidth; + I->IndexBitWidth = IndexBitWidth; } return Error::success(); } @@ -704,13 +705,14 @@ Align DataLayout::getPointerPrefAlignment(unsigned AS) const { } unsigned DataLayout::getPointerSize(unsigned AS) const { - return getPointerAlignElem(AS).TypeByteWidth; + return divideCeil(getPointerAlignElem(AS).TypeBitWidth, 8); } unsigned DataLayout::getMaxIndexSize() const { 
unsigned MaxIndexSize = 0; for (auto &P : Pointers) - MaxIndexSize = std::max(MaxIndexSize, P.IndexWidth); + MaxIndexSize = + std::max(MaxIndexSize, (unsigned)divideCeil(P.IndexBitWidth, 8)); return MaxIndexSize; } @@ -723,7 +725,7 @@ unsigned DataLayout::getPointerTypeSizeInBits(Type *Ty) const { } unsigned DataLayout::getIndexSize(unsigned AS) const { - return getPointerAlignElem(AS).IndexWidth; + return divideCeil(getPointerAlignElem(AS).IndexBitWidth, 8); } unsigned DataLayout::getIndexTypeSizeInBits(Type *Ty) const { @@ -901,16 +903,14 @@ int64_t DataLayout::getIndexedOffsetInType(Type *ElemTy, return Result; } -static void addElementIndex(SmallVectorImpl<APInt> &Indices, TypeSize ElemSize, - APInt &Offset) { +static APInt getElementIndex(TypeSize ElemSize, APInt &Offset) { // Skip over scalable or zero size elements. Also skip element sizes larger // than the positive index space, because the arithmetic below may not be // correct in that case. unsigned BitWidth = Offset.getBitWidth(); if (ElemSize.isScalable() || ElemSize == 0 || !isUIntN(BitWidth - 1, ElemSize)) { - Indices.push_back(APInt::getZero(BitWidth)); - return; + return APInt::getZero(BitWidth); } APInt Index = Offset.sdiv(ElemSize); @@ -921,47 +921,52 @@ static void addElementIndex(SmallVectorImpl<APInt> &Indices, TypeSize ElemSize, Offset += ElemSize; assert(Offset.isNonNegative() && "Remaining offset shouldn't be negative"); } - Indices.push_back(Index); + return Index; } -SmallVector<APInt> DataLayout::getGEPIndicesForOffset(Type *&ElemTy, - APInt &Offset) const { - assert(ElemTy->isSized() && "Element type must be sized"); - SmallVector<APInt> Indices; - addElementIndex(Indices, getTypeAllocSize(ElemTy), Offset); - while (Offset != 0) { - if (auto *ArrTy = dyn_cast<ArrayType>(ElemTy)) { - ElemTy = ArrTy->getElementType(); - addElementIndex(Indices, getTypeAllocSize(ElemTy), Offset); - continue; - } +Optional<APInt> DataLayout::getGEPIndexForOffset(Type *&ElemTy, + APInt &Offset) const { + if 
(auto *ArrTy = dyn_cast<ArrayType>(ElemTy)) { + ElemTy = ArrTy->getElementType(); + return getElementIndex(getTypeAllocSize(ElemTy), Offset); + } - if (auto *VecTy = dyn_cast<VectorType>(ElemTy)) { - ElemTy = VecTy->getElementType(); - unsigned ElemSizeInBits = getTypeSizeInBits(ElemTy).getFixedSize(); - // GEPs over non-multiple of 8 size vector elements are invalid. - if (ElemSizeInBits % 8 != 0) - break; + if (auto *VecTy = dyn_cast<VectorType>(ElemTy)) { + ElemTy = VecTy->getElementType(); + unsigned ElemSizeInBits = getTypeSizeInBits(ElemTy).getFixedSize(); + // GEPs over non-multiple of 8 size vector elements are invalid. + if (ElemSizeInBits % 8 != 0) + return None; - addElementIndex(Indices, TypeSize::Fixed(ElemSizeInBits / 8), Offset); - continue; - } + return getElementIndex(TypeSize::Fixed(ElemSizeInBits / 8), Offset); + } - if (auto *STy = dyn_cast<StructType>(ElemTy)) { - const StructLayout *SL = getStructLayout(STy); - uint64_t IntOffset = Offset.getZExtValue(); - if (IntOffset >= SL->getSizeInBytes()) - break; + if (auto *STy = dyn_cast<StructType>(ElemTy)) { + const StructLayout *SL = getStructLayout(STy); + uint64_t IntOffset = Offset.getZExtValue(); + if (IntOffset >= SL->getSizeInBytes()) + return None; - unsigned Index = SL->getElementContainingOffset(IntOffset); - Offset -= SL->getElementOffset(Index); - ElemTy = STy->getElementType(Index); - Indices.push_back(APInt(32, Index)); - continue; - } + unsigned Index = SL->getElementContainingOffset(IntOffset); + Offset -= SL->getElementOffset(Index); + ElemTy = STy->getElementType(Index); + return APInt(32, Index); + } + + // Non-aggregate type. + return None; +} - // Can't index into non-aggregate type. 
- break; +SmallVector<APInt> DataLayout::getGEPIndicesForOffset(Type *&ElemTy, + APInt &Offset) const { + assert(ElemTy->isSized() && "Element type must be sized"); + SmallVector<APInt> Indices; + Indices.push_back(getElementIndex(getTypeAllocSize(ElemTy), Offset)); + while (Offset != 0) { + Optional<APInt> Index = getGEPIndexForOffset(ElemTy, Offset); + if (!Index) + break; + Indices.push_back(*Index); } return Indices; diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp index 82b20a8af91b..f1a6402fb11b 100644 --- a/llvm/lib/IR/Function.cpp +++ b/llvm/lib/IR/Function.cpp @@ -980,7 +980,10 @@ enum IIT_Info { IIT_STRUCT9 = 49, IIT_V256 = 50, IIT_AMX = 51, - IIT_PPCF128 = 52 + IIT_PPCF128 = 52, + IIT_V3 = 53, + IIT_EXTERNREF = 54, + IIT_FUNCREF = 55 }; static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos, @@ -1056,6 +1059,10 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos, OutputTable.push_back(IITDescriptor::getVector(2, IsScalableVector)); DecodeIITType(NextElt, Infos, Info, OutputTable); return; + case IIT_V3: + OutputTable.push_back(IITDescriptor::getVector(3, IsScalableVector)); + DecodeIITType(NextElt, Infos, Info, OutputTable); + return; case IIT_V4: OutputTable.push_back(IITDescriptor::getVector(4, IsScalableVector)); DecodeIITType(NextElt, Infos, Info, OutputTable); @@ -1092,6 +1099,14 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos, OutputTable.push_back(IITDescriptor::getVector(1024, IsScalableVector)); DecodeIITType(NextElt, Infos, Info, OutputTable); return; + case IIT_EXTERNREF: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Pointer, 10)); + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Struct, 0)); + return; + case IIT_FUNCREF: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Pointer, 20)); + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Integer, 8)); + return; case IIT_PTR: 
OutputTable.push_back(IITDescriptor::get(IITDescriptor::Pointer, 0)); DecodeIITType(NextElt, Infos, Info, OutputTable); diff --git a/llvm/lib/IR/Globals.cpp b/llvm/lib/IR/Globals.cpp index 9f38288095e3..b6bd25aa1234 100644 --- a/llvm/lib/IR/Globals.cpp +++ b/llvm/lib/IR/Globals.cpp @@ -126,7 +126,7 @@ void GlobalObject::setAlignment(MaybeAlign Align) { void GlobalObject::copyAttributesFrom(const GlobalObject *Src) { GlobalValue::copyAttributesFrom(Src); - setAlignment(MaybeAlign(Src->getAlignment())); + setAlignment(Src->getAlign()); setSection(Src->getSection()); } @@ -249,7 +249,7 @@ bool GlobalObject::canIncreaseAlignment() const { // alignment specified. (If it is assigned a section, the global // could be densely packed with other objects in the section, and // increasing the alignment could cause padding issues.) - if (hasSection() && getAlignment() > 0) + if (hasSection() && getAlign().hasValue()) return false; // On ELF platforms, we're further restricted in that we can't diff --git a/llvm/lib/IR/InlineAsm.cpp b/llvm/lib/IR/InlineAsm.cpp index 56932b457225..a0c48781ced5 100644 --- a/llvm/lib/IR/InlineAsm.cpp +++ b/llvm/lib/IR/InlineAsm.cpp @@ -262,12 +262,12 @@ bool InlineAsm::Verify(FunctionType *Ty, StringRef ConstStr) { unsigned NumOutputs = 0, NumInputs = 0, NumClobbers = 0; unsigned NumIndirect = 0; - for (unsigned i = 0, e = Constraints.size(); i != e; ++i) { - switch (Constraints[i].Type) { + for (const ConstraintInfo &Constraint : Constraints) { + switch (Constraint.Type) { case InlineAsm::isOutput: if ((NumInputs-NumIndirect) != 0 || NumClobbers != 0) return false; // outputs before inputs and clobbers. 
- if (!Constraints[i].isIndirect) { + if (!Constraint.isIndirect) { ++NumOutputs; break; } diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp index a4659da7e807..4480ec799c35 100644 --- a/llvm/lib/IR/Instruction.cpp +++ b/llvm/lib/IR/Instruction.cpp @@ -166,7 +166,10 @@ void Instruction::dropPoisonGeneratingFlags() { cast<GetElementPtrInst>(this)->setIsInBounds(false); break; } - // TODO: FastMathFlags! + if (isa<FPMathOperator>(this)) { + setHasNoNaNs(false); + setHasNoInfs(false); + } assert(!hasPoisonGeneratingFlags() && "must be kept in sync"); } @@ -436,17 +439,17 @@ static bool haveSameSpecialState(const Instruction *I1, const Instruction *I2, if (const AllocaInst *AI = dyn_cast<AllocaInst>(I1)) return AI->getAllocatedType() == cast<AllocaInst>(I2)->getAllocatedType() && - (AI->getAlignment() == cast<AllocaInst>(I2)->getAlignment() || + (AI->getAlign() == cast<AllocaInst>(I2)->getAlign() || IgnoreAlignment); if (const LoadInst *LI = dyn_cast<LoadInst>(I1)) return LI->isVolatile() == cast<LoadInst>(I2)->isVolatile() && - (LI->getAlignment() == cast<LoadInst>(I2)->getAlignment() || + (LI->getAlign() == cast<LoadInst>(I2)->getAlign() || IgnoreAlignment) && LI->getOrdering() == cast<LoadInst>(I2)->getOrdering() && LI->getSyncScopeID() == cast<LoadInst>(I2)->getSyncScopeID(); if (const StoreInst *SI = dyn_cast<StoreInst>(I1)) return SI->isVolatile() == cast<StoreInst>(I2)->isVolatile() && - (SI->getAlignment() == cast<StoreInst>(I2)->getAlignment() || + (SI->getAlign() == cast<StoreInst>(I2)->getAlign() || IgnoreAlignment) && SI->getOrdering() == cast<StoreInst>(I2)->getOrdering() && SI->getSyncScopeID() == cast<StoreInst>(I2)->getSyncScopeID(); diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index ad27a6d8c08e..7798af3b19b9 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -1410,8 +1410,6 @@ bool AllocaInst::isStaticAlloca() const { void LoadInst::AssertOK() { 
assert(getOperand(0)->getType()->isPointerTy() && "Ptr must have pointer type."); - assert(!(isAtomic() && getAlignment() == 0) && - "Alignment required for atomic load"); } static Align computeLoadStoreDefaultAlign(Type *Ty, BasicBlock *BB) { @@ -1490,8 +1488,6 @@ void StoreInst::AssertOK() { assert(cast<PointerType>(getOperand(1)->getType()) ->isOpaqueOrPointeeTypeMatches(getOperand(0)->getType()) && "Ptr must be a pointer to Val type!"); - assert(!(isAtomic() && getAlignment() == 0) && - "Alignment required for atomic store"); } StoreInst::StoreInst(Value *val, Value *addr, Instruction *InsertBefore) @@ -2328,7 +2324,6 @@ bool ShuffleVectorInst::isInsertSubvectorMask(ArrayRef<int> Mask, } Src1Elts.setBit(i); Src1Identity &= (M == (i + NumSrcElts)); - continue; } assert((Src0Elts | Src1Elts | UndefElts).isAllOnes() && "unknown shuffle elements"); @@ -4165,6 +4160,47 @@ bool ICmpInst::compare(const APInt &LHS, const APInt &RHS, }; } +bool FCmpInst::compare(const APFloat &LHS, const APFloat &RHS, + FCmpInst::Predicate Pred) { + APFloat::cmpResult R = LHS.compare(RHS); + switch (Pred) { + default: + llvm_unreachable("Invalid FCmp Predicate"); + case FCmpInst::FCMP_FALSE: + return false; + case FCmpInst::FCMP_TRUE: + return true; + case FCmpInst::FCMP_UNO: + return R == APFloat::cmpUnordered; + case FCmpInst::FCMP_ORD: + return R != APFloat::cmpUnordered; + case FCmpInst::FCMP_UEQ: + return R == APFloat::cmpUnordered || R == APFloat::cmpEqual; + case FCmpInst::FCMP_OEQ: + return R == APFloat::cmpEqual; + case FCmpInst::FCMP_UNE: + return R != APFloat::cmpEqual; + case FCmpInst::FCMP_ONE: + return R == APFloat::cmpLessThan || R == APFloat::cmpGreaterThan; + case FCmpInst::FCMP_ULT: + return R == APFloat::cmpUnordered || R == APFloat::cmpLessThan; + case FCmpInst::FCMP_OLT: + return R == APFloat::cmpLessThan; + case FCmpInst::FCMP_UGT: + return R == APFloat::cmpUnordered || R == APFloat::cmpGreaterThan; + case FCmpInst::FCMP_OGT: + return R == APFloat::cmpGreaterThan; 
+ case FCmpInst::FCMP_ULE: + return R != APFloat::cmpGreaterThan; + case FCmpInst::FCMP_OLE: + return R == APFloat::cmpLessThan || R == APFloat::cmpEqual; + case FCmpInst::FCMP_UGE: + return R != APFloat::cmpLessThan; + case FCmpInst::FCMP_OGE: + return R == APFloat::cmpGreaterThan || R == APFloat::cmpEqual; + } +} + CmpInst::Predicate CmpInst::getFlippedSignednessPredicate(Predicate pred) { assert(CmpInst::isRelational(pred) && "Call only with non-equality predicates!"); @@ -4411,7 +4447,7 @@ void SwitchInstProfUpdateWrapper::addCase( Weights.getValue()[SI.getNumSuccessors() - 1] = *W; } else if (Weights) { Changed = true; - Weights.getValue().push_back(W ? *W : 0); + Weights.getValue().push_back(W.getValueOr(0)); } if (Weights) assert(SI.getNumSuccessors() == Weights->size() && diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp index 9206cd37a6d1..8f7318665cfb 100644 --- a/llvm/lib/IR/IntrinsicInst.cpp +++ b/llvm/lib/IR/IntrinsicInst.cpp @@ -468,6 +468,7 @@ bool VPIntrinsic::canIgnoreVectorLengthParam() const { } Function *VPIntrinsic::getDeclarationForParams(Module *M, Intrinsic::ID VPID, + Type *ReturnType, ArrayRef<Value *> Params) { assert(isVPIntrinsic(VPID) && "not a VP intrinsic"); Function *VPFunc; @@ -486,22 +487,15 @@ Function *VPIntrinsic::getDeclarationForParams(Module *M, Intrinsic::ID VPID, break; case Intrinsic::vp_load: VPFunc = Intrinsic::getDeclaration( - M, VPID, - {Params[0]->getType()->getPointerElementType(), Params[0]->getType()}); + M, VPID, {ReturnType, Params[0]->getType()}); break; case Intrinsic::vp_gather: VPFunc = Intrinsic::getDeclaration( - M, VPID, - {VectorType::get(cast<VectorType>(Params[0]->getType()) - ->getElementType() - ->getPointerElementType(), - cast<VectorType>(Params[0]->getType())), - Params[0]->getType()}); + M, VPID, {ReturnType, Params[0]->getType()}); break; case Intrinsic::vp_store: VPFunc = Intrinsic::getDeclaration( - M, VPID, - {Params[1]->getType()->getPointerElementType(), 
Params[1]->getType()}); + M, VPID, {Params[0]->getType(), Params[1]->getType()}); break; case Intrinsic::vp_scatter: VPFunc = Intrinsic::getDeclaration( diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h index b2909c425846..24c4a348f4da 100644 --- a/llvm/lib/IR/LLVMContextImpl.h +++ b/llvm/lib/IR/LLVMContextImpl.h @@ -386,8 +386,9 @@ template <> struct MDNodeKeyImpl<DIEnumerator> { IsUnsigned(N->isUnsigned()) {} bool isKeyOf(const DIEnumerator *RHS) const { - return APInt::isSameValue(Value, RHS->getValue()) && - IsUnsigned == RHS->isUnsigned() && Name == RHS->getRawName(); + return Value.getBitWidth() == RHS->getValue().getBitWidth() && + Value == RHS->getValue() && IsUnsigned == RHS->isUnsigned() && + Name == RHS->getRawName(); } unsigned getHashValue() const { return hash_combine(Value, Name); } @@ -1424,6 +1425,8 @@ public: DenseMap<const GlobalValue *, DSOLocalEquivalent *> DSOLocalEquivalents; + DenseMap<const GlobalValue *, NoCFIValue *> NoCFIValues; + ConstantUniqueMap<ConstantExpr> ExprConstants; ConstantUniqueMap<InlineAsm> InlineAsms; diff --git a/llvm/lib/IR/LegacyPassManager.cpp b/llvm/lib/IR/LegacyPassManager.cpp index 7bccf09012ca..bb72bec93066 100644 --- a/llvm/lib/IR/LegacyPassManager.cpp +++ b/llvm/lib/IR/LegacyPassManager.cpp @@ -886,9 +886,8 @@ void PMDataManager::recordAvailableAnalysis(Pass *P) { // implements as well. const PassInfo *PInf = TPM->findAnalysisPassInfo(PI); if (!PInf) return; - const std::vector<const PassInfo*> &II = PInf->getInterfacesImplemented(); - for (unsigned i = 0, e = II.size(); i != e; ++i) - AvailableAnalysis[II[i]->getTypeInfo()] = P; + for (const PassInfo *PI : PInf->getInterfacesImplemented()) + AvailableAnalysis[PI->getTypeInfo()] = P; } // Return true if P preserves high level analysis used by other @@ -1013,10 +1012,9 @@ void PMDataManager::freePass(Pass *P, StringRef Msg, // Remove all interfaces this pass implements, for which it is also // listed as the available implementation. 
- const std::vector<const PassInfo*> &II = PInf->getInterfacesImplemented(); - for (unsigned i = 0, e = II.size(); i != e; ++i) { - DenseMap<AnalysisID, Pass*>::iterator Pos = - AvailableAnalysis.find(II[i]->getTypeInfo()); + for (const PassInfo *PI : PInf->getInterfacesImplemented()) { + DenseMap<AnalysisID, Pass *>::iterator Pos = + AvailableAnalysis.find(PI->getTypeInfo()); if (Pos != AvailableAnalysis.end() && Pos->second == P) AvailableAnalysis.erase(Pos); } diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp index 63ea41fba89a..a0485a59d0e0 100644 --- a/llvm/lib/IR/Module.cpp +++ b/llvm/lib/IR/Module.cpp @@ -750,8 +750,8 @@ void Module::setSDKVersion(const VersionTuple &V) { ConstantDataArray::get(Context, Entries)); } -VersionTuple Module::getSDKVersion() const { - auto *CM = dyn_cast_or_null<ConstantAsMetadata>(getModuleFlag("SDK Version")); +static VersionTuple getSDKVersionMD(Metadata *MD) { + auto *CM = dyn_cast_or_null<ConstantAsMetadata>(MD); if (!CM) return {}; auto *Arr = dyn_cast_or_null<ConstantDataArray>(CM->getValue()); @@ -775,6 +775,10 @@ VersionTuple Module::getSDKVersion() const { return Result; } +VersionTuple Module::getSDKVersion() const { + return getSDKVersionMD(getModuleFlag("SDK Version")); +} + GlobalVariable *llvm::collectUsedGlobalVariables( const Module &M, SmallVectorImpl<GlobalValue *> &Vec, bool CompilerUsed) { const char *Name = CompilerUsed ? 
"llvm.compiler.used" : "llvm.used"; @@ -809,3 +813,13 @@ void Module::setPartialSampleProfileRatio(const ModuleSummaryIndex &Index) { } } } + +StringRef Module::getDarwinTargetVariantTriple() const { + if (const auto *MD = getModuleFlag("darwin.target_variant.triple")) + return cast<MDString>(MD)->getString(); + return ""; +} + +VersionTuple Module::getDarwinTargetVariantSDKVersion() const { + return getSDKVersionMD(getModuleFlag("darwin.target_variant.SDK Version")); +} diff --git a/llvm/lib/IR/ModuleSummaryIndex.cpp b/llvm/lib/IR/ModuleSummaryIndex.cpp index 31c5cd938d03..a0ac7d3ad7d3 100644 --- a/llvm/lib/IR/ModuleSummaryIndex.cpp +++ b/llvm/lib/IR/ModuleSummaryIndex.cpp @@ -447,11 +447,17 @@ static std::string linkageToString(GlobalValue::LinkageTypes LT) { static std::string fflagsToString(FunctionSummary::FFlags F) { auto FlagValue = [](unsigned V) { return V ? '1' : '0'; }; - char FlagRep[] = {FlagValue(F.ReadNone), FlagValue(F.ReadOnly), - FlagValue(F.NoRecurse), FlagValue(F.ReturnDoesNotAlias), - FlagValue(F.NoInline), FlagValue(F.AlwaysInline), - FlagValue(F.NoUnwind), FlagValue(F.MayThrow), - FlagValue(F.HasUnknownCall), 0}; + char FlagRep[] = {FlagValue(F.ReadNone), + FlagValue(F.ReadOnly), + FlagValue(F.NoRecurse), + FlagValue(F.ReturnDoesNotAlias), + FlagValue(F.NoInline), + FlagValue(F.AlwaysInline), + FlagValue(F.NoUnwind), + FlagValue(F.MayThrow), + FlagValue(F.HasUnknownCall), + FlagValue(F.MustBeUnreachable), + 0}; return FlagRep; } diff --git a/llvm/lib/IR/Operator.cpp b/llvm/lib/IR/Operator.cpp index d15fcfbc5b9f..08c1fc931e2e 100644 --- a/llvm/lib/IR/Operator.cpp +++ b/llvm/lib/IR/Operator.cpp @@ -39,9 +39,10 @@ bool Operator::hasPoisonGeneratingFlags() const { return GEP->isInBounds() || GEP->getInRangeIndex() != None; } default: + if (const auto *FP = dyn_cast<FPMathOperator>(this)) + return FP->hasNoNaNs() || FP->hasNoInfs(); return false; } - // TODO: FastMathFlags! 
(On instructions, but not constexpr) } Type *GEPOperator::getSourceElementType() const { @@ -89,7 +90,7 @@ bool GEPOperator::accumulateConstantOffset( assert(Offset.getBitWidth() == DL.getIndexSizeInBits(getPointerAddressSpace()) && "The offset bit width does not match DL specification."); - SmallVector<const Value *> Index(value_op_begin() + 1, value_op_end()); + SmallVector<const Value *> Index(llvm::drop_begin(operand_values())); return GEPOperator::accumulateConstantOffset(getSourceElementType(), Index, DL, Offset, ExternalAnalysis); } diff --git a/llvm/lib/IR/SSAContext.cpp b/llvm/lib/IR/SSAContext.cpp new file mode 100644 index 000000000000..a96e39f32882 --- /dev/null +++ b/llvm/lib/IR/SSAContext.cpp @@ -0,0 +1,47 @@ +//===- SSAContext.cpp -------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file defines a specialization of the GenericSSAContext<X> +/// template class for LLVM IR. 
+/// +//===----------------------------------------------------------------------===// + +#include "llvm/IR/SSAContext.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instruction.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +BasicBlock *SSAContext::getEntryBlock(Function &F) { + return &F.getEntryBlock(); +} + +void SSAContext::setFunction(Function &Fn) { F = &Fn; } + +Printable SSAContext::print(Value *V) const { + return Printable([V](raw_ostream &Out) { V->print(Out); }); +} + +Printable SSAContext::print(Instruction *Inst) const { + return print(cast<Value>(Inst)); +} + +Printable SSAContext::print(BasicBlock *BB) const { + if (BB->hasName()) + return Printable([BB](raw_ostream &Out) { Out << BB->getName(); }); + + return Printable([BB](raw_ostream &Out) { + ModuleSlotTracker MST{BB->getParent()->getParent(), false}; + MST.incorporateFunction(*BB->getParent()); + Out << MST.getLocalSlot(BB); + }); +} diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp index b475c8327874..8741ed917f9f 100644 --- a/llvm/lib/IR/Value.cpp +++ b/llvm/lib/IR/Value.cpp @@ -928,7 +928,7 @@ Align Value::getPointerAlignment(const DataLayout &DL) const { } llvm_unreachable("Unhandled FunctionPtrAlignType"); } - const MaybeAlign Alignment(GO->getAlignment()); + const MaybeAlign Alignment(GO->getAlign()); if (!Alignment) { if (auto *GVar = dyn_cast<GlobalVariable>(GO)) { Type *ObjectType = GVar->getValueType(); diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 154b59835b01..fb7c423e54e2 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -543,7 +543,7 @@ private: void verifySwiftErrorCall(CallBase &Call, const Value *SwiftErrorVal); void verifySwiftErrorValue(const Value *SwiftErrorVal); - void verifyTailCCMustTailAttrs(AttrBuilder Attrs, StringRef Context); + void verifyTailCCMustTailAttrs(const AttrBuilder &Attrs, StringRef Context); void 
verifyMustTailCall(CallInst &CI); bool verifyAttributeCount(AttributeList Attrs, unsigned Params); void verifyAttributeTypes(AttributeSet Attrs, const Value *V); @@ -553,8 +553,6 @@ private: void verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs, const Value *V, bool IsIntrinsic); void verifyFunctionMetadata(ArrayRef<std::pair<unsigned, MDNode *>> MDs); - template <typename T> - void verifyODRTypeAsScopeOperand(const MDNode &MD, T * = nullptr); void visitConstantExprsRecursively(const Constant *EntryC); void visitConstantExpr(const ConstantExpr *CE); @@ -604,26 +602,35 @@ void Verifier::visit(Instruction &I) { InstVisitor<Verifier>::visit(I); } -// Helper to recursively iterate over indirect users. By -// returning false, the callback can ask to stop recursing -// further. +// Helper to iterate over indirect users. By returning false, the callback can ask to stop traversing further. static void forEachUser(const Value *User, SmallPtrSet<const Value *, 32> &Visited, llvm::function_ref<bool(const Value *)> Callback) { if (!Visited.insert(User).second) return; - for (const Value *TheNextUser : User->materialized_users()) - if (Callback(TheNextUser)) - forEachUser(TheNextUser, Visited, Callback); + + SmallVector<const Value *> WorkList; + append_range(WorkList, User->materialized_users()); + while (!WorkList.empty()) { + const Value *Cur = WorkList.pop_back_val(); + if (!Visited.insert(Cur).second) + continue; + if (Callback(Cur)) + append_range(WorkList, Cur->materialized_users()); + } } void Verifier::visitGlobalValue(const GlobalValue &GV) { Assert(!GV.isDeclaration() || GV.hasValidDeclarationLinkage(), "Global is external, but doesn't have external or weak linkage!", &GV); - if (const GlobalObject *GO = dyn_cast<GlobalObject>(&GV)) - Assert(GO->getAlignment() <= Value::MaximumAlignment, - "huge alignment values are unsupported", GO); + if (const GlobalObject *GO = dyn_cast<GlobalObject>(&GV)) { + + if (MaybeAlign A = GO->getAlign()) { + Assert(A->value() <= 
Value::MaximumAlignment, + "huge alignment values are unsupported", GO); + } + } Assert(!GV.hasAppendingLinkage() || isa<GlobalVariable>(GV), "Only global variables can have appending linkage!", &GV); @@ -733,8 +740,9 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) { Value *V = Op->stripPointerCasts(); Assert(isa<GlobalVariable>(V) || isa<Function>(V) || isa<GlobalAlias>(V), - "invalid llvm.used member", V); - Assert(V->hasName(), "members of llvm.used must be named", V); + Twine("invalid ") + GV.getName() + " member", V); + Assert(V->hasName(), + Twine("members of ") + GV.getName() + " must be named", V); } } } @@ -860,19 +868,6 @@ void Verifier::visitNamedMDNode(const NamedMDNode &NMD) { } } -template <typename T> -void Verifier::verifyODRTypeAsScopeOperand(const MDNode &MD, T *) { - if (isa<T>(MD)) { - if (auto *N = dyn_cast_or_null<DICompositeType>(cast<T>(MD).getScope())) - // Of all the supported tags for DICompositeType(see visitDICompositeType) - // we know that enum type cannot be a scope. - AssertDI(N->getTag() != dwarf::DW_TAG_enumeration_type, - "enum type is not a scope; check enum type ODR " - "violation", - N, &MD); - } -} - void Verifier::visitMDNode(const MDNode &MD, AreDebugLocsAllowed AllowLocs) { // Only visit each node once. Metadata can be mutually recursive, so this // avoids infinite recursion here, as well as being an optimization. @@ -882,12 +877,6 @@ void Verifier::visitMDNode(const MDNode &MD, AreDebugLocsAllowed AllowLocs) { Assert(&MD.getContext() == &Context, "MDNode context does not match Module context!", &MD); - // Makes sure when a scope operand is a ODR type, the ODR type uniquing does - // not create invalid debug metadata. - // TODO: check that the non-ODR-type scope operand is valid. 
- verifyODRTypeAsScopeOperand<DIType>(MD); - verifyODRTypeAsScopeOperand<DILocalScope>(MD); - switch (MD.getMetadataID()) { default: llvm_unreachable("Invalid MDNode subclass"); @@ -2055,10 +2044,12 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs, } if (Attrs.hasFnAttr(Attribute::VScaleRange)) { - std::pair<unsigned, unsigned> Args = - Attrs.getFnAttrs().getVScaleRangeArgs(); + unsigned VScaleMin = Attrs.getFnAttrs().getVScaleRangeMin(); + if (VScaleMin == 0) + CheckFailed("'vscale_range' minimum must be greater than 0", V); - if (Args.first > Args.second && Args.second != 0) + Optional<unsigned> VScaleMax = Attrs.getFnAttrs().getVScaleRangeMax(); + if (VScaleMax && VScaleMin > VScaleMax) CheckFailed("'vscale_range' minimum cannot be greater than maximum", V); } @@ -3328,7 +3319,7 @@ void Verifier::visitCallBase(CallBase &Call) { visitInstruction(Call); } -void Verifier::verifyTailCCMustTailAttrs(AttrBuilder Attrs, +void Verifier::verifyTailCCMustTailAttrs(const AttrBuilder &Attrs, StringRef Context) { Assert(!Attrs.contains(Attribute::InAlloca), Twine("inalloca attribute not allowed in ") + Context); @@ -3733,15 +3724,15 @@ void Verifier::visitLoadInst(LoadInst &LI) { PointerType *PTy = dyn_cast<PointerType>(LI.getOperand(0)->getType()); Assert(PTy, "Load operand must be a pointer.", &LI); Type *ElTy = LI.getType(); - Assert(LI.getAlignment() <= Value::MaximumAlignment, - "huge alignment values are unsupported", &LI); + if (MaybeAlign A = LI.getAlign()) { + Assert(A->value() <= Value::MaximumAlignment, + "huge alignment values are unsupported", &LI); + } Assert(ElTy->isSized(), "loading unsized types is not allowed", &LI); if (LI.isAtomic()) { Assert(LI.getOrdering() != AtomicOrdering::Release && LI.getOrdering() != AtomicOrdering::AcquireRelease, "Load cannot have Release ordering", &LI); - Assert(LI.getAlignment() != 0, - "Atomic load must specify explicit alignment", &LI); Assert(ElTy->isIntOrPtrTy() || ElTy->isFloatingPointTy(), 
"atomic load operand must have integer, pointer, or floating point " "type!", @@ -3761,15 +3752,15 @@ void Verifier::visitStoreInst(StoreInst &SI) { Type *ElTy = SI.getOperand(0)->getType(); Assert(PTy->isOpaqueOrPointeeTypeMatches(ElTy), "Stored value type does not match pointer operand type!", &SI, ElTy); - Assert(SI.getAlignment() <= Value::MaximumAlignment, - "huge alignment values are unsupported", &SI); + if (MaybeAlign A = SI.getAlign()) { + Assert(A->value() <= Value::MaximumAlignment, + "huge alignment values are unsupported", &SI); + } Assert(ElTy->isSized(), "storing unsized types is not allowed", &SI); if (SI.isAtomic()) { Assert(SI.getOrdering() != AtomicOrdering::Acquire && SI.getOrdering() != AtomicOrdering::AcquireRelease, "Store cannot have Acquire ordering", &SI); - Assert(SI.getAlignment() != 0, - "Atomic store must specify explicit alignment", &SI); Assert(ElTy->isIntOrPtrTy() || ElTy->isFloatingPointTy(), "atomic store operand must have integer, pointer, or floating point " "type!", @@ -3820,8 +3811,10 @@ void Verifier::visitAllocaInst(AllocaInst &AI) { "Cannot allocate unsized type", &AI); Assert(AI.getArraySize()->getType()->isIntegerTy(), "Alloca array size must have integer type", &AI); - Assert(AI.getAlignment() <= Value::MaximumAlignment, - "huge alignment values are unsupported", &AI); + if (MaybeAlign A = AI.getAlign()) { + Assert(A->value() <= Value::MaximumAlignment, + "huge alignment values are unsupported", &AI); + } if (AI.isSwiftError()) { verifySwiftErrorValue(&AI); diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 6ce2ed265739..f26ef4b21996 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -1106,7 +1106,7 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) { if (Conf.PreOptModuleHook && !Conf.PreOptModuleHook(0, *RegularLTO.CombinedModule)) - return Error::success(); + return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile)); if (!Conf.CodeGenOnly) { for (const auto &R : GlobalResolutions) { 
@@ -1132,7 +1132,7 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) { if (Conf.PostInternalizeModuleHook && !Conf.PostInternalizeModuleHook(0, *RegularLTO.CombinedModule)) - return Error::success(); + return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile)); } if (!RegularLTO.EmptyCombinedModule || Conf.AlwaysEmitRegularLTOObj) { diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp index be06556b0c3b..855d0fc8a8be 100644 --- a/llvm/lib/LTO/LTOBackend.cpp +++ b/llvm/lib/LTO/LTOBackend.cpp @@ -37,7 +37,6 @@ #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" #include "llvm/Support/Program.h" -#include "llvm/Support/SmallVectorMemoryBuffer.h" #include "llvm/Support/ThreadPool.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" @@ -413,6 +412,8 @@ static void codegen(const Config &Conf, TargetMachine *TM, if (Error Err = StreamOrErr.takeError()) report_fatal_error(std::move(Err)); std::unique_ptr<CachedFileStream> &Stream = *StreamOrErr; + TM->Options.ObjectFilenameForDebug = Stream->ObjectPathName; + legacy::PassManager CodeGenPasses; CodeGenPasses.add( createImmutableModuleSummaryIndexWrapperPass(&CombinedIndex)); diff --git a/llvm/lib/LTO/LTOCodeGenerator.cpp b/llvm/lib/LTO/LTOCodeGenerator.cpp index 088e45c9e8dc..fdc9896aca78 100644 --- a/llvm/lib/LTO/LTOCodeGenerator.cpp +++ b/llvm/lib/LTO/LTOCodeGenerator.cpp @@ -135,9 +135,8 @@ LTOCodeGenerator::LTOCodeGenerator(LLVMContext &Context) LTOCodeGenerator::~LTOCodeGenerator() {} void LTOCodeGenerator::setAsmUndefinedRefs(LTOModule *Mod) { - const std::vector<StringRef> &undefs = Mod->getAsmUndefinedRefs(); - for (int i = 0, e = undefs.size(); i != e; ++i) - AsmUndefinedRefs.insert(undefs[i]); + for (const StringRef &Undef : Mod->getAsmUndefinedRefs()) + AsmUndefinedRefs.insert(Undef); } bool LTOCodeGenerator::addModule(LTOModule *Mod) { diff --git a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp index 
9474d8c9dafb..9aea27f0fdba 100644 --- a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp +++ b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp @@ -378,7 +378,8 @@ std::unique_ptr<MemoryBuffer> codegenModule(Module &TheModule, // Run codegen now. resulting binary is in OutputBuffer. PM.run(TheModule); } - return std::make_unique<SmallVectorMemoryBuffer>(std::move(OutputBuffer)); + return std::make_unique<SmallVectorMemoryBuffer>( + std::move(OutputBuffer), /*RequiresNullTerminator=*/false); } /// Manage caching for a single Module. @@ -541,7 +542,8 @@ ProcessThinLTOModule(Module &TheModule, ModuleSummaryIndex &Index, auto Index = buildModuleSummaryIndex(TheModule, nullptr, &PSI); WriteBitcodeToFile(TheModule, OS, true, &Index); } - return std::make_unique<SmallVectorMemoryBuffer>(std::move(OutputBuffer)); + return std::make_unique<SmallVectorMemoryBuffer>( + std::move(OutputBuffer), /*RequiresNullTerminator=*/false); } return codegenModule(TheModule, TM); diff --git a/llvm/lib/LineEditor/LineEditor.cpp b/llvm/lib/LineEditor/LineEditor.cpp index 1aa3476eb357..37c4b79f8e29 100644 --- a/llvm/lib/LineEditor/LineEditor.cpp +++ b/llvm/lib/LineEditor/LineEditor.cpp @@ -69,9 +69,8 @@ LineEditor::ListCompleterConcept::complete(StringRef Buffer, size_t Pos) const { // common prefix will then be empty. 
if (CommonPrefix.empty()) { Action.Kind = CompletionAction::AK_ShowCompletions; - for (std::vector<Completion>::iterator I = Comps.begin(), E = Comps.end(); - I != E; ++I) - Action.Completions.push_back(I->DisplayText); + for (const Completion &Comp : Comps) + Action.Completions.push_back(Comp.DisplayText); } else { Action.Kind = CompletionAction::AK_Insert; Action.Text = CommonPrefix; diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp index bad483be197d..b475ea81d107 100644 --- a/llvm/lib/Linker/IRMover.cpp +++ b/llvm/lib/Linker/IRMover.cpp @@ -646,7 +646,7 @@ GlobalVariable *IRLinker::copyGlobalVariableProto(const GlobalVariable *SGVar) { /*init*/ nullptr, SGVar->getName(), /*insertbefore*/ nullptr, SGVar->getThreadLocalMode(), SGVar->getAddressSpace()); - NewDGV->setAlignment(MaybeAlign(SGVar->getAlignment())); + NewDGV->setAlignment(SGVar->getAlign()); NewDGV->copyAttributesFrom(SGVar); return NewDGV; } @@ -877,7 +877,7 @@ IRLinker::linkAppendingVarProto(GlobalVariable *DstGV, if (DstGV->isConstant() != SrcGV->isConstant()) return stringErr("Appending variables linked with different const'ness!"); - if (DstGV->getAlignment() != SrcGV->getAlignment()) + if (DstGV->getAlign() != SrcGV->getAlign()) return stringErr( "Appending variables with different alignment need to be linked!"); diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp index 2ca921017171..5c2aaddff4d1 100644 --- a/llvm/lib/MC/MCAsmStreamer.cpp +++ b/llvm/lib/MC/MCAsmStreamer.cpp @@ -168,9 +168,14 @@ public: unsigned Update, VersionTuple SDKVersion) override; void emitBuildVersion(unsigned Platform, unsigned Major, unsigned Minor, unsigned Update, VersionTuple SDKVersion) override; + void emitDarwinTargetVariantBuildVersion(unsigned Platform, unsigned Major, + unsigned Minor, unsigned Update, + VersionTuple SDKVersion) override; void emitThumbFunc(MCSymbol *Func) override; void emitAssignment(MCSymbol *Symbol, const MCExpr *Value) override; + void 
emitConditionalAssignment(MCSymbol *Symbol, + const MCExpr *Value) override; void emitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) override; bool emitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override; @@ -640,6 +645,12 @@ void MCAsmStreamer::emitBuildVersion(unsigned Platform, unsigned Major, EmitEOL(); } +void MCAsmStreamer::emitDarwinTargetVariantBuildVersion( + unsigned Platform, unsigned Major, unsigned Minor, unsigned Update, + VersionTuple SDKVersion) { + emitBuildVersion(Platform, Major, Minor, Update, SDKVersion); +} + void MCAsmStreamer::emitThumbFunc(MCSymbol *Func) { // This needs to emit to a temporary string to get properly quoted // MCSymbols when they have spaces in them. @@ -670,6 +681,15 @@ void MCAsmStreamer::emitAssignment(MCSymbol *Symbol, const MCExpr *Value) { MCStreamer::emitAssignment(Symbol, Value); } +void MCAsmStreamer::emitConditionalAssignment(MCSymbol *Symbol, + const MCExpr *Value) { + OS << ".lto_set_conditional "; + Symbol->print(OS, MAI); + OS << ", "; + Value->print(OS, MAI); + EmitEOL(); +} + void MCAsmStreamer::emitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) { OS << ".weakref "; Alias->print(OS, MAI); diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp index d5e9f4fc66bc..a8837bbf57c7 100644 --- a/llvm/lib/MC/MCAssembler.cpp +++ b/llvm/lib/MC/MCAssembler.cpp @@ -89,6 +89,7 @@ MCAssembler::MCAssembler(MCContext &Context, BundleAlignSize(0), RelaxAll(false), SubsectionsViaSymbols(false), IncrementalLinkerCompatible(false), ELFHeaderEFlags(0) { VersionInfo.Major = 0; // Major version == 0 for "none specified" + DarwinTargetVariantVersionInfo.Major = 0; } MCAssembler::~MCAssembler() = default; @@ -109,6 +110,8 @@ void MCAssembler::reset() { LOHContainer.reset(); VersionInfo.Major = 0; VersionInfo.SDKVersion = VersionTuple(); + DarwinTargetVariantVersionInfo.Major = 0; + DarwinTargetVariantVersionInfo.SDKVersion = VersionTuple(); // reset objects owned by us if 
(getBackendPtr()) diff --git a/llvm/lib/MC/MCInstrAnalysis.cpp b/llvm/lib/MC/MCInstrAnalysis.cpp index 52b59185c6fc..4ed1c6286a72 100644 --- a/llvm/lib/MC/MCInstrAnalysis.cpp +++ b/llvm/lib/MC/MCInstrAnalysis.cpp @@ -39,4 +39,4 @@ Optional<uint64_t> MCInstrAnalysis::getMemoryOperandRelocationOffset(const MCInst &Inst, uint64_t Size) const { return None; -}
\ No newline at end of file +} diff --git a/llvm/lib/MC/MCMachOStreamer.cpp b/llvm/lib/MC/MCMachOStreamer.cpp index aa94b141d8be..3edf7a3f49e6 100644 --- a/llvm/lib/MC/MCMachOStreamer.cpp +++ b/llvm/lib/MC/MCMachOStreamer.cpp @@ -92,6 +92,9 @@ public: unsigned Update, VersionTuple SDKVersion) override; void emitBuildVersion(unsigned Platform, unsigned Major, unsigned Minor, unsigned Update, VersionTuple SDKVersion) override; + void emitDarwinTargetVariantBuildVersion(unsigned Platform, unsigned Major, + unsigned Minor, unsigned Update, + VersionTuple SDKVersion) override; void emitThumbFunc(MCSymbol *Func) override; bool emitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override; void emitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) override; @@ -283,6 +286,13 @@ void MCMachOStreamer::emitBuildVersion(unsigned Platform, unsigned Major, Update, SDKVersion); } +void MCMachOStreamer::emitDarwinTargetVariantBuildVersion( + unsigned Platform, unsigned Major, unsigned Minor, unsigned Update, + VersionTuple SDKVersion) { + getAssembler().setDarwinTargetVariantBuildVersion( + (MachO::PlatformType)Platform, Major, Minor, Update, SDKVersion); +} + void MCMachOStreamer::emitThumbFunc(MCSymbol *Symbol) { // Remember that the function is a thumb function. Fixup and relocation // values will need adjusted. 
@@ -516,7 +526,10 @@ MCStreamer *llvm::createMachOStreamer(MCContext &Context, new MCMachOStreamer(Context, std::move(MAB), std::move(OW), std::move(CE), DWARFMustBeAtTheEnd, LabelSections); const Triple &Target = Context.getTargetTriple(); - S->emitVersionForTarget(Target, Context.getObjectFileInfo()->getSDKVersion()); + S->emitVersionForTarget( + Target, Context.getObjectFileInfo()->getSDKVersion(), + Context.getObjectFileInfo()->getDarwinTargetVariantTriple(), + Context.getObjectFileInfo()->getDarwinTargetVariantSDKVersion()); if (RelaxAll) S->getAssembler().setRelaxAll(true); return S; diff --git a/llvm/lib/MC/MCNullStreamer.cpp b/llvm/lib/MC/MCNullStreamer.cpp index 291d840b4f4b..40b7eba58b03 100644 --- a/llvm/lib/MC/MCNullStreamer.cpp +++ b/llvm/lib/MC/MCNullStreamer.cpp @@ -40,6 +40,9 @@ namespace { void EmitCOFFSymbolStorageClass(int StorageClass) override {} void EmitCOFFSymbolType(int Type) override {} void EndCOFFSymbolDef() override {} + void + emitXCOFFSymbolLinkageWithVisibility(MCSymbol *Symbol, MCSymbolAttr Linkage, + MCSymbolAttr Visibility) override {} }; } diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp index 9c86fcc86bcb..6604d7988c4c 100644 --- a/llvm/lib/MC/MCObjectStreamer.cpp +++ b/llvm/lib/MC/MCObjectStreamer.cpp @@ -281,6 +281,18 @@ void MCObjectStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) { Symbol->setOffset(0); addPendingLabel(Symbol); } + + emitPendingAssignments(Symbol); +} + +void MCObjectStreamer::emitPendingAssignments(MCSymbol *Symbol) { + auto Assignments = pendingAssignments.find(Symbol); + if (Assignments != pendingAssignments.end()) { + for (const PendingAssignment &A : Assignments->second) + emitAssignment(A.Symbol, A.Value); + + pendingAssignments.erase(Assignments); + } } // Emit a label at a previously emitted fragment/offset position. 
This must be @@ -353,6 +365,19 @@ bool MCObjectStreamer::changeSectionImpl(MCSection *Section, void MCObjectStreamer::emitAssignment(MCSymbol *Symbol, const MCExpr *Value) { getAssembler().registerSymbol(*Symbol); MCStreamer::emitAssignment(Symbol, Value); + emitPendingAssignments(Symbol); +} + +void MCObjectStreamer::emitConditionalAssignment(MCSymbol *Symbol, + const MCExpr *Value) { + const MCSymbol *Target = &cast<MCSymbolRefExpr>(*Value).getSymbol(); + + // If the symbol already exists, emit the assignment. Otherwise, emit it + // later only if the symbol is also emitted. + if (Target->isRegistered()) + emitAssignment(Symbol, Value); + else + pendingAssignments[Target].push_back({Symbol, Value}); } bool MCObjectStreamer::mayHaveInstructions(MCSection &Sec) const { diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp index ed9f2066dc20..705f7159d55b 100644 --- a/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/llvm/lib/MC/MCParser/AsmParser.cpp @@ -356,8 +356,14 @@ private: /// return the contents from the current token up to the end or comma. StringRef parseStringToComma(); - bool parseAssignment(StringRef Name, bool allow_redef, - bool NoDeadStrip = false); + enum class AssignmentKind { + Set, + Equiv, + Equal, + LTOSetConditional, + }; + + bool parseAssignment(StringRef Name, AssignmentKind Kind); unsigned getBinOpPrecedence(AsmToken::TokenKind K, MCBinaryExpr::Opcode &Kind); @@ -534,6 +540,7 @@ private: DK_ADDRSIG_SYM, DK_PSEUDO_PROBE, DK_LTO_DISCARD, + DK_LTO_SET_CONDITIONAL, DK_END }; @@ -564,8 +571,8 @@ private: const fltSemantics &); // ".single", ... 
bool parseDirectiveFill(); // ".fill" bool parseDirectiveZero(); // ".zero" - // ".set", ".equ", ".equiv" - bool parseDirectiveSet(StringRef IDVal, bool allow_redef); + // ".set", ".equ", ".equiv", ".lto_set_conditional" + bool parseDirectiveSet(StringRef IDVal, AssignmentKind Kind); bool parseDirectiveOrg(); // ".org" // ".align{,32}", ".p2align{,w,l}" bool parseDirectiveAlign(bool IsPow2, unsigned ValueSize); @@ -1968,7 +1975,7 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info, // identifier '=' ... -> assignment statement Lex(); - return parseAssignment(IDVal, true); + return parseAssignment(IDVal, AssignmentKind::Equal); default: // Normal instruction or directive. break; @@ -2027,9 +2034,11 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info, break; case DK_SET: case DK_EQU: - return parseDirectiveSet(IDVal, true); + return parseDirectiveSet(IDVal, AssignmentKind::Set); case DK_EQUIV: - return parseDirectiveSet(IDVal, false); + return parseDirectiveSet(IDVal, AssignmentKind::Equiv); + case DK_LTO_SET_CONDITIONAL: + return parseDirectiveSet(IDVal, AssignmentKind::LTOSetConditional); case DK_ASCII: return parseDirectiveAscii(IDVal, false); case DK_ASCIZ: @@ -2925,11 +2934,13 @@ void AsmParser::handleMacroExit() { ActiveMacros.pop_back(); } -bool AsmParser::parseAssignment(StringRef Name, bool allow_redef, - bool NoDeadStrip) { +bool AsmParser::parseAssignment(StringRef Name, AssignmentKind Kind) { MCSymbol *Sym; const MCExpr *Value; - if (MCParserUtils::parseAssignmentExpression(Name, allow_redef, *this, Sym, + SMLoc ExprLoc = getTok().getLoc(); + bool AllowRedef = + Kind == AssignmentKind::Set || Kind == AssignmentKind::Equal; + if (MCParserUtils::parseAssignmentExpression(Name, AllowRedef, *this, Sym, Value)) return true; @@ -2944,9 +2955,22 @@ bool AsmParser::parseAssignment(StringRef Name, bool allow_redef, return false; // Do the assignment. 
- Out.emitAssignment(Sym, Value); - if (NoDeadStrip) + switch (Kind) { + case AssignmentKind::Equal: + Out.emitAssignment(Sym, Value); + break; + case AssignmentKind::Set: + case AssignmentKind::Equiv: + Out.emitAssignment(Sym, Value); Out.emitSymbolAttribute(Sym, MCSA_NoDeadStrip); + break; + case AssignmentKind::LTOSetConditional: + if (Value->getKind() != MCExpr::SymbolRef) + return Error(ExprLoc, "expected identifier"); + + Out.emitConditionalAssignment(Sym, Value); + break; + } return false; } @@ -2998,10 +3022,11 @@ bool AsmParser::parseIdentifier(StringRef &Res) { /// ::= .equ identifier ',' expression /// ::= .equiv identifier ',' expression /// ::= .set identifier ',' expression -bool AsmParser::parseDirectiveSet(StringRef IDVal, bool allow_redef) { +/// ::= .lto_set_conditional identifier ',' expression +bool AsmParser::parseDirectiveSet(StringRef IDVal, AssignmentKind Kind) { StringRef Name; if (check(parseIdentifier(Name), "expected identifier") || parseComma() || - parseAssignment(Name, allow_redef, true)) + parseAssignment(Name, Kind)) return true; return false; } @@ -5581,6 +5606,7 @@ void AsmParser::initializeDirectiveKindMap() { DirectiveKindMap[".addrsig_sym"] = DK_ADDRSIG_SYM; DirectiveKindMap[".pseudoprobe"] = DK_PSEUDO_PROBE; DirectiveKindMap[".lto_discard"] = DK_LTO_DISCARD; + DirectiveKindMap[".lto_set_conditional"] = DK_LTO_SET_CONDITIONAL; } MCAsmMacro *AsmParser::parseMacroLikeBody(SMLoc DirectiveLoc) { @@ -6012,12 +6038,13 @@ bool AsmParser::parseMSInlineAsm( bool isOutput = (i == 1) && Desc.mayStore(); SMLoc Start = SMLoc::getFromPointer(SymName.data()); + int64_t Size = Operand.isMemPlaceholder(Desc) ? 
0 : SymName.size(); if (isOutput) { ++InputIdx; OutputDecls.push_back(OpDecl); OutputDeclsAddressOf.push_back(Operand.needAddressOf()); OutputConstraints.push_back(("=" + Constraint).str()); - AsmStrRewrites.emplace_back(AOK_Output, Start, SymName.size()); + AsmStrRewrites.emplace_back(AOK_Output, Start, Size); } else { InputDecls.push_back(OpDecl); InputDeclsAddressOf.push_back(Operand.needAddressOf()); @@ -6025,7 +6052,7 @@ bool AsmParser::parseMSInlineAsm( if (Desc.OpInfo[i - 1].isBranchTarget()) AsmStrRewrites.emplace_back(AOK_CallInput, Start, SymName.size()); else - AsmStrRewrites.emplace_back(AOK_Input, Start, SymName.size()); + AsmStrRewrites.emplace_back(AOK_Input, Start, Size); } } @@ -6140,13 +6167,17 @@ bool AsmParser::parseMSInlineAsm( OS << Ctx.getAsmInfo()->getPrivateLabelPrefix() << AR.Label; break; case AOK_Input: - OS << '$' << InputIdx++; + if (AR.Len) + OS << '$' << InputIdx; + ++InputIdx; break; case AOK_CallInput: OS << "${" << InputIdx++ << ":P}"; break; case AOK_Output: - OS << '$' << OutputIdx++; + if (AR.Len) + OS << '$' << OutputIdx; + ++OutputIdx; break; case AOK_SizeDirective: switch (AR.Val) { diff --git a/llvm/lib/MC/MCPseudoProbe.cpp b/llvm/lib/MC/MCPseudoProbe.cpp index e35bcec8fe75..ebf38327f4dc 100644 --- a/llvm/lib/MC/MCPseudoProbe.cpp +++ b/llvm/lib/MC/MCPseudoProbe.cpp @@ -151,8 +151,8 @@ void MCPseudoProbeInlineTree::emit(MCObjectStreamer *MCOS, // InlineSite is unique for each pair, // so there will be no ordering of Inlinee based on MCPseudoProbeInlineTree* std::map<InlineSite, MCPseudoProbeInlineTree *> Inlinees; - for (auto Child = Children.begin(); Child != Children.end(); ++Child) - Inlinees[Child->first] = Child->second.get(); + for (auto &Child : Children) + Inlinees[Child.first] = Child.second.get(); for (const auto &Inlinee : Inlinees) { if (Guid) { diff --git a/llvm/lib/MC/MCStreamer.cpp b/llvm/lib/MC/MCStreamer.cpp index f4e64b42c817..9c37a7bebe2a 100644 --- a/llvm/lib/MC/MCStreamer.cpp +++ 
b/llvm/lib/MC/MCStreamer.cpp @@ -431,6 +431,9 @@ void MCStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) { TS->emitLabel(Symbol); } +void MCStreamer::emitConditionalAssignment(MCSymbol *Symbol, + const MCExpr *Value) {} + void MCStreamer::emitCFISections(bool EH, bool Debug) {} void MCStreamer::emitCFIStartProc(bool IsSimple, SMLoc Loc) { @@ -1308,45 +1311,78 @@ getMachoBuildVersionPlatformType(const Triple &Target) { llvm_unreachable("unexpected OS type"); } -void MCStreamer::emitVersionForTarget(const Triple &Target, - const VersionTuple &SDKVersion) { +void MCStreamer::emitVersionForTarget( + const Triple &Target, const VersionTuple &SDKVersion, + const Triple *DarwinTargetVariantTriple, + const VersionTuple &DarwinTargetVariantSDKVersion) { if (!Target.isOSBinFormatMachO() || !Target.isOSDarwin()) return; // Do we even know the version? if (Target.getOSMajorVersion() == 0) return; - unsigned Major = 0; - unsigned Minor = 0; - unsigned Update = 0; + VersionTuple Version; switch (Target.getOS()) { case Triple::MacOSX: case Triple::Darwin: - Target.getMacOSXVersion(Major, Minor, Update); + Target.getMacOSXVersion(Version); break; case Triple::IOS: case Triple::TvOS: - Target.getiOSVersion(Major, Minor, Update); + Version = Target.getiOSVersion(); break; case Triple::WatchOS: - Target.getWatchOSVersion(Major, Minor, Update); + Version = Target.getWatchOSVersion(); break; default: llvm_unreachable("unexpected OS type"); } - assert(Major != 0 && "A non-zero major version is expected"); - auto LinkedTargetVersion = targetVersionOrMinimumSupportedOSVersion( - Target, VersionTuple(Major, Minor, Update)); + assert(Version.getMajor() != 0 && "A non-zero major version is expected"); + auto LinkedTargetVersion = + targetVersionOrMinimumSupportedOSVersion(Target, Version); auto BuildVersionOSVersion = getMachoBuildVersionSupportedOS(Target); + bool ShouldEmitBuildVersion = false; if (BuildVersionOSVersion.empty() || - LinkedTargetVersion >= BuildVersionOSVersion) - return 
emitBuildVersion(getMachoBuildVersionPlatformType(Target), - LinkedTargetVersion.getMajor(), - *LinkedTargetVersion.getMinor(), - *LinkedTargetVersion.getSubminor(), SDKVersion); + LinkedTargetVersion >= BuildVersionOSVersion) { + if (Target.isMacCatalystEnvironment() && DarwinTargetVariantTriple && + DarwinTargetVariantTriple->isMacOSX()) { + emitVersionForTarget(*DarwinTargetVariantTriple, + DarwinTargetVariantSDKVersion, + /*TargetVariantTriple=*/nullptr, + /*TargetVariantSDKVersion=*/VersionTuple()); + emitDarwinTargetVariantBuildVersion( + getMachoBuildVersionPlatformType(Target), + LinkedTargetVersion.getMajor(), + LinkedTargetVersion.getMinor().getValueOr(0), + LinkedTargetVersion.getSubminor().getValueOr(0), SDKVersion); + return; + } + emitBuildVersion(getMachoBuildVersionPlatformType(Target), + LinkedTargetVersion.getMajor(), + LinkedTargetVersion.getMinor().getValueOr(0), + LinkedTargetVersion.getSubminor().getValueOr(0), + SDKVersion); + ShouldEmitBuildVersion = true; + } + + if (const Triple *TVT = DarwinTargetVariantTriple) { + if (Target.isMacOSX() && TVT->isMacCatalystEnvironment()) { + auto TVLinkedTargetVersion = + targetVersionOrMinimumSupportedOSVersion(*TVT, TVT->getiOSVersion()); + emitDarwinTargetVariantBuildVersion( + getMachoBuildVersionPlatformType(*TVT), + TVLinkedTargetVersion.getMajor(), + TVLinkedTargetVersion.getMinor().getValueOr(0), + TVLinkedTargetVersion.getSubminor().getValueOr(0), + DarwinTargetVariantSDKVersion); + } + } + + if (ShouldEmitBuildVersion) + return; emitVersionMin(getMachoVersionMinLoadCommandType(Target), LinkedTargetVersion.getMajor(), - *LinkedTargetVersion.getMinor(), - *LinkedTargetVersion.getSubminor(), SDKVersion); + LinkedTargetVersion.getMinor().getValueOr(0), + LinkedTargetVersion.getSubminor().getValueOr(0), SDKVersion); } diff --git a/llvm/lib/MC/MCWin64EH.cpp b/llvm/lib/MC/MCWin64EH.cpp index 7773d8828931..2a93c352c68a 100644 --- a/llvm/lib/MC/MCWin64EH.cpp +++ b/llvm/lib/MC/MCWin64EH.cpp @@ -351,7 
+351,7 @@ static uint32_t ARM64CountOfUnwindCodes(ArrayRef<WinEH::Instruction> Insns) { // Unwind opcode encodings and restrictions are documented at // https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling static void ARM64EmitUnwindCode(MCStreamer &streamer, const MCSymbol *begin, - WinEH::Instruction &inst) { + const WinEH::Instruction &inst) { uint8_t b, reg; switch (static_cast<Win64EH::UnwindOpcodes>(inst.Operation)) { default: @@ -1050,10 +1050,8 @@ static void ARM64EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info, // Emit epilog unwind instructions for (auto &I : info->EpilogMap) { auto &EpilogInstrs = I.second; - for (uint32_t i = 0; i < EpilogInstrs.size(); i++) { - WinEH::Instruction inst = EpilogInstrs[i]; + for (const WinEH::Instruction &inst : EpilogInstrs) ARM64EmitUnwindCode(streamer, info->Begin, inst); - } } int32_t BytesMod = CodeWords * 4 - TotalCodeBytes; diff --git a/llvm/lib/MC/MachObjectWriter.cpp b/llvm/lib/MC/MachObjectWriter.cpp index 277d88cf1cd2..16941b1cb727 100644 --- a/llvm/lib/MC/MachObjectWriter.cpp +++ b/llvm/lib/MC/MachObjectWriter.cpp @@ -484,15 +484,15 @@ void MachObjectWriter::bindIndirectSymbols(MCAssembler &Asm) { // Report errors for use of .indirect_symbol not in a symbol pointer section // or stub section. 
- for (MCAssembler::indirect_symbol_iterator it = Asm.indirect_symbol_begin(), - ie = Asm.indirect_symbol_end(); it != ie; ++it) { - const MCSectionMachO &Section = cast<MCSectionMachO>(*it->Section); + for (IndirectSymbolData &ISD : llvm::make_range(Asm.indirect_symbol_begin(), + Asm.indirect_symbol_end())) { + const MCSectionMachO &Section = cast<MCSectionMachO>(*ISD.Section); if (Section.getType() != MachO::S_NON_LAZY_SYMBOL_POINTERS && Section.getType() != MachO::S_LAZY_SYMBOL_POINTERS && Section.getType() != MachO::S_THREAD_LOCAL_VARIABLE_POINTERS && Section.getType() != MachO::S_SYMBOL_STUBS) { - MCSymbol &Symbol = *it->Symbol; + MCSymbol &Symbol = *ISD.Symbol; report_fatal_error("indirect symbol '" + Symbol.getName() + "' not in a symbol pointer or stub section"); } @@ -779,6 +779,17 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm, LoadCommandsSize += sizeof(MachO::version_min_command); } + const MCAssembler::VersionInfoType &TargetVariantVersionInfo = + Layout.getAssembler().getDarwinTargetVariantVersionInfo(); + + // Add the target variant version info load command size, if used. + if (TargetVariantVersionInfo.Major != 0) { + ++NumLoadCommands; + assert(TargetVariantVersionInfo.EmitBuildVersion && + "target variant should use build version"); + LoadCommandsSize += sizeof(MachO::build_version_command); + } + // Add the data-in-code load command size, if used. unsigned NumDataRegions = Asm.getDataRegions().size(); if (NumDataRegions) { @@ -862,38 +873,43 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm, } // Write out the deployment target information, if it's available. - if (VersionInfo.Major != 0) { - auto EncodeVersion = [](VersionTuple V) -> uint32_t { - assert(!V.empty() && "empty version"); - unsigned Update = V.getSubminor() ? *V.getSubminor() : 0; - unsigned Minor = V.getMinor() ? 
*V.getMinor() : 0; - assert(Update < 256 && "unencodable update target version"); - assert(Minor < 256 && "unencodable minor target version"); - assert(V.getMajor() < 65536 && "unencodable major target version"); - return Update | (Minor << 8) | (V.getMajor() << 16); - }; - uint32_t EncodedVersion = EncodeVersion( - VersionTuple(VersionInfo.Major, VersionInfo.Minor, VersionInfo.Update)); - uint32_t SDKVersion = !VersionInfo.SDKVersion.empty() - ? EncodeVersion(VersionInfo.SDKVersion) - : 0; - if (VersionInfo.EmitBuildVersion) { - // FIXME: Currently empty tools. Add clang version in the future. - W.write<uint32_t>(MachO::LC_BUILD_VERSION); - W.write<uint32_t>(sizeof(MachO::build_version_command)); - W.write<uint32_t>(VersionInfo.TypeOrPlatform.Platform); - W.write<uint32_t>(EncodedVersion); - W.write<uint32_t>(SDKVersion); - W.write<uint32_t>(0); // Empty tools list. - } else { - MachO::LoadCommandType LCType - = getLCFromMCVM(VersionInfo.TypeOrPlatform.Type); - W.write<uint32_t>(LCType); - W.write<uint32_t>(sizeof(MachO::version_min_command)); - W.write<uint32_t>(EncodedVersion); - W.write<uint32_t>(SDKVersion); - } - } + auto EmitDeploymentTargetVersion = + [&](const MCAssembler::VersionInfoType &VersionInfo) { + auto EncodeVersion = [](VersionTuple V) -> uint32_t { + assert(!V.empty() && "empty version"); + unsigned Update = V.getSubminor().getValueOr(0); + unsigned Minor = V.getMinor().getValueOr(0); + assert(Update < 256 && "unencodable update target version"); + assert(Minor < 256 && "unencodable minor target version"); + assert(V.getMajor() < 65536 && "unencodable major target version"); + return Update | (Minor << 8) | (V.getMajor() << 16); + }; + uint32_t EncodedVersion = EncodeVersion(VersionTuple( + VersionInfo.Major, VersionInfo.Minor, VersionInfo.Update)); + uint32_t SDKVersion = !VersionInfo.SDKVersion.empty() + ? EncodeVersion(VersionInfo.SDKVersion) + : 0; + if (VersionInfo.EmitBuildVersion) { + // FIXME: Currently empty tools. 
Add clang version in the future. + W.write<uint32_t>(MachO::LC_BUILD_VERSION); + W.write<uint32_t>(sizeof(MachO::build_version_command)); + W.write<uint32_t>(VersionInfo.TypeOrPlatform.Platform); + W.write<uint32_t>(EncodedVersion); + W.write<uint32_t>(SDKVersion); + W.write<uint32_t>(0); // Empty tools list. + } else { + MachO::LoadCommandType LCType = + getLCFromMCVM(VersionInfo.TypeOrPlatform.Type); + W.write<uint32_t>(LCType); + W.write<uint32_t>(sizeof(MachO::version_min_command)); + W.write<uint32_t>(EncodedVersion); + W.write<uint32_t>(SDKVersion); + } + }; + if (VersionInfo.Major != 0) + EmitDeploymentTargetVersion(VersionInfo); + if (TargetVariantVersionInfo.Major != 0) + EmitDeploymentTargetVersion(TargetVariantVersionInfo); // Write the data-in-code load command, if used. uint64_t DataInCodeTableEnd = RelocTableEnd + NumDataRegions * 8; diff --git a/llvm/lib/MC/TargetRegistry.cpp b/llvm/lib/MC/TargetRegistry.cpp index 0948a6b9f1a1..09684b1e5ad2 100644 --- a/llvm/lib/MC/TargetRegistry.cpp +++ b/llvm/lib/MC/TargetRegistry.cpp @@ -124,10 +124,10 @@ void TargetRegistry::printRegisteredTargetsForVersion(raw_ostream &OS) { array_pod_sort(Targets.begin(), Targets.end(), TargetArraySortFn); OS << " Registered Targets:\n"; - for (unsigned i = 0, e = Targets.size(); i != e; ++i) { - OS << " " << Targets[i].first; - OS.indent(Width - Targets[i].first.size()) << " - " - << Targets[i].second->getShortDescription() << '\n'; + for (const auto &Target : Targets) { + OS << " " << Target.first; + OS.indent(Width - Target.first.size()) + << " - " << Target.second->getShortDescription() << '\n'; } if (Targets.empty()) OS << " (none)\n"; diff --git a/llvm/lib/Object/ArchiveWriter.cpp b/llvm/lib/Object/ArchiveWriter.cpp index ce997464caa7..da8bcec7f3d4 100644 --- a/llvm/lib/Object/ArchiveWriter.cpp +++ b/llvm/lib/Object/ArchiveWriter.cpp @@ -696,7 +696,7 @@ writeArchiveToBuffer(ArrayRef<NewArchiveMember> NewMembers, bool WriteSymtab, return std::move(E); return 
std::make_unique<SmallVectorMemoryBuffer>( - std::move(ArchiveBufferVector)); + std::move(ArchiveBufferVector), /*RequiresNullTerminator=*/false); } } // namespace llvm diff --git a/llvm/lib/Object/ELF.cpp b/llvm/lib/Object/ELF.cpp index 84181ae5e501..6e56da1a31f3 100644 --- a/llvm/lib/Object/ELF.cpp +++ b/llvm/lib/Object/ELF.cpp @@ -210,6 +210,8 @@ uint32_t llvm::object::getELFRelativeRelocationType(uint32_t Machine) { return ELF::R_SPARC_RELATIVE; case ELF::EM_CSKY: return ELF::R_CKCORE_RELATIVE; + case ELF::EM_VE: + return ELF::R_VE_RELATIVE; case ELF::EM_AMDGPU: break; case ELF::EM_BPF: diff --git a/llvm/lib/Object/MachOObjectFile.cpp b/llvm/lib/Object/MachOObjectFile.cpp index 7501661591f0..42e257516f4e 100644 --- a/llvm/lib/Object/MachOObjectFile.cpp +++ b/llvm/lib/Object/MachOObjectFile.cpp @@ -26,12 +26,15 @@ #include "llvm/Object/SymbolicFile.h" #include "llvm/Support/DataExtractor.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/Errc.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FileSystem.h" #include "llvm/Support/Format.h" #include "llvm/Support/Host.h" #include "llvm/Support/LEB128.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Path.h" #include "llvm/Support/SwapByteOrder.h" #include "llvm/Support/raw_ostream.h" #include <algorithm> @@ -4719,3 +4722,46 @@ StringRef MachOObjectFile::mapDebugSectionName(StringRef Name) const { .Case("debug_str_offs", "debug_str_offsets") .Default(Name); } + +Expected<std::vector<std::string>> +MachOObjectFile::findDsymObjectMembers(StringRef Path) { + SmallString<256> BundlePath(Path); + // Normalize input path. This is necessary to accept `bundle.dSYM/`. 
+ sys::path::remove_dots(BundlePath); + if (!sys::fs::is_directory(BundlePath) || + sys::path::extension(BundlePath) != ".dSYM") + return std::vector<std::string>(); + sys::path::append(BundlePath, "Contents", "Resources", "DWARF"); + bool IsDir; + auto EC = sys::fs::is_directory(BundlePath, IsDir); + if (EC == errc::no_such_file_or_directory || (!EC && !IsDir)) + return createStringError( + EC, "%s: expected directory 'Contents/Resources/DWARF' in dSYM bundle", + Path.str().c_str()); + if (EC) + return createFileError(BundlePath, errorCodeToError(EC)); + + std::vector<std::string> ObjectPaths; + for (sys::fs::directory_iterator Dir(BundlePath, EC), DirEnd; + Dir != DirEnd && !EC; Dir.increment(EC)) { + StringRef ObjectPath = Dir->path(); + sys::fs::file_status Status; + if (auto EC = sys::fs::status(ObjectPath, Status)) + return createFileError(ObjectPath, errorCodeToError(EC)); + switch (Status.type()) { + case sys::fs::file_type::regular_file: + case sys::fs::file_type::symlink_file: + case sys::fs::file_type::type_unknown: + ObjectPaths.push_back(ObjectPath.str()); + break; + default: /*ignore*/; + } + } + if (EC) + return createFileError(BundlePath, errorCodeToError(EC)); + if (ObjectPaths.empty()) + return createStringError(std::error_code(), + "%s: no objects found in dSYM bundle", + Path.str().c_str()); + return ObjectPaths; +} diff --git a/llvm/lib/Object/MachOUniversalWriter.cpp b/llvm/lib/Object/MachOUniversalWriter.cpp index 9673c97a10f0..ae1ff09a4f8f 100644 --- a/llvm/lib/Object/MachOUniversalWriter.cpp +++ b/llvm/lib/Object/MachOUniversalWriter.cpp @@ -19,7 +19,6 @@ #include "llvm/Object/IRObjectFile.h" #include "llvm/Object/MachO.h" #include "llvm/Object/MachOUniversal.h" -#include "llvm/Support/SmallVectorMemoryBuffer.h" using namespace llvm; using namespace object; diff --git a/llvm/lib/ObjectYAML/COFFEmitter.cpp b/llvm/lib/ObjectYAML/COFFEmitter.cpp index 66ad16db1ba4..d884e2fd55cd 100644 --- a/llvm/lib/ObjectYAML/COFFEmitter.cpp +++ 
b/llvm/lib/ObjectYAML/COFFEmitter.cpp @@ -64,11 +64,7 @@ struct COFFParser { } bool parseSections() { - for (std::vector<COFFYAML::Section>::iterator i = Obj.Sections.begin(), - e = Obj.Sections.end(); - i != e; ++i) { - COFFYAML::Section &Sec = *i; - + for (COFFYAML::Section &Sec : Obj.Sections) { // If the name is less than 8 bytes, store it in place, otherwise // store it in the string table. StringRef Name = Sec.Name; @@ -103,11 +99,7 @@ struct COFFParser { } bool parseSymbols() { - for (std::vector<COFFYAML::Symbol>::iterator i = Obj.Symbols.begin(), - e = Obj.Symbols.end(); - i != e; ++i) { - COFFYAML::Symbol &Sym = *i; - + for (COFFYAML::Symbol &Sym : Obj.Symbols) { // If the name is less than 8 bytes, store it in place, otherwise // store it in the string table. StringRef Name = Sym.Name; diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp index e0dde4433d24..9b9266998ea6 100644 --- a/llvm/lib/ObjectYAML/ELFYAML.cpp +++ b/llvm/lib/ObjectYAML/ELFYAML.cpp @@ -464,29 +464,31 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO, BCaseMask(EF_MIPS_ARCH_64R6, EF_MIPS_ARCH); break; case ELF::EM_HEXAGON: - BCase(EF_HEXAGON_MACH_V2); - BCase(EF_HEXAGON_MACH_V3); - BCase(EF_HEXAGON_MACH_V4); - BCase(EF_HEXAGON_MACH_V5); - BCase(EF_HEXAGON_MACH_V55); - BCase(EF_HEXAGON_MACH_V60); - BCase(EF_HEXAGON_MACH_V62); - BCase(EF_HEXAGON_MACH_V65); - BCase(EF_HEXAGON_MACH_V66); - BCase(EF_HEXAGON_MACH_V67); - BCase(EF_HEXAGON_MACH_V67T); - BCase(EF_HEXAGON_MACH_V68); - BCase(EF_HEXAGON_ISA_V2); - BCase(EF_HEXAGON_ISA_V3); - BCase(EF_HEXAGON_ISA_V4); - BCase(EF_HEXAGON_ISA_V5); - BCase(EF_HEXAGON_ISA_V55); - BCase(EF_HEXAGON_ISA_V60); - BCase(EF_HEXAGON_ISA_V62); - BCase(EF_HEXAGON_ISA_V65); - BCase(EF_HEXAGON_ISA_V66); - BCase(EF_HEXAGON_ISA_V67); - BCase(EF_HEXAGON_ISA_V68); + BCaseMask(EF_HEXAGON_MACH_V2, EF_HEXAGON_MACH); + BCaseMask(EF_HEXAGON_MACH_V3, EF_HEXAGON_MACH); + BCaseMask(EF_HEXAGON_MACH_V4, EF_HEXAGON_MACH); + 
BCaseMask(EF_HEXAGON_MACH_V5, EF_HEXAGON_MACH); + BCaseMask(EF_HEXAGON_MACH_V55, EF_HEXAGON_MACH); + BCaseMask(EF_HEXAGON_MACH_V60, EF_HEXAGON_MACH); + BCaseMask(EF_HEXAGON_MACH_V62, EF_HEXAGON_MACH); + BCaseMask(EF_HEXAGON_MACH_V65, EF_HEXAGON_MACH); + BCaseMask(EF_HEXAGON_MACH_V66, EF_HEXAGON_MACH); + BCaseMask(EF_HEXAGON_MACH_V67, EF_HEXAGON_MACH); + BCaseMask(EF_HEXAGON_MACH_V67T, EF_HEXAGON_MACH); + BCaseMask(EF_HEXAGON_MACH_V68, EF_HEXAGON_MACH); + BCaseMask(EF_HEXAGON_MACH_V69, EF_HEXAGON_MACH); + BCaseMask(EF_HEXAGON_ISA_V2, EF_HEXAGON_ISA); + BCaseMask(EF_HEXAGON_ISA_V3, EF_HEXAGON_ISA); + BCaseMask(EF_HEXAGON_ISA_V4, EF_HEXAGON_ISA); + BCaseMask(EF_HEXAGON_ISA_V5, EF_HEXAGON_ISA); + BCaseMask(EF_HEXAGON_ISA_V55, EF_HEXAGON_ISA); + BCaseMask(EF_HEXAGON_ISA_V60, EF_HEXAGON_ISA); + BCaseMask(EF_HEXAGON_ISA_V62, EF_HEXAGON_ISA); + BCaseMask(EF_HEXAGON_ISA_V65, EF_HEXAGON_ISA); + BCaseMask(EF_HEXAGON_ISA_V66, EF_HEXAGON_ISA); + BCaseMask(EF_HEXAGON_ISA_V67, EF_HEXAGON_ISA); + BCaseMask(EF_HEXAGON_ISA_V68, EF_HEXAGON_ISA); + BCaseMask(EF_HEXAGON_ISA_V69, EF_HEXAGON_ISA); break; case ELF::EM_AVR: BCaseMask(EF_AVR_ARCH_AVR1, EF_AVR_ARCH_MASK); diff --git a/llvm/lib/ObjectYAML/XCOFFEmitter.cpp b/llvm/lib/ObjectYAML/XCOFFEmitter.cpp index 85d1f82bfafc..cf0d058c518c 100644 --- a/llvm/lib/ObjectYAML/XCOFFEmitter.cpp +++ b/llvm/lib/ObjectYAML/XCOFFEmitter.cpp @@ -86,13 +86,13 @@ bool XCOFFWriter::nameShouldBeInStringTable(StringRef SymbolName) { } bool XCOFFWriter::initRelocations(uint64_t &CurrentOffset) { - for (uint16_t I = 0, E = InitSections.size(); I < E; ++I) { - if (!InitSections[I].Relocations.empty()) { - InitSections[I].NumberOfRelocations = InitSections[I].Relocations.size(); - InitSections[I].FileOffsetToRelocations = CurrentOffset; + for (XCOFFYAML::Section &InitSection : InitSections) { + if (!InitSection.Relocations.empty()) { + InitSection.NumberOfRelocations = InitSection.Relocations.size(); + InitSection.FileOffsetToRelocations = CurrentOffset; 
uint64_t RelSize = Is64Bit ? XCOFF::RelocationSerializationSize64 : XCOFF::RelocationSerializationSize32; - CurrentOffset += InitSections[I].NumberOfRelocations * RelSize; + CurrentOffset += InitSection.NumberOfRelocations * RelSize; if (CurrentOffset > MaxRawDataSize) { ErrHandler("maximum object size of" + Twine(MaxRawDataSize) + "exceeded when writing relocation data"); diff --git a/llvm/lib/ObjectYAML/YAML.cpp b/llvm/lib/ObjectYAML/YAML.cpp index 5dcb113d3395..54e8c627d5a1 100644 --- a/llvm/lib/ObjectYAML/YAML.cpp +++ b/llvm/lib/ObjectYAML/YAML.cpp @@ -30,9 +30,8 @@ StringRef yaml::ScalarTraits<yaml::BinaryRef>::input(StringRef Scalar, void *, return "BinaryRef hex string must contain an even number of nybbles."; // TODO: Can we improve YAMLIO to permit a more accurate diagnostic here? // (e.g. a caret pointing to the offending character). - for (unsigned I = 0, N = Scalar.size(); I != N; ++I) - if (!llvm::isHexDigit(Scalar[I])) - return "BinaryRef hex string must contain only hex digits."; + if (!llvm::all_of(Scalar, llvm::isHexDigit)) + return "BinaryRef hex string must contain only hex digits."; Val = yaml::BinaryRef(Scalar); return {}; } diff --git a/llvm/lib/Option/OptTable.cpp b/llvm/lib/Option/OptTable.cpp index 19e05b9272bb..c93b7ad7f5fa 100644 --- a/llvm/lib/Option/OptTable.cpp +++ b/llvm/lib/Option/OptTable.cpp @@ -591,16 +591,16 @@ static void PrintHelpOptionList(raw_ostream &OS, StringRef Title, // Find the maximum option length. unsigned OptionFieldWidth = 0; - for (unsigned i = 0, e = OptionHelp.size(); i != e; ++i) { + for (const OptionInfo &Opt : OptionHelp) { // Limit the amount of padding we are willing to give up for alignment. 
- unsigned Length = OptionHelp[i].Name.size(); + unsigned Length = Opt.Name.size(); if (Length <= 23) OptionFieldWidth = std::max(OptionFieldWidth, Length); } const unsigned InitialPad = 2; - for (unsigned i = 0, e = OptionHelp.size(); i != e; ++i) { - const std::string &Option = OptionHelp[i].Name; + for (const OptionInfo &Opt : OptionHelp) { + const std::string &Option = Opt.Name; int Pad = OptionFieldWidth - int(Option.size()); OS.indent(InitialPad) << Option; @@ -609,7 +609,7 @@ static void PrintHelpOptionList(raw_ostream &OS, StringRef Title, OS << "\n"; Pad = OptionFieldWidth + InitialPad; } - OS.indent(Pad + 1) << OptionHelp[i].HelpText << '\n'; + OS.indent(Pad + 1) << Opt.HelpText << '\n'; } } diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 561a881bab0c..d7615ef4e9bf 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -28,6 +28,7 @@ #include "llvm/Analysis/CGSCCPassManager.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CostModel.h" +#include "llvm/Analysis/CycleAnalysis.h" #include "llvm/Analysis/DDG.h" #include "llvm/Analysis/DDGPrinter.h" #include "llvm/Analysis/Delinearization.h" @@ -151,6 +152,7 @@ #include "llvm/Transforms/Scalar/DeadStoreElimination.h" #include "llvm/Transforms/Scalar/DivRemPairs.h" #include "llvm/Transforms/Scalar/EarlyCSE.h" +#include "llvm/Transforms/Scalar/FlattenCFG.h" #include "llvm/Transforms/Scalar/Float2Int.h" #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Scalar/GuardWidening.h" diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index de1b0ace7876..a6a36ff25402 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -178,6 +178,10 @@ static cl::opt<bool> EnableNoRerunSimplificationPipeline( "than once in the case that SCC mutations cause a function to be " "visited multiple times as long as the function has not been changed")); 
+static cl::opt<bool> EnableMergeFunctions( + "enable-merge-functions", cl::init(false), cl::Hidden, + cl::desc("Enable function merging as part of the optimization pipeline")); + PipelineTuningOptions::PipelineTuningOptions() { LoopInterleaving = true; LoopVectorization = true; @@ -187,7 +191,7 @@ PipelineTuningOptions::PipelineTuningOptions() { LicmMssaOptCap = SetLicmMssaOptCap; LicmMssaNoAccForPromotionCap = SetLicmMssaNoAccForPromotionCap; CallGraphProfile = true; - MergeFunctions = false; + MergeFunctions = EnableMergeFunctions; EagerlyInvalidateAnalyses = EnableEagerlyInvalidateAnalyses; } @@ -418,9 +422,9 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, FPM.addPass(CorrelatedValuePropagationPass()); FPM.addPass(SimplifyCFGPass()); + FPM.addPass(InstCombinePass()); if (Level == OptimizationLevel::O3) FPM.addPass(AggressiveInstCombinePass()); - FPM.addPass(InstCombinePass()); if (!Level.isOptimizingForSize()) FPM.addPass(LibCallsShrinkWrapPass()); @@ -754,9 +758,11 @@ PassBuilder::buildInlinerPipeline(OptimizationLevel Level, return MIWP; } -ModuleInlinerPass +ModulePassManager PassBuilder::buildModuleInlinerPipeline(OptimizationLevel Level, ThinOrFullLTOPhase Phase) { + ModulePassManager MPM; + InlineParams IP = getInlineParamsFromOptLevel(Level); if (Phase == ThinOrFullLTOPhase::ThinLTOPreLink && PGOOpt && PGOOpt->Action == PGOOptions::SampleUse) @@ -773,7 +779,16 @@ PassBuilder::buildModuleInlinerPipeline(OptimizationLevel Level, // inline deferral logic in module inliner. 
IP.EnableDeferral = false; - return ModuleInlinerPass(IP, UseInlineAdvisor); + MPM.addPass(ModuleInlinerPass(IP, UseInlineAdvisor)); + + MPM.addPass(createModuleToFunctionPassAdaptor( + buildFunctionSimplificationPipeline(Level, Phase), + PTO.EagerlyInvalidateAnalyses)); + + MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor( + CoroSplitPass(Level != OptimizationLevel::O0))); + + return MPM; } ModulePassManager @@ -980,26 +995,28 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level, FPM.addPass(InstCombinePass()); if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) { + ExtraVectorPassManager ExtraPasses; // At higher optimization levels, try to clean up any runtime overlap and // alignment checks inserted by the vectorizer. We want to track correlated // runtime checks for two inner loops in the same outer loop, fold any // common computations, hoist loop-invariant aspects out of any outer loop, // and unswitch the runtime checks if possible. Once hoisted, we may have // dead (or speculatable) control flows or more combining opportunities. 
- FPM.addPass(EarlyCSEPass()); - FPM.addPass(CorrelatedValuePropagationPass()); - FPM.addPass(InstCombinePass()); + ExtraPasses.addPass(EarlyCSEPass()); + ExtraPasses.addPass(CorrelatedValuePropagationPass()); + ExtraPasses.addPass(InstCombinePass()); LoopPassManager LPM; LPM.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap)); LPM.addPass(SimpleLoopUnswitchPass(/* NonTrivial */ Level == OptimizationLevel::O3)); - FPM.addPass( + ExtraPasses.addPass( RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>()); - FPM.addPass( + ExtraPasses.addPass( createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true)); - FPM.addPass(SimplifyCFGPass()); - FPM.addPass(InstCombinePass()); + ExtraPasses.addPass(SimplifyCFGPass()); + ExtraPasses.addPass(InstCombinePass()); + FPM.addPass(std::move(ExtraPasses)); } // Now that we've formed fast to execute loop structures, we do further @@ -1149,8 +1166,9 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, // Disable header duplication at -Oz. LPM.addPass(LoopRotatePass(Level != OptimizationLevel::Oz, LTOPreLink)); // Some loops may have become dead by now. Try to delete them. - // FIXME: see disscussion in https://reviews.llvm.org/D112851 - // this may need to be revisited once GVN is more powerful. + // FIXME: see discussion in https://reviews.llvm.org/D112851, + // this may need to be revisited once we run GVN before loop deletion + // in the simplification pipeline. LPM.addPass(LoopDeletionPass()); OptimizePM.addPass(createFunctionToLoopPassAdaptor( std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/false)); @@ -1167,23 +1185,6 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, addVectorPasses(Level, OptimizePM, /* IsFullLTO */ false); - // Split out cold code. Splitting is done late to avoid hiding context from - // other optimizations and inadvertently regressing performance. 
The tradeoff - // is that this has a higher code size cost than splitting early. - if (EnableHotColdSplit && !LTOPreLink) - MPM.addPass(HotColdSplittingPass()); - - // Search the code for similar regions of code. If enough similar regions can - // be found where extracting the regions into their own function will decrease - // the size of the program, we extract the regions, a deduplicate the - // structurally similar regions. - if (EnableIROutliner) - MPM.addPass(IROutlinerPass()); - - // Merge functions if requested. - if (PTO.MergeFunctions) - MPM.addPass(MergeFunctionsPass()); - // LoopSink pass sinks instructions hoisted by LICM, which serves as a // canonicalization pass that enables other optimizations. As a result, // LoopSink pass needs to be a very late IR pass to avoid undoing LICM @@ -1211,6 +1212,23 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, for (auto &C : OptimizerLastEPCallbacks) C(MPM, Level); + // Split out cold code. Splitting is done late to avoid hiding context from + // other optimizations and inadvertently regressing performance. The tradeoff + // is that this has a higher code size cost than splitting early. + if (EnableHotColdSplit && !LTOPreLink) + MPM.addPass(HotColdSplittingPass()); + + // Search the code for similar regions of code. If enough similar regions can + // be found where extracting the regions into their own function will decrease + // the size of the program, we extract the regions, a deduplicate the + // structurally similar regions. + if (EnableIROutliner) + MPM.addPass(IROutlinerPass()); + + // Merge functions if requested. + if (PTO.MergeFunctions) + MPM.addPass(MergeFunctionsPass()); + if (PTO.CallGraphProfile) MPM.addPass(CGProfilePass()); @@ -1521,9 +1539,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, // function pointers. When this happens, we often have to resolve varargs // calls, etc, so let instcombine do this. 
FunctionPassManager PeepholeFPM; + PeepholeFPM.addPass(InstCombinePass()); if (Level == OptimizationLevel::O3) PeepholeFPM.addPass(AggressiveInstCombinePass()); - PeepholeFPM.addPass(InstCombinePass()); invokePeepholeEPCallbacks(PeepholeFPM, Level); MPM.addPass(createModuleToFunctionPassAdaptor(std::move(PeepholeFPM), diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index c2032b5b8276..74613a7fcce0 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -185,6 +185,7 @@ FUNCTION_ANALYSIS("aa", AAManager()) FUNCTION_ANALYSIS("assumptions", AssumptionAnalysis()) FUNCTION_ANALYSIS("block-freq", BlockFrequencyAnalysis()) FUNCTION_ANALYSIS("branch-prob", BranchProbabilityAnalysis()) +FUNCTION_ANALYSIS("cycles", CycleAnalysis()) FUNCTION_ANALYSIS("domtree", DominatorTreeAnalysis()) FUNCTION_ANALYSIS("postdomtree", PostDominatorTreeAnalysis()) FUNCTION_ANALYSIS("demanded-bits", DemandedBitsAnalysis()) @@ -202,6 +203,7 @@ FUNCTION_ANALYSIS("no-op-function", NoOpFunctionAnalysis()) FUNCTION_ANALYSIS("opt-remark-emit", OptimizationRemarkEmitterAnalysis()) FUNCTION_ANALYSIS("scalar-evolution", ScalarEvolutionAnalysis()) FUNCTION_ANALYSIS("should-not-run-function-passes", ShouldNotRunFunctionPassesAnalysis()) +FUNCTION_ANALYSIS("should-run-extra-vector-passes", ShouldRunExtraVectorPasses()) FUNCTION_ANALYSIS("stack-safety-local", StackSafetyAnalysis()) FUNCTION_ANALYSIS("targetlibinfo", TargetLibraryAnalysis()) FUNCTION_ANALYSIS("targetir", @@ -253,6 +255,7 @@ FUNCTION_PASS("dse", DSEPass()) FUNCTION_PASS("dot-cfg", CFGPrinterPass()) FUNCTION_PASS("dot-cfg-only", CFGOnlyPrinterPass()) FUNCTION_PASS("fix-irreducible", FixIrreduciblePass()) +FUNCTION_PASS("flattencfg", FlattenCFGPass()) FUNCTION_PASS("make-guards-explicit", MakeGuardsExplicitPass()) FUNCTION_PASS("gvn-hoist", GVNHoistPass()) FUNCTION_PASS("gvn-sink", GVNSinkPass()) @@ -303,6 +306,7 @@ FUNCTION_PASS("print<assumptions>", 
AssumptionPrinterPass(dbgs())) FUNCTION_PASS("print<block-freq>", BlockFrequencyPrinterPass(dbgs())) FUNCTION_PASS("print<branch-prob>", BranchProbabilityPrinterPass(dbgs())) FUNCTION_PASS("print<cost-model>", CostModelPrinterPass(dbgs())) +FUNCTION_PASS("print<cycles>", CycleInfoPrinterPass(dbgs())) FUNCTION_PASS("print<da>", DependenceAnalysisPrinterPass(dbgs())) FUNCTION_PASS("print<divergence>", DivergenceAnalysisPrinterPass(dbgs())) FUNCTION_PASS("print<domtree>", DominatorTreePrinterPass(dbgs())) diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp b/llvm/lib/Passes/StandardInstrumentations.cpp index 27a6c519ff82..23c825c78713 100644 --- a/llvm/lib/Passes/StandardInstrumentations.cpp +++ b/llvm/lib/Passes/StandardInstrumentations.cpp @@ -1262,11 +1262,6 @@ void InLineChangePrinter::registerCallbacks(PassInstrumentationCallbacks &PIC) { namespace { -enum IRChangeDiffType { InBefore, InAfter, IsCommon, NumIRChangeDiffTypes }; - -// Describe where a given element exists. -std::string Colours[NumIRChangeDiffTypes]; - class DisplayNode; class DotCfgDiffDisplayGraph; @@ -1274,19 +1269,19 @@ class DotCfgDiffDisplayGraph; class DisplayElement { public: // Is this in before, after, or both? - IRChangeDiffType getType() const { return Type; } + StringRef getColour() const { return Colour; } protected: - DisplayElement(IRChangeDiffType T) : Type(T) {} - const IRChangeDiffType Type; + DisplayElement(StringRef Colour) : Colour(Colour) {} + const StringRef Colour; }; // An edge representing a transition between basic blocks in the // dot-cfg-changes graph. class DisplayEdge : public DisplayElement { public: - DisplayEdge(std::string V, DisplayNode &Node, IRChangeDiffType T) - : DisplayElement(T), Value(V), Node(Node) {} + DisplayEdge(std::string Value, DisplayNode &Node, StringRef Colour) + : DisplayElement(Colour), Value(Value), Node(Node) {} // The value on which the transition is made. 
std::string getValue() const { return Value; } // The node (representing a basic block) reached by this transition. @@ -1302,8 +1297,8 @@ class DisplayNode : public DisplayElement { public: // \p C is the content for the node, \p T indicates the colour for the // outline of the node - DisplayNode(std::string C, IRChangeDiffType T) - : DisplayElement(T), Content(C) {} + DisplayNode(std::string Content, StringRef Colour) + : DisplayElement(Colour), Content(Content) {} // Iterator to the child nodes. Required by GraphWriter. using ChildIterator = std::unordered_set<DisplayNode *>::const_iterator; @@ -1315,13 +1310,13 @@ public: EdgeIterator edges_begin() const { return EdgePtrs.cbegin(); } EdgeIterator edges_end() const { return EdgePtrs.cend(); } - // Create an edge to \p Node on value \p V, with type \p T. - void createEdge(StringRef V, DisplayNode &Node, IRChangeDiffType T); + // Create an edge to \p Node on value \p Value, with colour \p Colour. + void createEdge(StringRef Value, DisplayNode &Node, StringRef Colour); // Return the content of this node. std::string getContent() const { return Content; } - // Return the type of the edge to node \p S. + // Return the edge to node \p S. const DisplayEdge &getEdge(const DisplayNode &To) const { assert(EdgeMap.find(&To) != EdgeMap.end() && "Expected to find edge."); return *EdgeMap.find(&To)->second; @@ -1383,9 +1378,9 @@ public: } // Create a node. - void createNode(std::string C, IRChangeDiffType T) { + void createNode(std::string C, StringRef Colour) { assert(!NodeGenerationComplete && "Unexpected node creation"); - Nodes.emplace_back(C, T); + Nodes.emplace_back(C, Colour); } // Return the node at index \p N to avoid problems with vectors reallocating. DisplayNode &getNode(unsigned N) { @@ -1408,13 +1403,13 @@ public: // Return a string with colour information for Dot. Required by GraphWriter. 
std::string getNodeAttributes(const DisplayNode &Node) const { - return attribute(Node.getType()); + return attribute(Node.getColour()); } // Return a string with colour information for Dot. Required by GraphWriter. std::string getEdgeColorAttr(const DisplayNode &From, const DisplayNode &To) const { - return attribute(From.getEdge(To).getType()); + return attribute(From.getEdge(To).getColour()); } // Get the starting basic block. Required by GraphWriter. @@ -1425,7 +1420,9 @@ public: protected: // Return the string containing the colour to use as a Dot attribute. - std::string attribute(IRChangeDiffType T) const; + std::string attribute(StringRef Colour) const { + return "color=" + Colour.str(); + } bool NodeGenerationComplete = false; const std::string GraphName; @@ -1434,10 +1431,10 @@ protected: DisplayNode *EntryNode = nullptr; }; -void DisplayNode::createEdge(StringRef V, DisplayNode &Node, - IRChangeDiffType T) { +void DisplayNode::createEdge(StringRef Value, DisplayNode &Node, + StringRef Colour) { assert(!AllEdgesCreated && "Expected to be able to still create edges."); - Edges.emplace_back(V.str(), Node, T); + Edges.emplace_back(Value.str(), Node, Colour); Children.insert(&Node); } @@ -1458,13 +1455,14 @@ public: DotCfgDiffNode() = delete; // Create a node in Dot difference graph \p G representing the basic block - // represented by \p BD with type \p T (where it exists). + // represented by \p BD with colour \p Colour (where it exists). 
DotCfgDiffNode(DotCfgDiff &G, unsigned N, const BlockDataT<DCData> &BD, - IRChangeDiffType T) - : Graph(G), N(N), Data{&BD, nullptr}, Type(T) {} + StringRef Colour) + : Graph(G), N(N), Data{&BD, nullptr}, Colour(Colour) {} DotCfgDiffNode(const DotCfgDiffNode &DN) - : Graph(DN.Graph), N(DN.N), Data{DN.Data[0], DN.Data[1]}, Type(DN.Type), - EdgesMap(DN.EdgesMap), Children(DN.Children), Edges(DN.Edges) {} + : Graph(DN.Graph), N(DN.N), Data{DN.Data[0], DN.Data[1]}, + Colour(DN.Colour), EdgesMap(DN.EdgesMap), Children(DN.Children), + Edges(DN.Edges) {} unsigned getIndex() const { return N; } @@ -1473,29 +1471,29 @@ public: assert(Data[0] && "Expected Data[0] to be set."); return Data[0]->getLabel(); } - // Return where this block exists. - IRChangeDiffType getType() const { return Type; } + // Return the colour for this block + StringRef getColour() const { return Colour; } // Change this basic block from being only in before to being common. // Save the pointer to \p Other. void setCommon(const BlockDataT<DCData> &Other) { assert(!Data[1] && "Expected only one block datum"); Data[1] = &Other; - Type = IsCommon; + Colour = CommonColour; } - // Add an edge to \p E of type {\p Value, \p T}. - void addEdge(unsigned E, StringRef Value, IRChangeDiffType T) { + // Add an edge to \p E of colour {\p Value, \p Colour}. + void addEdge(unsigned E, StringRef Value, StringRef Colour) { // This is a new edge or it is an edge being made common. - assert((EdgesMap.count(E) == 0 || T == IsCommon) && - "Unexpected edge count and type."); - EdgesMap[E] = {Value.str(), T}; + assert((EdgesMap.count(E) == 0 || Colour == CommonColour) && + "Unexpected edge count and color."); + EdgesMap[E] = {Value.str(), Colour}; } // Record the children and create edges. void finalize(DotCfgDiff &G); - // Return the type of the edge to node \p S. - std::pair<std::string, IRChangeDiffType> getEdge(const unsigned S) const { + // Return the colour of the edge to node \p S. 
+ StringRef getEdgeColour(const unsigned S) const { assert(EdgesMap.count(S) == 1 && "Expected to find edge."); - return EdgesMap.at(S); + return EdgesMap.at(S).second; } // Return the string representing the basic block. @@ -1508,8 +1506,8 @@ protected: DotCfgDiff &Graph; const unsigned N; const BlockDataT<DCData> *Data[2]; - IRChangeDiffType Type; - std::map<const unsigned, std::pair<std::string, IRChangeDiffType>> EdgesMap; + StringRef Colour; + std::map<const unsigned, std::pair<std::string, StringRef>> EdgesMap; std::vector<unsigned> Children; std::vector<unsigned> Edges; }; @@ -1552,12 +1550,11 @@ public: protected: // Return the string surrounded by HTML to make it the appropriate colour. - std::string colourize(std::string S, IRChangeDiffType T) const; + std::string colourize(std::string S, StringRef Colour) const; - void createNode(StringRef Label, const BlockDataT<DCData> &BD, - IRChangeDiffType T) { + void createNode(StringRef Label, const BlockDataT<DCData> &BD, StringRef C) { unsigned Pos = Nodes.size(); - Nodes.emplace_back(*this, Pos, BD, T); + Nodes.emplace_back(*this, Pos, BD, C); NodePosition.insert({Label, Pos}); } @@ -1572,7 +1569,7 @@ protected: }; std::string DotCfgDiffNode::getBodyContent() const { - if (Type == IsCommon) { + if (Colour == CommonColour) { assert(Data[1] && "Expected Data[1] to be set."); StringRef SR[2]; @@ -1586,11 +1583,11 @@ std::string DotCfgDiffNode::getBodyContent() const { } SmallString<80> OldLineFormat = formatv( - "<FONT COLOR=\"{0}\">%l</FONT><BR align=\"left\"/>", Colours[InBefore]); + "<FONT COLOR=\"{0}\">%l</FONT><BR align=\"left\"/>", BeforeColour); SmallString<80> NewLineFormat = formatv( - "<FONT COLOR=\"{0}\">%l</FONT><BR align=\"left\"/>", Colours[InAfter]); + "<FONT COLOR=\"{0}\">%l</FONT><BR align=\"left\"/>", AfterColour); SmallString<80> UnchangedLineFormat = formatv( - "<FONT COLOR=\"{0}\">%l</FONT><BR align=\"left\"/>", Colours[IsCommon]); + "<FONT COLOR=\"{0}\">%l</FONT><BR align=\"left\"/>", 
CommonColour); std::string Diff = Data[0]->getLabel().str(); Diff += ":\n<BR align=\"left\"/>" + doSystemDiff(makeHTMLReady(SR[0]), makeHTMLReady(SR[1]), @@ -1625,7 +1622,7 @@ std::string DotCfgDiffNode::getBodyContent() const { // drop predecessors as they can be big and are redundant BS1 = BS1.drop_until([](char C) { return C == '\n'; }).drop_front(); - std::string S = "<FONT COLOR=\"" + Colours[Type] + "\">" + Label.str() + ":"; + std::string S = "<FONT COLOR=\"" + Colour.str() + "\">" + Label.str() + ":"; // align each line to the left. while (BS1.size()) { @@ -1638,26 +1635,22 @@ std::string DotCfgDiffNode::getBodyContent() const { return S; } -std::string DotCfgDiff::colourize(std::string S, IRChangeDiffType T) const { +std::string DotCfgDiff::colourize(std::string S, StringRef Colour) const { if (S.length() == 0) return S; - return "<FONT COLOR=\"" + Colours[T] + "\">" + S + "</FONT>"; -} - -std::string DotCfgDiffDisplayGraph::attribute(IRChangeDiffType T) const { - return "color=" + Colours[T]; + return "<FONT COLOR=\"" + Colour.str() + "\">" + S + "</FONT>"; } DotCfgDiff::DotCfgDiff(StringRef Title, const FuncDataT<DCData> &Before, const FuncDataT<DCData> &After) : GraphName(Title.str()) { - StringMap<IRChangeDiffType> EdgesMap; + StringMap<StringRef> EdgesMap; // Handle each basic block in the before IR. for (auto &B : Before.getData()) { StringRef Label = B.getKey(); const BlockDataT<DCData> &BD = B.getValue(); - createNode(Label, BD, InBefore); + createNode(Label, BD, BeforeColour); // Create transitions with names made up of the from block label, the value // on which the transition is made and the to block label. 
@@ -1666,7 +1659,7 @@ DotCfgDiff::DotCfgDiff(StringRef Title, const FuncDataT<DCData> &Before, Sink != E; ++Sink) { std::string Key = (Label + " " + Sink->getKey().str()).str() + " " + BD.getData().getSuccessorLabel(Sink->getKey()).str(); - EdgesMap.insert({Key, InBefore}); + EdgesMap.insert({Key, BeforeColour}); } } @@ -1677,7 +1670,7 @@ DotCfgDiff::DotCfgDiff(StringRef Title, const FuncDataT<DCData> &Before, unsigned C = NodePosition.count(Label); if (C == 0) // This only exists in the after IR. Create the node. - createNode(Label, BD, InAfter); + createNode(Label, BD, AfterColour); else { assert(C == 1 && "Unexpected multiple nodes."); Nodes[NodePosition[Label]].setCommon(BD); @@ -1690,9 +1683,9 @@ DotCfgDiff::DotCfgDiff(StringRef Title, const FuncDataT<DCData> &Before, BD.getData().getSuccessorLabel(Sink->getKey()).str(); unsigned C = EdgesMap.count(Key); if (C == 0) - EdgesMap.insert({Key, InAfter}); + EdgesMap.insert({Key, AfterColour}); else { - EdgesMap[Key] = IsCommon; + EdgesMap[Key] = CommonColour; } } } @@ -1712,18 +1705,18 @@ DotCfgDiff::DotCfgDiff(StringRef Title, const FuncDataT<DCData> &Before, DotCfgDiffNode &SourceNode = Nodes[NodePosition[Source]]; assert(NodePosition.count(Sink) == 1 && "Expected to find node."); unsigned SinkNode = NodePosition[Sink]; - IRChangeDiffType T = E.second; + StringRef Colour = E.second; // Look for an edge from Source to Sink if (EdgeLabels.count(SourceSink) == 0) - EdgeLabels.insert({SourceSink, colourize(Value.str(), T)}); + EdgeLabels.insert({SourceSink, colourize(Value.str(), Colour)}); else { StringRef V = EdgeLabels.find(SourceSink)->getValue(); - std::string NV = colourize(V.str() + " " + Value.str(), T); - T = IsCommon; + std::string NV = colourize(V.str() + " " + Value.str(), Colour); + Colour = CommonColour; EdgeLabels[SourceSink] = NV; } - SourceNode.addEdge(SinkNode, Value, T); + SourceNode.addEdge(SinkNode, Value, Colour); } for (auto &I : Nodes) I.finalize(*this); @@ -1744,7 +1737,7 @@ 
DotCfgDiffDisplayGraph DotCfgDiff::createDisplayGraph(StringRef Title, for (auto &I : Nodes) { if (I.getIndex() == Entry) EntryIndex = Index; - G.createNode(I.getBodyContent(), I.getType()); + G.createNode(I.getBodyContent(), I.getColour()); NodeMap.insert({I.getIndex(), Index++}); } assert(EntryIndex >= 0 && "Expected entry node index to be set."); @@ -1766,12 +1759,12 @@ void DotCfgDiffNode::createDisplayEdges( for (auto I : Edges) { unsigned SinkNodeIndex = I; - IRChangeDiffType Type = getEdge(SinkNodeIndex).second; + StringRef Colour = getEdgeColour(SinkNodeIndex); const DotCfgDiffNode *SinkNode = &Graph.getNode(SinkNodeIndex); StringRef Label = Graph.getEdgeSourceLabel(getIndex(), SinkNodeIndex); DisplayNode &SinkDisplayNode = DisplayGraph.getNode(SinkNode->getIndex()); - SourceDisplayNode.createEdge(Label, SinkDisplayNode, Type); + SourceDisplayNode.createEdge(Label, SinkDisplayNode, Colour); } SourceDisplayNode.createEdgeMap(); } @@ -1891,12 +1884,7 @@ DCData::DCData(const BasicBlock &B) { } DotCfgChangeReporter::DotCfgChangeReporter(bool Verbose) - : ChangeReporter<IRDataT<DCData>>(Verbose) { - // Set up the colours based on the hidden options. 
- Colours[InBefore] = BeforeColour; - Colours[InAfter] = AfterColour; - Colours[IsCommon] = CommonColour; -} + : ChangeReporter<IRDataT<DCData>>(Verbose) {} void DotCfgChangeReporter::handleFunctionCompare( StringRef Name, StringRef Prefix, StringRef PassID, StringRef Divider, diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp index ab3487ecffe8..34e0c5ebcd58 100644 --- a/llvm/lib/ProfileData/InstrProf.cpp +++ b/llvm/lib/ProfileData/InstrProf.cpp @@ -110,6 +110,18 @@ static std::string getInstrProfErrString(instrprof_error Err, case instrprof_error::malformed: OS << "malformed instrumentation profile data"; break; + case instrprof_error::missing_debug_info_for_correlation: + OS << "debug info for correlation is required"; + break; + case instrprof_error::unexpected_debug_info_for_correlation: + OS << "debug info for correlation is not necessary"; + break; + case instrprof_error::unable_to_correlate_profile: + OS << "unable to correlate profile"; + break; + case instrprof_error::unsupported_debug_format: + OS << "unsupported debug info format (only DWARF is supported)"; + break; case instrprof_error::invalid_prof: OS << "invalid profile created. 
Please file a bug " "at: " BUG_REPORT_URL @@ -533,8 +545,8 @@ Error readPGOFuncNameStrings(StringRef NameStrings, InstrProfSymtab &Symtab) { void InstrProfRecord::accumulateCounts(CountSumOrPercent &Sum) const { uint64_t FuncSum = 0; Sum.NumEntries += Counts.size(); - for (size_t F = 0, E = Counts.size(); F < E; ++F) - FuncSum += Counts[F]; + for (uint64_t Count : Counts) + FuncSum += Count; Sum.CountSum += FuncSum; for (uint32_t VK = IPVK_First; VK <= IPVK_Last; ++VK) { @@ -674,9 +686,9 @@ void InstrProfValueSiteRecord::merge(InstrProfValueSiteRecord &Input, void InstrProfValueSiteRecord::scale(uint64_t N, uint64_t D, function_ref<void(instrprof_error)> Warn) { - for (auto I = ValueData.begin(), IE = ValueData.end(); I != IE; ++I) { + for (InstrProfValueData &I : ValueData) { bool Overflowed; - I->Count = SaturatingMultiply(I->Count, N, &Overflowed) / D; + I.Count = SaturatingMultiply(I.Count, N, &Overflowed) / D; if (Overflowed) Warn(instrprof_error::counter_overflow); } @@ -1175,7 +1187,8 @@ bool canRenameComdatFunc(const Function &F, bool CheckAddressTaken) { // Create a COMDAT variable INSTR_PROF_RAW_VERSION_VAR to make the runtime // aware this is an ir_level profile so it can set the version flag. 
GlobalVariable *createIRLevelProfileFlagVar(Module &M, bool IsCS, - bool InstrEntryBBEnabled) { + bool InstrEntryBBEnabled, + bool DebugInfoCorrelate) { const StringRef VarName(INSTR_PROF_QUOTE(INSTR_PROF_RAW_VERSION_VAR)); Type *IntTy64 = Type::getInt64Ty(M.getContext()); uint64_t ProfileVersion = (INSTR_PROF_RAW_VERSION | VARIANT_MASK_IR_PROF); @@ -1183,6 +1196,8 @@ GlobalVariable *createIRLevelProfileFlagVar(Module &M, bool IsCS, ProfileVersion |= VARIANT_MASK_CSIR_PROF; if (InstrEntryBBEnabled) ProfileVersion |= VARIANT_MASK_INSTR_ENTRY; + if (DebugInfoCorrelate) + ProfileVersion |= VARIANT_MASK_DBG_CORRELATE; auto IRLevelVersionVariable = new GlobalVariable( M, IntTy64, true, GlobalValue::WeakAnyLinkage, Constant::getIntegerValue(IntTy64, APInt(64, ProfileVersion)), VarName); diff --git a/llvm/lib/ProfileData/InstrProfCorrelator.cpp b/llvm/lib/ProfileData/InstrProfCorrelator.cpp new file mode 100644 index 000000000000..f9c113027da2 --- /dev/null +++ b/llvm/lib/ProfileData/InstrProfCorrelator.cpp @@ -0,0 +1,264 @@ +//===-- InstrProfCorrelator.cpp -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/ProfileData/InstrProfCorrelator.h" +#include "llvm/Object/MachO.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/Path.h" + +#define DEBUG_TYPE "correlator" + +using namespace llvm; + +/// Get the __llvm_prf_cnts section. 
+Expected<object::SectionRef> getCountersSection(const object::ObjectFile &Obj) { + for (auto &Section : Obj.sections()) + if (auto SectionName = Section.getName()) + if (SectionName.get() == INSTR_PROF_CNTS_SECT_NAME) + return Section; + return make_error<InstrProfError>( + instrprof_error::unable_to_correlate_profile); +} + +const char *InstrProfCorrelator::FunctionNameAttributeName = "Function Name"; +const char *InstrProfCorrelator::CFGHashAttributeName = "CFG Hash"; +const char *InstrProfCorrelator::NumCountersAttributeName = "Num Counters"; + +llvm::Expected<std::unique_ptr<InstrProfCorrelator::Context>> +InstrProfCorrelator::Context::get(std::unique_ptr<MemoryBuffer> Buffer, + const object::ObjectFile &Obj) { + auto CountersSection = getCountersSection(Obj); + if (auto Err = CountersSection.takeError()) + return std::move(Err); + auto C = std::make_unique<Context>(); + C->Buffer = std::move(Buffer); + C->CountersSectionStart = CountersSection->getAddress(); + C->CountersSectionEnd = C->CountersSectionStart + CountersSection->getSize(); + C->ShouldSwapBytes = Obj.isLittleEndian() != sys::IsLittleEndianHost; + return Expected<std::unique_ptr<Context>>(std::move(C)); +} + +llvm::Expected<std::unique_ptr<InstrProfCorrelator>> +InstrProfCorrelator::get(StringRef DebugInfoFilename) { + auto DsymObjectsOrErr = + object::MachOObjectFile::findDsymObjectMembers(DebugInfoFilename); + if (auto Err = DsymObjectsOrErr.takeError()) + return std::move(Err); + if (!DsymObjectsOrErr->empty()) { + // TODO: Enable profile correlation when there are multiple objects in a + // dSYM bundle. 
+ if (DsymObjectsOrErr->size() > 1) + return createStringError( + std::error_code(), + "Profile correlation using multiple objects is not yet supported"); + DebugInfoFilename = *DsymObjectsOrErr->begin(); + } + auto BufferOrErr = + errorOrToExpected(MemoryBuffer::getFile(DebugInfoFilename)); + if (auto Err = BufferOrErr.takeError()) + return std::move(Err); + + return get(std::move(*BufferOrErr)); +} + +llvm::Expected<std::unique_ptr<InstrProfCorrelator>> +InstrProfCorrelator::get(std::unique_ptr<MemoryBuffer> Buffer) { + auto BinOrErr = object::createBinary(*Buffer); + if (auto Err = BinOrErr.takeError()) + return std::move(Err); + + if (auto *Obj = dyn_cast<object::ObjectFile>(BinOrErr->get())) { + auto CtxOrErr = Context::get(std::move(Buffer), *Obj); + if (auto Err = CtxOrErr.takeError()) + return std::move(Err); + auto T = Obj->makeTriple(); + if (T.isArch64Bit()) + return InstrProfCorrelatorImpl<uint64_t>::get(std::move(*CtxOrErr), *Obj); + if (T.isArch32Bit()) + return InstrProfCorrelatorImpl<uint32_t>::get(std::move(*CtxOrErr), *Obj); + } + return make_error<InstrProfError>( + instrprof_error::unable_to_correlate_profile); +} + +namespace llvm { + +template <> +InstrProfCorrelatorImpl<uint32_t>::InstrProfCorrelatorImpl( + std::unique_ptr<InstrProfCorrelator::Context> Ctx) + : InstrProfCorrelatorImpl(InstrProfCorrelatorKind::CK_32Bit, + std::move(Ctx)) {} +template <> +InstrProfCorrelatorImpl<uint64_t>::InstrProfCorrelatorImpl( + std::unique_ptr<InstrProfCorrelator::Context> Ctx) + : InstrProfCorrelatorImpl(InstrProfCorrelatorKind::CK_64Bit, + std::move(Ctx)) {} +template <> +bool InstrProfCorrelatorImpl<uint32_t>::classof(const InstrProfCorrelator *C) { + return C->getKind() == InstrProfCorrelatorKind::CK_32Bit; +} +template <> +bool InstrProfCorrelatorImpl<uint64_t>::classof(const InstrProfCorrelator *C) { + return C->getKind() == InstrProfCorrelatorKind::CK_64Bit; +} + +} // end namespace llvm + +template <class IntPtrT> 
+llvm::Expected<std::unique_ptr<InstrProfCorrelatorImpl<IntPtrT>>> +InstrProfCorrelatorImpl<IntPtrT>::get( + std::unique_ptr<InstrProfCorrelator::Context> Ctx, + const object::ObjectFile &Obj) { + if (Obj.isELF() || Obj.isMachO()) { + auto DICtx = DWARFContext::create(Obj); + return std::make_unique<DwarfInstrProfCorrelator<IntPtrT>>(std::move(DICtx), + std::move(Ctx)); + } + return make_error<InstrProfError>(instrprof_error::unsupported_debug_format); +} + +template <class IntPtrT> +Error InstrProfCorrelatorImpl<IntPtrT>::correlateProfileData() { + assert(Data.empty() && CompressedNames.empty() && Names.empty()); + correlateProfileDataImpl(); + auto Result = + collectPGOFuncNameStrings(Names, /*doCompression=*/true, CompressedNames); + Names.clear(); + return Result; +} + +template <class IntPtrT> +void InstrProfCorrelatorImpl<IntPtrT>::addProbe(StringRef FunctionName, + uint64_t CFGHash, + IntPtrT CounterOffset, + IntPtrT FunctionPtr, + uint32_t NumCounters) { + Data.push_back({ + maybeSwap<uint64_t>(IndexedInstrProf::ComputeHash(FunctionName)), + maybeSwap<uint64_t>(CFGHash), + // In this mode, CounterPtr actually stores the section relative address + // of the counter. + maybeSwap<IntPtrT>(CounterOffset), + maybeSwap<IntPtrT>(FunctionPtr), + // TODO: Value profiling is not yet supported. 
+ /*ValuesPtr=*/maybeSwap<IntPtrT>(0), + maybeSwap<uint32_t>(NumCounters), + /*NumValueSites=*/{maybeSwap<uint16_t>(0), maybeSwap<uint16_t>(0)}, + }); + Names.push_back(FunctionName.str()); +} + +template <class IntPtrT> +llvm::Optional<uint64_t> +DwarfInstrProfCorrelator<IntPtrT>::getLocation(const DWARFDie &Die) const { + auto Locations = Die.getLocations(dwarf::DW_AT_location); + if (!Locations) { + consumeError(Locations.takeError()); + return {}; + } + auto &DU = *Die.getDwarfUnit(); + for (auto &Location : *Locations) { + auto AddressSize = DU.getAddressByteSize(); + DataExtractor Data(Location.Expr, DICtx->isLittleEndian(), AddressSize); + DWARFExpression Expr(Data, AddressSize); + for (auto &Op : Expr) + if (Op.getCode() == dwarf::DW_OP_addr) + return Op.getRawOperand(0); + } + return {}; +} + +template <class IntPtrT> +bool DwarfInstrProfCorrelator<IntPtrT>::isDIEOfProbe(const DWARFDie &Die) { + const auto &ParentDie = Die.getParent(); + if (!Die.isValid() || !ParentDie.isValid() || Die.isNULL()) + return false; + if (Die.getTag() != dwarf::DW_TAG_variable) + return false; + if (!ParentDie.isSubprogramDIE()) + return false; + if (!Die.hasChildren()) + return false; + if (const char *Name = Die.getName(DINameKind::ShortName)) + return StringRef(Name).startswith(getInstrProfCountersVarPrefix()); + return false; +} + +template <class IntPtrT> +void DwarfInstrProfCorrelator<IntPtrT>::correlateProfileDataImpl() { + auto maybeAddProbe = [&](DWARFDie Die) { + if (!isDIEOfProbe(Die)) + return; + Optional<const char *> FunctionName; + Optional<uint64_t> CFGHash; + Optional<uint64_t> CounterPtr = getLocation(Die); + auto FunctionPtr = + dwarf::toAddress(Die.getParent().find(dwarf::DW_AT_low_pc)); + Optional<uint64_t> NumCounters; + for (const DWARFDie &Child : Die.children()) { + if (Child.getTag() != dwarf::DW_TAG_LLVM_annotation) + continue; + auto AnnotationFormName = Child.find(dwarf::DW_AT_name); + auto AnnotationFormValue = 
Child.find(dwarf::DW_AT_const_value); + if (!AnnotationFormName || !AnnotationFormValue) + continue; + auto AnnotationNameOrErr = AnnotationFormName->getAsCString(); + if (auto Err = AnnotationNameOrErr.takeError()) { + consumeError(std::move(Err)); + continue; + } + StringRef AnnotationName = *AnnotationNameOrErr; + if (AnnotationName.compare( + InstrProfCorrelator::FunctionNameAttributeName) == 0) { + if (auto EC = + AnnotationFormValue->getAsCString().moveInto(FunctionName)) + consumeError(std::move(EC)); + } else if (AnnotationName.compare( + InstrProfCorrelator::CFGHashAttributeName) == 0) { + CFGHash = AnnotationFormValue->getAsUnsignedConstant(); + } else if (AnnotationName.compare( + InstrProfCorrelator::NumCountersAttributeName) == 0) { + NumCounters = AnnotationFormValue->getAsUnsignedConstant(); + } + } + if (!FunctionName || !CFGHash || !CounterPtr || !NumCounters) { + LLVM_DEBUG(dbgs() << "Incomplete DIE for probe\n\tFunctionName: " + << FunctionName << "\n\tCFGHash: " << CFGHash + << "\n\tCounterPtr: " << CounterPtr + << "\n\tNumCounters: " << NumCounters); + LLVM_DEBUG(Die.dump(dbgs())); + return; + } + uint64_t CountersStart = this->Ctx->CountersSectionStart; + uint64_t CountersEnd = this->Ctx->CountersSectionEnd; + if (*CounterPtr < CountersStart || *CounterPtr >= CountersEnd) { + LLVM_DEBUG( + dbgs() << "CounterPtr out of range for probe\n\tFunction Name: " + << FunctionName << "\n\tExpected: [0x" + << Twine::utohexstr(CountersStart) << ", 0x" + << Twine::utohexstr(CountersEnd) << ")\n\tActual: 0x" + << Twine::utohexstr(*CounterPtr)); + LLVM_DEBUG(Die.dump(dbgs())); + return; + } + if (!FunctionPtr) { + LLVM_DEBUG(dbgs() << "Could not find address of " << *FunctionName + << "\n"); + LLVM_DEBUG(Die.dump(dbgs())); + } + this->addProbe(*FunctionName, *CFGHash, *CounterPtr - CountersStart, + FunctionPtr.getValueOr(0), *NumCounters); + }; + for (auto &CU : DICtx->normal_units()) + for (const auto &Entry : CU->dies()) + maybeAddProbe(DWARFDie(CU.get(), 
&Entry)); + for (auto &CU : DICtx->dwo_units()) + for (const auto &Entry : CU->dies()) + maybeAddProbe(DWARFDie(CU.get(), &Entry)); +} diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp index 885c1fe49240..37cdf4dd1fe2 100644 --- a/llvm/lib/ProfileData/InstrProfReader.cpp +++ b/llvm/lib/ProfileData/InstrProfReader.cpp @@ -52,16 +52,19 @@ static Error initializeReader(InstrProfReader &Reader) { } Expected<std::unique_ptr<InstrProfReader>> -InstrProfReader::create(const Twine &Path) { +InstrProfReader::create(const Twine &Path, + const InstrProfCorrelator *Correlator) { // Set up the buffer to read. auto BufferOrError = setupMemoryBuffer(Path); if (Error E = BufferOrError.takeError()) return std::move(E); - return InstrProfReader::create(std::move(BufferOrError.get())); + return InstrProfReader::create(std::move(BufferOrError.get()), Correlator); } Expected<std::unique_ptr<InstrProfReader>> -InstrProfReader::create(std::unique_ptr<MemoryBuffer> Buffer) { +InstrProfReader::create(std::unique_ptr<MemoryBuffer> Buffer, + const InstrProfCorrelator *Correlator) { + // Sanity check the buffer. 
if (uint64_t(Buffer->getBufferSize()) > std::numeric_limits<uint64_t>::max()) return make_error<InstrProfError>(instrprof_error::too_large); @@ -73,9 +76,9 @@ InstrProfReader::create(std::unique_ptr<MemoryBuffer> Buffer) { if (IndexedInstrProfReader::hasFormat(*Buffer)) Result.reset(new IndexedInstrProfReader(std::move(Buffer))); else if (RawInstrProfReader64::hasFormat(*Buffer)) - Result.reset(new RawInstrProfReader64(std::move(Buffer))); + Result.reset(new RawInstrProfReader64(std::move(Buffer), Correlator)); else if (RawInstrProfReader32::hasFormat(*Buffer)) - Result.reset(new RawInstrProfReader32(std::move(Buffer))); + Result.reset(new RawInstrProfReader32(std::move(Buffer), Correlator)); else if (TextInstrProfReader::hasFormat(*Buffer)) Result.reset(new TextInstrProfReader(std::move(Buffer))); else @@ -352,7 +355,7 @@ Error RawInstrProfReader<IntPtrT>::readNextHeader(const char *CurrentPos) { template <class IntPtrT> Error RawInstrProfReader<IntPtrT>::createSymtab(InstrProfSymtab &Symtab) { - if (Error E = Symtab.create(StringRef(NamesStart, NamesSize))) + if (Error E = Symtab.create(StringRef(NamesStart, NamesEnd - NamesStart))) return error(std::move(E)); for (const RawInstrProf::ProfileData<IntPtrT> *I = Data; I != DataEnd; ++I) { const IntPtrT FPtr = swap(I->FunctionPointer); @@ -369,6 +372,10 @@ Error RawInstrProfReader<IntPtrT>::readHeader( Version = swap(Header.Version); if (GET_VERSION(Version) != RawInstrProf::Version) return error(instrprof_error::unsupported_version); + if (useDebugInfoCorrelate() && !Correlator) + return error(instrprof_error::missing_debug_info_for_correlation); + if (!useDebugInfoCorrelate() && Correlator) + return error(instrprof_error::unexpected_debug_info_for_correlation); BinaryIdsSize = swap(Header.BinaryIdsSize); if (BinaryIdsSize % sizeof(uint64_t)) @@ -380,7 +387,7 @@ Error RawInstrProfReader<IntPtrT>::readHeader( auto PaddingBytesBeforeCounters = swap(Header.PaddingBytesBeforeCounters); auto CountersSize = 
swap(Header.CountersSize); auto PaddingBytesAfterCounters = swap(Header.PaddingBytesAfterCounters); - NamesSize = swap(Header.NamesSize); + auto NamesSize = swap(Header.NamesSize); ValueKindLast = swap(Header.ValueKindLast); auto DataSizeInBytes = DataSize * sizeof(RawInstrProf::ProfileData<IntPtrT>); @@ -398,15 +405,27 @@ Error RawInstrProfReader<IntPtrT>::readHeader( if (Start + ValueDataOffset > DataBuffer->getBufferEnd()) return error(instrprof_error::bad_header); - Data = reinterpret_cast<const RawInstrProf::ProfileData<IntPtrT> *>( - Start + DataOffset); - DataEnd = Data + DataSize; + if (Correlator) { + // These sizes in the raw file are zero because we constructed them in the + // Correlator. + assert(DataSize == 0 && NamesSize == 0); + assert(CountersDelta == 0 && NamesDelta == 0); + Data = Correlator->getDataPointer(); + DataEnd = Data + Correlator->getDataSize(); + NamesStart = Correlator->getCompressedNamesPointer(); + NamesEnd = NamesStart + Correlator->getCompressedNamesSize(); + } else { + Data = reinterpret_cast<const RawInstrProf::ProfileData<IntPtrT> *>( + Start + DataOffset); + DataEnd = Data + DataSize; + NamesStart = Start + NamesOffset; + NamesEnd = NamesStart + NamesSize; + } // Binary ids start just after the header. BinaryIdsStart = reinterpret_cast<const uint8_t *>(&Header) + sizeof(RawInstrProf::Header); CountersStart = reinterpret_cast<const uint64_t *>(Start + CountersOffset); - NamesStart = Start + NamesOffset; ValueDataStart = reinterpret_cast<const uint8_t *>(Start + ValueDataOffset); const uint8_t *BufferEnd = (const uint8_t *)DataBuffer->getBufferEnd(); @@ -440,45 +459,50 @@ Error RawInstrProfReader<IntPtrT>::readRawCounts( if (NumCounters == 0) return error(instrprof_error::malformed, "number of counters is zero"); - IntPtrT CounterPtr = Data->CounterPtr; - auto *NamesStartAsCounter = reinterpret_cast<const uint64_t *>(NamesStart); - ptrdiff_t MaxNumCounters = NamesStartAsCounter - CountersStart; - - // Check bounds. 
Note that the counter pointer embedded in the data record - // may itself be corrupt. - if (MaxNumCounters < 0 || NumCounters > (uint32_t)MaxNumCounters) - return error(instrprof_error::malformed, - "counter pointer is out of bounds"); - - // We need to compute the in-buffer counter offset from the in-memory address - // distance. The initial CountersDelta is the in-memory address difference - // start(__llvm_prf_cnts)-start(__llvm_prf_data), so SrcData->CounterPtr - - // CountersDelta computes the offset into the in-buffer counter section. - // - // CountersDelta decreases as we advance to the next data record. - ptrdiff_t CounterOffset = getCounterOffset(CounterPtr); - CountersDelta -= sizeof(*Data); - if (CounterOffset < 0) - return error( - instrprof_error::malformed, - ("counter offset " + Twine(CounterOffset) + " is negative").str()); + ArrayRef<uint64_t> RawCounts; + if (Correlator) { + uint64_t CounterOffset = swap<IntPtrT>(Data->CounterPtr) / sizeof(uint64_t); + RawCounts = + makeArrayRef<uint64_t>(CountersStart + CounterOffset, NumCounters); + } else { + IntPtrT CounterPtr = Data->CounterPtr; + ptrdiff_t CounterOffset = getCounterOffset(CounterPtr); + if (CounterOffset < 0) + return error( + instrprof_error::malformed, + ("counter offset " + Twine(CounterOffset) + " is negative").str()); - if (CounterOffset > MaxNumCounters) - return error(instrprof_error::malformed, - ("counter offset " + Twine(CounterOffset) + - " is greater than the maximum number of counters " + - Twine((uint32_t)MaxNumCounters)) - .str()); + // Check bounds. Note that the counter pointer embedded in the data record + // may itself be corrupt. 
+ auto *NamesStartAsCounter = reinterpret_cast<const uint64_t *>(NamesStart); + ptrdiff_t MaxNumCounters = NamesStartAsCounter - CountersStart; + if (MaxNumCounters < 0 || NumCounters > (uint32_t)MaxNumCounters) + return error(instrprof_error::malformed, + "counter pointer is out of bounds"); + // We need to compute the in-buffer counter offset from the in-memory + // address distance. The initial CountersDelta is the in-memory address + // difference start(__llvm_prf_cnts)-start(__llvm_prf_data), so + // SrcData->CounterPtr - CountersDelta computes the offset into the + // in-buffer counter section. + if (CounterOffset > MaxNumCounters) + return error(instrprof_error::malformed, + ("counter offset " + Twine(CounterOffset) + + " is greater than the maximum number of counters " + + Twine((uint32_t)MaxNumCounters)) + .str()); - if (((uint32_t)CounterOffset + NumCounters) > (uint32_t)MaxNumCounters) - return error(instrprof_error::malformed, - ("number of counters " + - Twine(((uint32_t)CounterOffset + NumCounters)) + - " is greater than the maximum number of counters " + - Twine((uint32_t)MaxNumCounters)) - .str()); + if (((uint32_t)CounterOffset + NumCounters) > (uint32_t)MaxNumCounters) + return error(instrprof_error::malformed, + ("number of counters " + + Twine(((uint32_t)CounterOffset + NumCounters)) + + " is greater than the maximum number of counters " + + Twine((uint32_t)MaxNumCounters)) + .str()); + // CountersDelta decreases as we advance to the next data record. + CountersDelta -= sizeof(*Data); - auto RawCounts = makeArrayRef(getCounter(CounterOffset), NumCounters); + RawCounts = makeArrayRef(getCounter(CounterOffset), NumCounters); + } if (ShouldSwapBytes) { Record.Counts.clear(); @@ -977,11 +1001,10 @@ IndexedInstrProfReader::getInstrProfRecord(StringRef FuncName, if (Err) return std::move(Err); // Found it. Look for counters with the right hash. 
- for (unsigned I = 0, E = Data.size(); I < E; ++I) { + for (const NamedInstrProfRecord &I : Data) { // Check for a match and fill the vector if there is one. - if (Data[I].Hash == FuncHash) { - return std::move(Data[I]); - } + if (I.Hash == FuncHash) + return std::move(I); } return error(instrprof_error::hash_mismatch); } diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp index 492e3541cb5a..6628eea80640 100644 --- a/llvm/lib/ProfileData/InstrProfWriter.cpp +++ b/llvm/lib/ProfileData/InstrProfWriter.cpp @@ -32,6 +32,7 @@ #include <vector> using namespace llvm; +extern cl::opt<bool> DebugInfoCorrelate; // A struct to define how the data stream should be patched. For Indexed // profiling, only uint64_t data type is needed. diff --git a/llvm/lib/ProfileData/ProfileSummaryBuilder.cpp b/llvm/lib/ProfileData/ProfileSummaryBuilder.cpp index f54df7b295e3..bbb640cfaee8 100644 --- a/llvm/lib/ProfileData/ProfileSummaryBuilder.cpp +++ b/llvm/lib/ProfileData/ProfileSummaryBuilder.cpp @@ -194,7 +194,7 @@ SampleProfileSummaryBuilder::computeSummaryForProfiles( // more function profiles each with lower counts, which in turn leads to lower // hot thresholds. To compensate for that, by default we merge context // profiles before computing profile summary. - if (UseContextLessSummary || (sampleprof::FunctionSamples::ProfileIsCS && + if (UseContextLessSummary || (sampleprof::FunctionSamples::ProfileIsCSFlat && !UseContextLessSummary.getNumOccurrences())) { for (const auto &I : Profiles) { ContextLessProfiles[I.second.getName()].merge(I.second); diff --git a/llvm/lib/ProfileData/SampleProf.cpp b/llvm/lib/ProfileData/SampleProf.cpp index fd8fd3b675b7..9b01a386a360 100644 --- a/llvm/lib/ProfileData/SampleProf.cpp +++ b/llvm/lib/ProfileData/SampleProf.cpp @@ -35,11 +35,18 @@ static cl::opt<uint64_t> ProfileSymbolListCutOff( cl::desc("Cutoff value about how many symbols in profile symbol list " "will be used. 
This is very useful for performance debugging")); +cl::opt<bool> GenerateMergedBaseProfiles( + "generate-merged-base-profiles", cl::init(true), cl::ZeroOrMore, + cl::desc("When generating nested context-sensitive profiles, always " + "generate extra base profile for function with all its context " + "profiles merged into it.")); + namespace llvm { namespace sampleprof { SampleProfileFormat FunctionSamples::Format; bool FunctionSamples::ProfileIsProbeBased = false; -bool FunctionSamples::ProfileIsCS = false; +bool FunctionSamples::ProfileIsCSFlat = false; +bool FunctionSamples::ProfileIsCSNested = false; bool FunctionSamples::UseMD5 = false; bool FunctionSamples::HasUniqSuffix = true; bool FunctionSamples::ProfileIsFS = false; @@ -218,8 +225,9 @@ unsigned FunctionSamples::getOffset(const DILocation *DIL) { 0xffff; } -LineLocation FunctionSamples::getCallSiteIdentifier(const DILocation *DIL) { - if (FunctionSamples::ProfileIsProbeBased) +LineLocation FunctionSamples::getCallSiteIdentifier(const DILocation *DIL, + bool ProfileIsFS) { + if (FunctionSamples::ProfileIsProbeBased) { // In a pseudo-probe based profile, a callsite is simply represented by the // ID of the probe associated with the call instruction. The probe ID is // encoded in the Discriminator field of the call instruction's debug @@ -227,9 +235,19 @@ LineLocation FunctionSamples::getCallSiteIdentifier(const DILocation *DIL) { return LineLocation(PseudoProbeDwarfDiscriminator::extractProbeIndex( DIL->getDiscriminator()), 0); - else - return LineLocation(FunctionSamples::getOffset(DIL), - DIL->getBaseDiscriminator()); + } else { + unsigned Discriminator = + ProfileIsFS ? 
DIL->getDiscriminator() : DIL->getBaseDiscriminator(); + return LineLocation(FunctionSamples::getOffset(DIL), Discriminator); + } +} + +uint64_t FunctionSamples::getCallSiteHash(StringRef CalleeName, + const LineLocation &Callsite) { + uint64_t NameHash = std::hash<std::string>{}(CalleeName.str()); + uint64_t LocId = + (((uint64_t)Callsite.LineOffset) << 32) | Callsite.Discriminator; + return NameHash + (LocId << 5) + LocId; } const FunctionSamples *FunctionSamples::findFunctionSamples( @@ -239,21 +257,16 @@ const FunctionSamples *FunctionSamples::findFunctionSamples( const DILocation *PrevDIL = DIL; for (DIL = DIL->getInlinedAt(); DIL; DIL = DIL->getInlinedAt()) { - unsigned Discriminator; - if (ProfileIsFS) - Discriminator = DIL->getDiscriminator(); - else - Discriminator = DIL->getBaseDiscriminator(); - // Use C++ linkage name if possible. StringRef Name = PrevDIL->getScope()->getSubprogram()->getLinkageName(); if (Name.empty()) Name = PrevDIL->getScope()->getSubprogram()->getName(); - - S.push_back( - std::make_pair(LineLocation(getOffset(DIL), Discriminator), Name)); + S.emplace_back(FunctionSamples::getCallSiteIdentifier( + DIL, FunctionSamples::ProfileIsFS), + Name); PrevDIL = DIL; } + if (S.size() == 0) return this; const FunctionSamples *FS = this; @@ -454,3 +467,81 @@ void ProfileSymbolList::dump(raw_ostream &OS) const { for (auto &Sym : SortedList) OS << Sym << "\n"; } + +CSProfileConverter::FrameNode * +CSProfileConverter::FrameNode::getOrCreateChildFrame( + const LineLocation &CallSite, StringRef CalleeName) { + uint64_t Hash = FunctionSamples::getCallSiteHash(CalleeName, CallSite); + auto It = AllChildFrames.find(Hash); + if (It != AllChildFrames.end()) { + assert(It->second.FuncName == CalleeName && + "Hash collision for child context node"); + return &It->second; + } + + AllChildFrames[Hash] = FrameNode(CalleeName, nullptr, CallSite); + return &AllChildFrames[Hash]; +} + +CSProfileConverter::CSProfileConverter(SampleProfileMap &Profiles) + : 
ProfileMap(Profiles) { + for (auto &FuncSample : Profiles) { + FunctionSamples *FSamples = &FuncSample.second; + auto *NewNode = getOrCreateContextPath(FSamples->getContext()); + assert(!NewNode->FuncSamples && "New node cannot have sample profile"); + NewNode->FuncSamples = FSamples; + } +} + +CSProfileConverter::FrameNode * +CSProfileConverter::getOrCreateContextPath(const SampleContext &Context) { + auto Node = &RootFrame; + LineLocation CallSiteLoc(0, 0); + for (auto &Callsite : Context.getContextFrames()) { + Node = Node->getOrCreateChildFrame(CallSiteLoc, Callsite.FuncName); + CallSiteLoc = Callsite.Location; + } + return Node; +} + +void CSProfileConverter::convertProfiles(CSProfileConverter::FrameNode &Node) { + // Process each child profile. Add each child profile to callsite profile map + // of the current node `Node` if `Node` comes with a profile. Otherwise + // promote the child profile to a standalone profile. + auto *NodeProfile = Node.FuncSamples; + for (auto &It : Node.AllChildFrames) { + auto &ChildNode = It.second; + convertProfiles(ChildNode); + auto *ChildProfile = ChildNode.FuncSamples; + if (!ChildProfile) + continue; + SampleContext OrigChildContext = ChildProfile->getContext(); + // Reset the child context to be contextless. + ChildProfile->getContext().setName(OrigChildContext.getName()); + if (NodeProfile) { + // Add child profile to the callsite profile map. + auto &SamplesMap = NodeProfile->functionSamplesAt(ChildNode.CallSiteLoc); + SamplesMap.emplace(OrigChildContext.getName().str(), *ChildProfile); + NodeProfile->addTotalSamples(ChildProfile->getTotalSamples()); + } + + // Separate child profile to be a standalone profile, if the current parent + // profile doesn't exist. This is a duplicating operation when the child + // profile is already incorporated into the parent which is still useful and + // thus done optionally. 
It is seen that duplicating context profiles into + // base profiles improves the code quality for thinlto build by allowing a + // profile in the prelink phase for to-be-fully-inlined functions. + if (!NodeProfile || GenerateMergedBaseProfiles) + ProfileMap[ChildProfile->getContext()].merge(*ChildProfile); + + // Contexts coming with a `ContextShouldBeInlined` attribute indicate this + // is a preinliner-computed profile. + if (OrigChildContext.hasAttribute(ContextShouldBeInlined)) + FunctionSamples::ProfileIsCSNested = true; + + // Remove the original child profile. + ProfileMap.erase(OrigChildContext); + } +} + +void CSProfileConverter::convertProfiles() { convertProfiles(RootFrame); } diff --git a/llvm/lib/ProfileData/SampleProfReader.cpp b/llvm/lib/ProfileData/SampleProfReader.cpp index eefb7c2ba627..da16309fb82c 100644 --- a/llvm/lib/ProfileData/SampleProfReader.cpp +++ b/llvm/lib/ProfileData/SampleProfReader.cpp @@ -146,7 +146,7 @@ static bool ParseLine(const StringRef &Input, LineType &LineTy, uint32_t &Depth, if (Depth == 0) return false; - if (Depth == 1 && Input[Depth] == '!') { + if (Input[Depth] == '!') { LineTy = LineType::Metadata; return parseMetadata(Input.substr(Depth), FunctionHash, Attributes); } @@ -244,11 +244,11 @@ std::error_code SampleProfileReaderText::readImpl() { sampleprof_error Result = sampleprof_error::success; InlineCallStack InlineStack; - uint32_t ProbeProfileCount = 0; + uint32_t TopLevelProbeProfileCount = 0; - // SeenMetadata tracks whether we have processed metadata for the current - // top-level function profile. - bool SeenMetadata = false; + // DepthMetadata tracks whether we have processed metadata for the current + // top-level or nested function profile. 
+ uint32_t DepthMetadata = 0; ProfileIsFS = ProfileIsFSDisciminator; FunctionSamples::ProfileIsFS = ProfileIsFS; @@ -275,7 +275,7 @@ std::error_code SampleProfileReaderText::readImpl() { "Expected 'mangled_name:NUM:NUM', found " + *LineIt); return sampleprof_error::malformed; } - SeenMetadata = false; + DepthMetadata = 0; SampleContext FContext(FName, CSNameTable); if (FContext.hasContext()) ++CSProfileCount; @@ -302,7 +302,7 @@ std::error_code SampleProfileReaderText::readImpl() { *LineIt); return sampleprof_error::malformed; } - if (SeenMetadata && LineTy != LineType::Metadata) { + if (LineTy != LineType::Metadata && Depth == DepthMetadata) { // Metadata must be put at the end of a function profile. reportError(LineIt.line_number(), "Found non-metadata after metadata: " + *LineIt); @@ -322,6 +322,7 @@ std::error_code SampleProfileReaderText::readImpl() { FSamples.setName(FName); MergeResult(Result, FSamples.addTotalSamples(NumSamples)); InlineStack.push_back(&FSamples); + DepthMetadata = 0; break; } case LineType::BodyProfile: { @@ -342,11 +343,13 @@ std::error_code SampleProfileReaderText::readImpl() { FunctionSamples &FProfile = *InlineStack.back(); if (FunctionHash) { FProfile.setFunctionHash(FunctionHash); - ++ProbeProfileCount; + if (Depth == 1) + ++TopLevelProbeProfileCount; } - if (Attributes) - FProfile.getContext().setAllAttributes(Attributes); - SeenMetadata = true; + FProfile.getContext().setAllAttributes(Attributes); + if (Attributes & (uint32_t)ContextShouldBeInlined) + ProfileIsCSNested = true; + DepthMetadata = Depth; break; } } @@ -355,12 +358,14 @@ std::error_code SampleProfileReaderText::readImpl() { assert((CSProfileCount == 0 || CSProfileCount == Profiles.size()) && "Cannot have both context-sensitive and regular profile"); - ProfileIsCS = (CSProfileCount > 0); - assert((ProbeProfileCount == 0 || ProbeProfileCount == Profiles.size()) && + ProfileIsCSFlat = (CSProfileCount > 0); + assert((TopLevelProbeProfileCount == 0 || + 
TopLevelProbeProfileCount == Profiles.size()) && "Cannot have both probe-based profiles and regular profiles"); - ProfileIsProbeBased = (ProbeProfileCount > 0); + ProfileIsProbeBased = (TopLevelProbeProfileCount > 0); FunctionSamples::ProfileIsProbeBased = ProfileIsProbeBased; - FunctionSamples::ProfileIsCS = ProfileIsCS; + FunctionSamples::ProfileIsCSFlat = ProfileIsCSFlat; + FunctionSamples::ProfileIsCSNested = ProfileIsCSNested; if (Result == sampleprof_error::success) computeSummary(); @@ -625,7 +630,7 @@ SampleProfileReaderExtBinaryBase::readContextFromTable() { ErrorOr<SampleContext> SampleProfileReaderExtBinaryBase::readSampleContextFromTable() { - if (ProfileIsCS) { + if (ProfileIsCSFlat) { auto FContext(readContextFromTable()); if (std::error_code EC = FContext.getError()) return EC; @@ -649,7 +654,7 @@ std::error_code SampleProfileReaderExtBinaryBase::readOneSection( if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagPartial)) Summary->setPartialProfile(true); if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagFullContext)) - FunctionSamples::ProfileIsCS = ProfileIsCS = true; + FunctionSamples::ProfileIsCSFlat = ProfileIsCSFlat = true; if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagFSDiscriminator)) FunctionSamples::ProfileIsFS = ProfileIsFS = true; break; @@ -683,6 +688,9 @@ std::error_code SampleProfileReaderExtBinaryBase::readOneSection( ProfileIsProbeBased = hasSecFlag(Entry, SecFuncMetadataFlags::SecFlagIsProbeBased); FunctionSamples::ProfileIsProbeBased = ProfileIsProbeBased; + ProfileIsCSNested = + hasSecFlag(Entry, SecFuncMetadataFlags::SecFlagIsCSNested); + FunctionSamples::ProfileIsCSNested = ProfileIsCSNested; bool HasAttribute = hasSecFlag(Entry, SecFuncMetadataFlags::SecFlagHasAttribute); if (std::error_code EC = readFuncMetadata(HasAttribute)) @@ -770,7 +778,7 @@ std::error_code SampleProfileReaderExtBinaryBase::readFuncProfiles() { } } - if (ProfileIsCS) { + if (ProfileIsCSFlat) { DenseSet<uint64_t> FuncGuidsToUse; if (useMD5()) { for 
(auto Name : FuncsToUse) @@ -840,7 +848,7 @@ std::error_code SampleProfileReaderExtBinaryBase::readFuncProfiles() { } assert((CSProfileCount == 0 || CSProfileCount == Profiles.size()) && "Cannot have both context-sensitive and regular profile"); - assert((!CSProfileCount || ProfileIsCS) && + assert((!CSProfileCount || ProfileIsCSFlat) && "Section flag should be consistent with actual profile"); return sampleprof_error::success; } @@ -1078,30 +1086,77 @@ std::error_code SampleProfileReaderExtBinaryBase::readCSNameTableSec() { } std::error_code -SampleProfileReaderExtBinaryBase::readFuncMetadata(bool ProfileHasAttribute) { - while (Data < End) { - auto FContext(readSampleContextFromTable()); - if (std::error_code EC = FContext.getError()) - return EC; - bool ProfileInMap = Profiles.count(*FContext); +SampleProfileReaderExtBinaryBase::readFuncMetadata(bool ProfileHasAttribute, + FunctionSamples *FProfile) { + if (Data < End) { if (ProfileIsProbeBased) { auto Checksum = readNumber<uint64_t>(); if (std::error_code EC = Checksum.getError()) return EC; - if (ProfileInMap) - Profiles[*FContext].setFunctionHash(*Checksum); + if (FProfile) + FProfile->setFunctionHash(*Checksum); } if (ProfileHasAttribute) { auto Attributes = readNumber<uint32_t>(); if (std::error_code EC = Attributes.getError()) return EC; - if (ProfileInMap) - Profiles[*FContext].getContext().setAllAttributes(*Attributes); + if (FProfile) + FProfile->getContext().setAllAttributes(*Attributes); + } + + if (!ProfileIsCSFlat) { + // Read all the attributes for inlined function calls. 
+ auto NumCallsites = readNumber<uint32_t>(); + if (std::error_code EC = NumCallsites.getError()) + return EC; + + for (uint32_t J = 0; J < *NumCallsites; ++J) { + auto LineOffset = readNumber<uint64_t>(); + if (std::error_code EC = LineOffset.getError()) + return EC; + + auto Discriminator = readNumber<uint64_t>(); + if (std::error_code EC = Discriminator.getError()) + return EC; + + auto FContext(readSampleContextFromTable()); + if (std::error_code EC = FContext.getError()) + return EC; + + FunctionSamples *CalleeProfile = nullptr; + if (FProfile) { + CalleeProfile = const_cast<FunctionSamples *>( + &FProfile->functionSamplesAt(LineLocation( + *LineOffset, + *Discriminator))[std::string(FContext.get().getName())]); + } + if (std::error_code EC = + readFuncMetadata(ProfileHasAttribute, CalleeProfile)) + return EC; + } } } + return sampleprof_error::success; +} + +std::error_code +SampleProfileReaderExtBinaryBase::readFuncMetadata(bool ProfileHasAttribute) { + while (Data < End) { + auto FContext(readSampleContextFromTable()); + if (std::error_code EC = FContext.getError()) + return EC; + FunctionSamples *FProfile = nullptr; + auto It = Profiles.find(*FContext); + if (It != Profiles.end()) + FProfile = &It->second; + + if (std::error_code EC = readFuncMetadata(ProfileHasAttribute, FProfile)) + return EC; + } + assert(Data == End && "More data is read than expected"); return sampleprof_error::success; } @@ -1233,6 +1288,8 @@ static std::string getSecFlagsStr(const SecHdrTableEntry &Entry) { Flags.append("probe,"); if (hasSecFlag(Entry, SecFuncMetadataFlags::SecFlagHasAttribute)) Flags.append("attr,"); + if (hasSecFlag(Entry, SecFuncMetadataFlags::SecFlagIsCSNested)) + Flags.append("preinlined,"); break; default: break; diff --git a/llvm/lib/ProfileData/SampleProfWriter.cpp b/llvm/lib/ProfileData/SampleProfWriter.cpp index 78006aab1541..6f02bd203a9f 100644 --- a/llvm/lib/ProfileData/SampleProfWriter.cpp +++ b/llvm/lib/ProfileData/SampleProfWriter.cpp @@ -172,7 +172,7 
@@ std::error_code SampleProfileWriterExtBinaryBase::writeFuncOffsetTable() { return (std::error_code)sampleprof_error::success; }; - if (FunctionSamples::ProfileIsCS) { + if (FunctionSamples::ProfileIsCSFlat) { // Sort the contexts before writing them out. This is to help fast load all // context profiles for a function as well as their callee contexts which // can help profile-guided importing for ThinLTO. @@ -195,17 +195,45 @@ std::error_code SampleProfileWriterExtBinaryBase::writeFuncOffsetTable() { } std::error_code SampleProfileWriterExtBinaryBase::writeFuncMetadata( + const FunctionSamples &FunctionProfile) { + auto &OS = *OutputStream; + if (std::error_code EC = writeContextIdx(FunctionProfile.getContext())) + return EC; + + if (FunctionSamples::ProfileIsProbeBased) + encodeULEB128(FunctionProfile.getFunctionHash(), OS); + if (FunctionSamples::ProfileIsCSFlat || FunctionSamples::ProfileIsCSNested) { + encodeULEB128(FunctionProfile.getContext().getAllAttributes(), OS); + } + + if (!FunctionSamples::ProfileIsCSFlat) { + // Recursively emit attributes for all callee samples. 
+ uint64_t NumCallsites = 0; + for (const auto &J : FunctionProfile.getCallsiteSamples()) + NumCallsites += J.second.size(); + encodeULEB128(NumCallsites, OS); + for (const auto &J : FunctionProfile.getCallsiteSamples()) { + for (const auto &FS : J.second) { + LineLocation Loc = J.first; + encodeULEB128(Loc.LineOffset, OS); + encodeULEB128(Loc.Discriminator, OS); + if (std::error_code EC = writeFuncMetadata(FS.second)) + return EC; + } + } + } + + return sampleprof_error::success; +} + +std::error_code SampleProfileWriterExtBinaryBase::writeFuncMetadata( const SampleProfileMap &Profiles) { - if (!FunctionSamples::ProfileIsProbeBased && !FunctionSamples::ProfileIsCS) + if (!FunctionSamples::ProfileIsProbeBased && + !FunctionSamples::ProfileIsCSFlat && !FunctionSamples::ProfileIsCSNested) return sampleprof_error::success; - auto &OS = *OutputStream; for (const auto &Entry : Profiles) { - if (std::error_code EC = writeContextIdx(Entry.second.getContext())) + if (std::error_code EC = writeFuncMetadata(Entry.second)) return EC; - if (FunctionSamples::ProfileIsProbeBased) - encodeULEB128(Entry.second.getFunctionHash(), OS); - if (FunctionSamples::ProfileIsCS) - encodeULEB128(Entry.second.getContext().getAllAttributes(), OS); } return sampleprof_error::success; } @@ -295,10 +323,13 @@ std::error_code SampleProfileWriterExtBinaryBase::writeOneSection( setToCompressSection(SecProfileSymbolList); if (Type == SecFuncMetadata && FunctionSamples::ProfileIsProbeBased) addSectionFlag(SecFuncMetadata, SecFuncMetadataFlags::SecFlagIsProbeBased); - if (Type == SecProfSummary && FunctionSamples::ProfileIsCS) - addSectionFlag(SecProfSummary, SecProfSummaryFlags::SecFlagFullContext); - if (Type == SecFuncMetadata && FunctionSamples::ProfileIsCS) + if (Type == SecFuncMetadata && FunctionSamples::ProfileIsCSNested) + addSectionFlag(SecFuncMetadata, SecFuncMetadataFlags::SecFlagIsCSNested); + if (Type == SecFuncMetadata && + (FunctionSamples::ProfileIsCSFlat || 
FunctionSamples::ProfileIsCSNested)) addSectionFlag(SecFuncMetadata, SecFuncMetadataFlags::SecFlagHasAttribute); + if (Type == SecProfSummary && FunctionSamples::ProfileIsCSFlat) + addSectionFlag(SecProfSummary, SecProfSummaryFlags::SecFlagFullContext); if (Type == SecProfSummary && FunctionSamples::ProfileIsFS) addSectionFlag(SecProfSummary, SecProfSummaryFlags::SecFlagFSDiscriminator); @@ -440,7 +471,7 @@ SampleProfileWriterCompactBinary::write(const SampleProfileMap &ProfileMap) { /// it needs to be parsed by the SampleProfileReaderText class. std::error_code SampleProfileWriterText::writeSample(const FunctionSamples &S) { auto &OS = *OutputStream; - if (FunctionSamples::ProfileIsCS) + if (FunctionSamples::ProfileIsCSFlat) OS << "[" << S.getContext().toString() << "]:" << S.getTotalSamples(); else OS << S.getName() << ":" << S.getTotalSamples(); @@ -483,15 +514,14 @@ std::error_code SampleProfileWriterText::writeSample(const FunctionSamples &S) { } Indent -= 1; - if (Indent == 0) { - if (FunctionSamples::ProfileIsProbeBased) { - OS.indent(Indent + 1); - OS << "!CFGChecksum: " << S.getFunctionHash() << "\n"; - } - if (FunctionSamples::ProfileIsCS) { - OS.indent(Indent + 1); - OS << "!Attributes: " << S.getContext().getAllAttributes() << "\n"; - } + if (FunctionSamples::ProfileIsProbeBased) { + OS.indent(Indent + 1); + OS << "!CFGChecksum: " << S.getFunctionHash() << "\n"; + } + + if (S.getContext().getAllAttributes()) { + OS.indent(Indent + 1); + OS << "!Attributes: " << S.getContext().getAllAttributes() << "\n"; } return sampleprof_error::success; @@ -841,7 +871,8 @@ SampleProfileWriter::create(std::unique_ptr<raw_ostream> &OS, std::unique_ptr<SampleProfileWriter> Writer; // Currently only Text and Extended Binary format are supported for CSSPGO. 
- if ((FunctionSamples::ProfileIsCS || FunctionSamples::ProfileIsProbeBased) && + if ((FunctionSamples::ProfileIsCSFlat || + FunctionSamples::ProfileIsProbeBased) && (Format == SPF_Binary || Format == SPF_Compact_Binary)) return sampleprof_error::unsupported_writing_format; diff --git a/llvm/lib/Support/AArch64TargetParser.cpp b/llvm/lib/Support/AArch64TargetParser.cpp index a3e41ccd199c..4bc9c8487131 100644 --- a/llvm/lib/Support/AArch64TargetParser.cpp +++ b/llvm/lib/Support/AArch64TargetParser.cpp @@ -240,4 +240,4 @@ AArch64::ArchKind AArch64::parseCPUArch(StringRef CPU) { return C.ArchID; } return ArchKind::INVALID; -}
\ No newline at end of file +} diff --git a/llvm/lib/Support/Caching.cpp b/llvm/lib/Support/Caching.cpp index a2fe37a26617..8c685640f791 100644 --- a/llvm/lib/Support/Caching.cpp +++ b/llvm/lib/Support/Caching.cpp @@ -79,14 +79,13 @@ Expected<FileCache> llvm::localCache(Twine CacheNameRef, struct CacheStream : CachedFileStream { AddBufferFn AddBuffer; sys::fs::TempFile TempFile; - std::string EntryPath; unsigned Task; CacheStream(std::unique_ptr<raw_pwrite_stream> OS, AddBufferFn AddBuffer, sys::fs::TempFile TempFile, std::string EntryPath, unsigned Task) - : CachedFileStream(std::move(OS)), AddBuffer(std::move(AddBuffer)), - TempFile(std::move(TempFile)), EntryPath(std::move(EntryPath)), + : CachedFileStream(std::move(OS), std::move(EntryPath)), + AddBuffer(std::move(AddBuffer)), TempFile(std::move(TempFile)), Task(Task) {} ~CacheStream() { @@ -99,7 +98,7 @@ Expected<FileCache> llvm::localCache(Twine CacheNameRef, // Open the file first to avoid racing with a cache pruner. ErrorOr<std::unique_ptr<MemoryBuffer>> MBOrErr = MemoryBuffer::getOpenFile( - sys::fs::convertFDToNativeFile(TempFile.FD), TempFile.TmpName, + sys::fs::convertFDToNativeFile(TempFile.FD), ObjectPathName, /*FileSize=*/-1, /*RequiresNullTerminator=*/false); if (!MBOrErr) report_fatal_error(Twine("Failed to open new cache file ") + @@ -115,14 +114,14 @@ Expected<FileCache> llvm::localCache(Twine CacheNameRef, // AddBuffer a copy of the bytes we wrote in that case. We do this // instead of just using the existing file, because the pruner might // delete the file before we get a chance to use it. 
- Error E = TempFile.keep(EntryPath); + Error E = TempFile.keep(ObjectPathName); E = handleErrors(std::move(E), [&](const ECError &E) -> Error { std::error_code EC = E.convertToErrorCode(); if (EC != errc::permission_denied) return errorCodeToError(EC); auto MBCopy = MemoryBuffer::getMemBufferCopy((*MBOrErr)->getBuffer(), - EntryPath); + ObjectPathName); MBOrErr = std::move(MBCopy); // FIXME: should we consume the discard error? @@ -133,7 +132,7 @@ Expected<FileCache> llvm::localCache(Twine CacheNameRef, if (E) report_fatal_error(Twine("Failed to rename temporary file ") + - TempFile.TmpName + " to " + EntryPath + ": " + + TempFile.TmpName + " to " + ObjectPathName + ": " + toString(std::move(E)) + "\n"); AddBuffer(Task, std::move(*MBOrErr)); diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp index 5b7004c86f5a..4153a69abf5d 100644 --- a/llvm/lib/Support/CommandLine.cpp +++ b/llvm/lib/Support/CommandLine.cpp @@ -1538,10 +1538,8 @@ bool CommandLineParser::ParseCommandLineOptions(int argc, ErrorParsing = true; } else { - for (SmallVectorImpl<Option *>::iterator I = SinkOpts.begin(), - E = SinkOpts.end(); - I != E; ++I) - (*I)->addOccurrence(i, "", StringRef(argv[i])); + for (Option *SinkOpt : SinkOpts) + SinkOpt->addOccurrence(i, "", StringRef(argv[i])); } continue; } @@ -2303,11 +2301,8 @@ protected: // Collect registered option categories into vector in preparation for // sorting. - for (auto I = GlobalParser->RegisteredOptionCategories.begin(), - E = GlobalParser->RegisteredOptionCategories.end(); - I != E; ++I) { - SortedCategories.push_back(*I); - } + for (OptionCategory *Category : GlobalParser->RegisteredOptionCategories) + SortedCategories.push_back(Category); // Sort the different option categories alphabetically. assert(SortedCategories.size() > 0 && "No option categories registered!"); @@ -2315,11 +2310,8 @@ protected: OptionCategoryCompare); // Create map to empty vectors. 
- for (std::vector<OptionCategory *>::const_iterator - I = SortedCategories.begin(), - E = SortedCategories.end(); - I != E; ++I) - CategorizedOptions[*I] = std::vector<Option *>(); + for (OptionCategory *Category : SortedCategories) + CategorizedOptions[Category] = std::vector<Option *>(); // Walk through pre-sorted options and assign into categories. // Because the options are already alphabetically sorted the @@ -2334,23 +2326,20 @@ protected: } // Now do printing. - for (std::vector<OptionCategory *>::const_iterator - Category = SortedCategories.begin(), - E = SortedCategories.end(); - Category != E; ++Category) { + for (OptionCategory *Category : SortedCategories) { // Hide empty categories for --help, but show for --help-hidden. - const auto &CategoryOptions = CategorizedOptions[*Category]; + const auto &CategoryOptions = CategorizedOptions[Category]; bool IsEmptyCategory = CategoryOptions.empty(); if (!ShowHidden && IsEmptyCategory) continue; // Print category information. outs() << "\n"; - outs() << (*Category)->getName() << ":\n"; + outs() << Category->getName() << ":\n"; // Check if description is set. 
- if (!(*Category)->getDescription().empty()) - outs() << (*Category)->getDescription() << "\n\n"; + if (!Category->getDescription().empty()) + outs() << Category->getDescription() << "\n\n"; else outs() << "\n"; diff --git a/llvm/lib/Support/Compression.cpp b/llvm/lib/Support/Compression.cpp index b8c77cf69b95..ccf6ef4bb662 100644 --- a/llvm/lib/Support/Compression.cpp +++ b/llvm/lib/Support/Compression.cpp @@ -49,14 +49,14 @@ bool zlib::isAvailable() { return true; } Error zlib::compress(StringRef InputBuffer, SmallVectorImpl<char> &CompressedBuffer, int Level) { unsigned long CompressedSize = ::compressBound(InputBuffer.size()); - CompressedBuffer.reserve(CompressedSize); + CompressedBuffer.resize_for_overwrite(CompressedSize); int Res = ::compress2((Bytef *)CompressedBuffer.data(), &CompressedSize, (const Bytef *)InputBuffer.data(), InputBuffer.size(), Level); // Tell MemorySanitizer that zlib output buffer is fully initialized. // This avoids a false report when running LLVM with uninstrumented ZLib. __msan_unpoison(CompressedBuffer.data(), CompressedSize); - CompressedBuffer.set_size(CompressedSize); + CompressedBuffer.truncate(CompressedSize); return Res ? 
createError(convertZlibCodeToString(Res)) : Error::success(); } @@ -74,10 +74,10 @@ Error zlib::uncompress(StringRef InputBuffer, char *UncompressedBuffer, Error zlib::uncompress(StringRef InputBuffer, SmallVectorImpl<char> &UncompressedBuffer, size_t UncompressedSize) { - UncompressedBuffer.reserve(UncompressedSize); + UncompressedBuffer.resize_for_overwrite(UncompressedSize); Error E = uncompress(InputBuffer, UncompressedBuffer.data(), UncompressedSize); - UncompressedBuffer.set_size(UncompressedSize); + UncompressedBuffer.truncate(UncompressedSize); return E; } diff --git a/llvm/lib/Support/ConvertUTFWrapper.cpp b/llvm/lib/Support/ConvertUTFWrapper.cpp index d8d46712a593..392c4c4890e1 100644 --- a/llvm/lib/Support/ConvertUTFWrapper.cpp +++ b/llvm/lib/Support/ConvertUTFWrapper.cpp @@ -103,8 +103,8 @@ bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out) { std::vector<UTF16> ByteSwapped; if (Src[0] == UNI_UTF16_BYTE_ORDER_MARK_SWAPPED) { ByteSwapped.insert(ByteSwapped.end(), Src, SrcEnd); - for (unsigned I = 0, E = ByteSwapped.size(); I != E; ++I) - ByteSwapped[I] = llvm::ByteSwap_16(ByteSwapped[I]); + for (UTF16 &I : ByteSwapped) + I = llvm::ByteSwap_16(I); Src = &ByteSwapped[0]; SrcEnd = &ByteSwapped[ByteSwapped.size() - 1] + 1; } diff --git a/llvm/lib/Support/DAGDeltaAlgorithm.cpp b/llvm/lib/Support/DAGDeltaAlgorithm.cpp index e5e6301d41cc..a6daee00bd43 100644 --- a/llvm/lib/Support/DAGDeltaAlgorithm.cpp +++ b/llvm/lib/Support/DAGDeltaAlgorithm.cpp @@ -180,22 +180,19 @@ DAGDeltaAlgorithmImpl::DAGDeltaAlgorithmImpl( DAGDeltaAlgorithm &DDA, const changeset_ty &Changes, const std::vector<edge_ty> &Dependencies) : DDA(DDA) { - for (changeset_ty::const_iterator it = Changes.begin(), - ie = Changes.end(); it != ie; ++it) { - Predecessors.insert(std::make_pair(*it, std::vector<change_ty>())); - Successors.insert(std::make_pair(*it, std::vector<change_ty>())); + for (change_ty Change : Changes) { + Predecessors.insert(std::make_pair(Change, 
std::vector<change_ty>())); + Successors.insert(std::make_pair(Change, std::vector<change_ty>())); } - for (std::vector<edge_ty>::const_iterator it = Dependencies.begin(), - ie = Dependencies.end(); it != ie; ++it) { - Predecessors[it->second].push_back(it->first); - Successors[it->first].push_back(it->second); + for (const edge_ty &Dep : Dependencies) { + Predecessors[Dep.second].push_back(Dep.first); + Successors[Dep.first].push_back(Dep.second); } // Compute the roots. - for (changeset_ty::const_iterator it = Changes.begin(), - ie = Changes.end(); it != ie; ++it) - if (succ_begin(*it) == succ_end(*it)) - Roots.push_back(*it); + for (change_ty Change : Changes) + if (succ_begin(Change) == succ_end(Change)) + Roots.push_back(Change); // Pre-compute the closure of the successor relation. std::vector<change_ty> Worklist(Roots.begin(), Roots.end()); @@ -213,14 +210,13 @@ DAGDeltaAlgorithmImpl::DAGDeltaAlgorithmImpl( } // Invert to form the predecessor closure map. - for (changeset_ty::const_iterator it = Changes.begin(), - ie = Changes.end(); it != ie; ++it) - PredClosure.insert(std::make_pair(*it, std::set<change_ty>())); - for (changeset_ty::const_iterator it = Changes.begin(), - ie = Changes.end(); it != ie; ++it) - for (succ_closure_iterator_ty it2 = succ_closure_begin(*it), - ie2 = succ_closure_end(*it); it2 != ie2; ++it2) - PredClosure[*it2].insert(*it); + for (change_ty Change : Changes) + PredClosure.insert(std::make_pair(Change, std::set<change_ty>())); + for (change_ty Change : Changes) + for (succ_closure_iterator_ty it2 = succ_closure_begin(Change), + ie2 = succ_closure_end(Change); + it2 != ie2; ++it2) + PredClosure[*it2].insert(Change); // Dump useful debug info. 
LLVM_DEBUG({ @@ -256,13 +252,12 @@ DAGDeltaAlgorithmImpl::DAGDeltaAlgorithmImpl( llvm::errs() << "]\n"; llvm::errs() << "Predecessor Closure:\n"; - for (changeset_ty::const_iterator it = Changes.begin(), ie = Changes.end(); - it != ie; ++it) { - llvm::errs() << format(" %-4d: [", *it); - for (pred_closure_iterator_ty it2 = pred_closure_begin(*it), - ie2 = pred_closure_end(*it); + for (change_ty Change : Changes) { + llvm::errs() << format(" %-4d: [", Change); + for (pred_closure_iterator_ty it2 = pred_closure_begin(Change), + ie2 = pred_closure_end(Change); it2 != ie2; ++it2) { - if (it2 != pred_closure_begin(*it)) + if (it2 != pred_closure_begin(Change)) llvm::errs() << ", "; llvm::errs() << *it2; } @@ -270,13 +265,12 @@ DAGDeltaAlgorithmImpl::DAGDeltaAlgorithmImpl( } llvm::errs() << "Successor Closure:\n"; - for (changeset_ty::const_iterator it = Changes.begin(), ie = Changes.end(); - it != ie; ++it) { - llvm::errs() << format(" %-4d: [", *it); - for (succ_closure_iterator_ty it2 = succ_closure_begin(*it), - ie2 = succ_closure_end(*it); + for (change_ty Change : Changes) { + llvm::errs() << format(" %-4d: [", Change); + for (succ_closure_iterator_ty it2 = succ_closure_begin(Change), + ie2 = succ_closure_end(Change); it2 != ie2; ++it2) { - if (it2 != succ_closure_begin(*it)) + if (it2 != succ_closure_begin(Change)) llvm::errs() << ", "; llvm::errs() << *it2; } @@ -291,9 +285,8 @@ bool DAGDeltaAlgorithmImpl::GetTestResult(const changeset_ty &Changes, const changeset_ty &Required) { changeset_ty Extended(Required); Extended.insert(Changes.begin(), Changes.end()); - for (changeset_ty::const_iterator it = Changes.begin(), - ie = Changes.end(); it != ie; ++it) - Extended.insert(pred_closure_begin(*it), pred_closure_end(*it)); + for (change_ty Change : Changes) + Extended.insert(pred_closure_begin(Change), pred_closure_end(Change)); if (FailedTestsCache.count(Extended)) return false; @@ -340,9 +333,8 @@ DAGDeltaAlgorithmImpl::Run() { // Replace the current set with the 
predecssors of the minimized set of // active changes. CurrentSet.clear(); - for (changeset_ty::const_iterator it = CurrentMinSet.begin(), - ie = CurrentMinSet.end(); it != ie; ++it) - CurrentSet.insert(pred_begin(*it), pred_end(*it)); + for (change_ty CT : CurrentMinSet) + CurrentSet.insert(pred_begin(CT), pred_end(CT)); // FIXME: We could enforce CurrentSet intersect Required == {} here if we // wanted to protect against cyclic graphs. diff --git a/llvm/lib/Support/DeltaAlgorithm.cpp b/llvm/lib/Support/DeltaAlgorithm.cpp index 6aee69f43405..a2017a10ab3f 100644 --- a/llvm/lib/Support/DeltaAlgorithm.cpp +++ b/llvm/lib/Support/DeltaAlgorithm.cpp @@ -57,9 +57,8 @@ DeltaAlgorithm::Delta(const changeset_ty &Changes, // Otherwise, partition the sets if possible; if not we are done. changesetlist_ty SplitSets; - for (changesetlist_ty::const_iterator it = Sets.begin(), - ie = Sets.end(); it != ie; ++it) - Split(*it, SplitSets); + for (const changeset_ty &Set : Sets) + Split(Set, SplitSets); if (SplitSets.size() == Sets.size()) return Changes; diff --git a/llvm/lib/Support/HTTPClient.cpp b/llvm/lib/Support/HTTPClient.cpp deleted file mode 100644 index 68ba56d1fe50..000000000000 --- a/llvm/lib/Support/HTTPClient.cpp +++ /dev/null @@ -1,97 +0,0 @@ -//===-- llvm/Support/HTTPClient.cpp - HTTP client library -------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// -/// This file defines the methods of the HTTPRequest, HTTPClient, and -/// BufferedHTTPResponseHandler classes. 
-/// -//===----------------------------------------------------------------------===// - -#include "llvm/Support/HTTPClient.h" -#include "llvm/ADT/APInt.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/Support/Errc.h" -#include "llvm/Support/Error.h" -#include "llvm/Support/MemoryBuffer.h" - -using namespace llvm; - -HTTPRequest::HTTPRequest(StringRef Url) { this->Url = Url.str(); } - -bool operator==(const HTTPRequest &A, const HTTPRequest &B) { - return A.Url == B.Url && A.Method == B.Method && - A.FollowRedirects == B.FollowRedirects; -} - -HTTPResponseHandler::~HTTPResponseHandler() = default; - -static inline bool parseContentLengthHeader(StringRef LineRef, - size_t &ContentLength) { - // Content-Length is a mandatory header, and the only one we handle. - return LineRef.consume_front("Content-Length: ") && - to_integer(LineRef.trim(), ContentLength, 10); -} - -Error BufferedHTTPResponseHandler::handleHeaderLine(StringRef HeaderLine) { - if (ResponseBuffer.Body) - return Error::success(); - - size_t ContentLength; - if (parseContentLengthHeader(HeaderLine, ContentLength)) - ResponseBuffer.Body = - WritableMemoryBuffer::getNewUninitMemBuffer(ContentLength); - - return Error::success(); -} - -Error BufferedHTTPResponseHandler::handleBodyChunk(StringRef BodyChunk) { - if (!ResponseBuffer.Body) - return createStringError(errc::io_error, - "Unallocated response buffer. 
HTTP Body data " - "received before Content-Length header."); - if (Offset + BodyChunk.size() > ResponseBuffer.Body->getBufferSize()) - return createStringError(errc::io_error, - "Content size exceeds buffer size."); - memcpy(ResponseBuffer.Body->getBufferStart() + Offset, BodyChunk.data(), - BodyChunk.size()); - Offset += BodyChunk.size(); - return Error::success(); -} - -Error BufferedHTTPResponseHandler::handleStatusCode(unsigned Code) { - ResponseBuffer.Code = Code; - return Error::success(); -} - -Expected<HTTPResponseBuffer> HTTPClient::perform(const HTTPRequest &Request) { - BufferedHTTPResponseHandler Handler; - if (Error Err = perform(Request, Handler)) - return std::move(Err); - return std::move(Handler.ResponseBuffer); -} - -Expected<HTTPResponseBuffer> HTTPClient::get(StringRef Url) { - HTTPRequest Request(Url); - return perform(Request); -} - -HTTPClient::HTTPClient() = default; - -HTTPClient::~HTTPClient() = default; - -bool HTTPClient::isAvailable() { return false; } - -void HTTPClient::cleanup() {} - -void HTTPClient::setTimeout(std::chrono::milliseconds Timeout) {} - -Error HTTPClient::perform(const HTTPRequest &Request, - HTTPResponseHandler &Handler) { - llvm_unreachable("No HTTP Client implementation available."); -} diff --git a/llvm/lib/Support/KnownBits.cpp b/llvm/lib/Support/KnownBits.cpp index 554e3248524c..8e154067abc0 100644 --- a/llvm/lib/Support/KnownBits.cpp +++ b/llvm/lib/Support/KnownBits.cpp @@ -420,11 +420,19 @@ KnownBits KnownBits::mul(const KnownBits &LHS, const KnownBits &RHS, assert((!SelfMultiply || (LHS.One == RHS.One && LHS.Zero == RHS.Zero)) && "Self multiplication knownbits mismatch"); - // Compute a conservative estimate for high known-0 bits. 
- unsigned LHSLeadZ = LHS.countMinLeadingZeros(); - unsigned RHSLeadZ = RHS.countMinLeadingZeros(); - unsigned LeadZ = std::max(LHSLeadZ + RHSLeadZ, BitWidth) - BitWidth; - assert(LeadZ <= BitWidth && "More zeros than bits?"); + // Compute the high known-0 bits by multiplying the unsigned max of each side. + // Conservatively, M active bits * N active bits results in M + N bits in the + // result. But if we know a value is a power-of-2 for example, then this + // computes one more leading zero. + // TODO: This could be generalized to number of sign bits (negative numbers). + APInt UMaxLHS = LHS.getMaxValue(); + APInt UMaxRHS = RHS.getMaxValue(); + + // For leading zeros in the result to be valid, the unsigned max product must + // fit in the bitwidth (it must not overflow). + bool HasOverflow; + APInt UMaxResult = UMaxLHS.umul_ov(UMaxRHS, HasOverflow); + unsigned LeadZ = HasOverflow ? 0 : UMaxResult.countLeadingZeros(); // The result of the bottom bits of an integer multiply can be // inferred by looking at the bottom bits of both operands and diff --git a/llvm/lib/Support/MemoryBuffer.cpp b/llvm/lib/Support/MemoryBuffer.cpp index bcf13d828a5d..d3fa3c6f065d 100644 --- a/llvm/lib/Support/MemoryBuffer.cpp +++ b/llvm/lib/Support/MemoryBuffer.cpp @@ -227,17 +227,20 @@ static ErrorOr<std::unique_ptr<WritableMemoryBuffer>> getMemoryBufferForStream(sys::fs::file_t FD, const Twine &BufferName) { const ssize_t ChunkSize = 4096*4; SmallString<ChunkSize> Buffer; + // Read into Buffer until we hit EOF. 
+ size_t Size = Buffer.size(); for (;;) { - Buffer.reserve(Buffer.size() + ChunkSize); + Buffer.resize_for_overwrite(Size + ChunkSize); Expected<size_t> ReadBytes = sys::fs::readNativeFile( - FD, makeMutableArrayRef(Buffer.end(), ChunkSize)); + FD, makeMutableArrayRef(Buffer.begin() + Size, ChunkSize)); if (!ReadBytes) return errorToErrorCode(ReadBytes.takeError()); if (*ReadBytes == 0) break; - Buffer.set_size(Buffer.size() + *ReadBytes); + Size += *ReadBytes; } + Buffer.truncate(Size); return getMemBufferCopyImpl(Buffer, BufferName); } diff --git a/llvm/lib/Support/NativeFormatting.cpp b/llvm/lib/Support/NativeFormatting.cpp index ae9f03745850..254d18d797b3 100644 --- a/llvm/lib/Support/NativeFormatting.cpp +++ b/llvm/lib/Support/NativeFormatting.cpp @@ -168,7 +168,7 @@ void llvm::write_double(raw_ostream &S, double N, FloatStyle Style, S << "nan"; return; } else if (std::isinf(N)) { - S << "INF"; + S << (std::signbit(N) ? "-INF" : "INF"); return; } diff --git a/llvm/lib/Support/Path.cpp b/llvm/lib/Support/Path.cpp index 3957547dfaaa..7c99d088911c 100644 --- a/llvm/lib/Support/Path.cpp +++ b/llvm/lib/Support/Path.cpp @@ -474,7 +474,7 @@ StringRef parent_path(StringRef path, Style style) { void remove_filename(SmallVectorImpl<char> &path, Style style) { size_t end_pos = parent_path_end(StringRef(path.begin(), path.size()), style); if (end_pos != StringRef::npos) - path.set_size(end_pos); + path.truncate(end_pos); } void replace_extension(SmallVectorImpl<char> &path, const Twine &extension, @@ -486,7 +486,7 @@ void replace_extension(SmallVectorImpl<char> &path, const Twine &extension, // Erase existing extension. size_t pos = p.find_last_of('.'); if (pos != StringRef::npos && pos >= filename_pos(p, style)) - path.set_size(pos); + path.truncate(pos); // Append '.' if needed. 
if (ext.size() > 0 && ext[0] != '.') diff --git a/llvm/lib/Support/RISCVISAInfo.cpp b/llvm/lib/Support/RISCVISAInfo.cpp index 8e984002f90d..e2e4340f44e9 100644 --- a/llvm/lib/Support/RISCVISAInfo.cpp +++ b/llvm/lib/Support/RISCVISAInfo.cpp @@ -61,7 +61,6 @@ static const RISCVSupportedExtension SupportedExperimentalExtensions[] = { {"zbs", RISCVExtensionVersion{1, 0}}, {"zbt", RISCVExtensionVersion{0, 93}}, - {"zvamo", RISCVExtensionVersion{0, 10}}, {"zvlsseg", RISCVExtensionVersion{0, 10}}, {"zfhmin", RISCVExtensionVersion{0, 1}}, @@ -72,6 +71,28 @@ static bool stripExperimentalPrefix(StringRef &Ext) { return Ext.consume_front("experimental-"); } +// This function finds the first character that doesn't belong to a version +// (e.g. zbe0p93 is extension 'zbe' of version '0p93'). So the function will +// consume [0-9]*p[0-9]* starting from the backward. An extension name will not +// end with a digit or the letter 'p', so this function will parse correctly. +// NOTE: This function is NOT able to take empty strings or strings that only +// have version numbers and no extension name. It assumes the extension name +// will be at least more than one character. 
+static size_t findFirstNonVersionCharacter(const StringRef &Ext) { + if (Ext.size() == 0) + llvm_unreachable("Already guarded by if-statement in ::parseArchString"); + + int Pos = Ext.size() - 1; + while (Pos > 0 && isDigit(Ext[Pos])) + Pos--; + if (Pos > 0 && Ext[Pos] == 'p' && isDigit(Ext[Pos - 1])) { + Pos--; + while (Pos > 0 && isDigit(Ext[Pos])) + Pos--; + } + return Pos; +} + struct FindByName { FindByName(StringRef Ext) : Ext(Ext){}; StringRef Ext; @@ -264,10 +285,6 @@ void RISCVISAInfo::toFeatures( if (ExtName == "zvlsseg") { Features.push_back("+experimental-v"); Features.push_back("+experimental-zvlsseg"); - } else if (ExtName == "zvamo") { - Features.push_back("+experimental-v"); - Features.push_back("+experimental-zvlsseg"); - Features.push_back("+experimental-zvamo"); } else if (isExperimentalExtension(ExtName)) { Features.push_back(StrAlloc("+experimental-" + ExtName)); } else { @@ -390,7 +407,6 @@ RISCVISAInfo::parseFeatures(unsigned XLen, assert(XLen == 32 || XLen == 64); std::unique_ptr<RISCVISAInfo> ISAInfo(new RISCVISAInfo(XLen)); - bool HasE = false; for (auto &Feature : Features) { StringRef ExtName = Feature; bool Experimental = false; @@ -409,29 +425,19 @@ RISCVISAInfo::parseFeatures(unsigned XLen, if (ExtensionInfoIterator == ExtensionInfos.end()) continue; - if (Add) { - if (ExtName == "e") { - if (XLen != 32) - return createStringError( - errc::invalid_argument, - "standard user-level extension 'e' requires 'rv32'"); - HasE = true; - } - + if (Add) ISAInfo->addExtension(ExtName, ExtensionInfoIterator->Version.Major, ExtensionInfoIterator->Version.Minor); - } else - ISAInfo->Exts.erase(ExtName.str()); - } - if (!HasE) { - if (auto Version = findDefaultVersion("i")) - ISAInfo->addExtension("i", Version->Major, Version->Minor); else - llvm_unreachable("Default extension version for 'i' not found?"); + ISAInfo->Exts.erase(ExtName.str()); } + ISAInfo->updateImplication(); ISAInfo->updateFLen(); + if (Error Result = ISAInfo->checkDependency()) 
+ return std::move(Result); + return std::move(ISAInfo); } @@ -457,7 +463,6 @@ RISCVISAInfo::parseArchString(StringRef Arch, bool EnableExperimentalExtension, // The canonical order specified in ISA manual. // Ref: Table 22.1 in RISC-V User-Level ISA V2.2 StringRef StdExts = AllStdExts; - bool HasF = false, HasD = false; char Baseline = Arch[4]; // First letter should be 'e', 'i' or 'g'. @@ -478,8 +483,6 @@ RISCVISAInfo::parseArchString(StringRef Arch, bool EnableExperimentalExtension, case 'g': // g = imafd StdExts = StdExts.drop_front(4); - HasF = true; - HasD = true; break; } @@ -560,34 +563,14 @@ RISCVISAInfo::parseArchString(StringRef Arch, bool EnableExperimentalExtension, // The order is OK, then push it into features. // TODO: Use version number when setting target features - switch (C) { - default: - // Currently LLVM supports only "mafdcbv". + // Currently LLVM supports only "mafdcbv". + StringRef SupportedStandardExtension = "mafdcbv"; + if (!SupportedStandardExtension.contains(C)) return createStringError(errc::invalid_argument, "unsupported standard user-level extension '%c'", C); - case 'm': - ISAInfo->addExtension("m", Major, Minor); - break; - case 'a': - ISAInfo->addExtension("a", Major, Minor); - break; - case 'f': - ISAInfo->addExtension("f", Major, Minor); - HasF = true; - break; - case 'd': - ISAInfo->addExtension("d", Major, Minor); - HasD = true; - break; - case 'c': - ISAInfo->addExtension("c", Major, Minor); - break; - case 'v': - ISAInfo->addExtension("v", Major, Minor); - ISAInfo->addExtension("zvlsseg", Major, Minor); - break; - } + ISAInfo->addExtension(std::string(1, C), Major, Minor); + // Consume full extension name and version, including any optional '_' // between this extension and the next ++I; @@ -595,21 +578,6 @@ RISCVISAInfo::parseArchString(StringRef Arch, bool EnableExperimentalExtension, if (*I == '_') ++I; } - // Dependency check. 
- // It's illegal to specify the 'd' (double-precision floating point) - // extension without also specifying the 'f' (single precision - // floating-point) extension. - // TODO: This has been removed in later specs, which specify that D implies F - if (HasD && !HasF) - return createStringError(errc::invalid_argument, - "d requires f extension to also be specified"); - - // Additional dependency checks. - // TODO: The 'q' extension requires rv64. - // TODO: It is illegal to specify 'e' extensions with 'f' and 'd'. - - if (OtherExts.empty()) - return std::move(ISAInfo); // Handle other types of extensions other than the standard // general purpose and standard user-level extensions. @@ -630,52 +598,53 @@ RISCVISAInfo::parseArchString(StringRef Arch, bool EnableExperimentalExtension, std::array<StringRef, 4> Prefix{"z", "x", "s", "sx"}; auto I = Prefix.begin(); auto E = Prefix.end(); + if (Split.size() > 1 || Split[0] != "") { + for (StringRef Ext : Split) { + if (Ext.empty()) + return createStringError(errc::invalid_argument, + "extension name missing after separator '_'"); - for (StringRef Ext : Split) { - if (Ext.empty()) - return createStringError(errc::invalid_argument, - "extension name missing after separator '_'"); + StringRef Type = getExtensionType(Ext); + StringRef Desc = getExtensionTypeDesc(Ext); + auto Pos = findFirstNonVersionCharacter(Ext) + 1; + StringRef Name(Ext.substr(0, Pos)); + StringRef Vers(Ext.substr(Pos)); - StringRef Type = getExtensionType(Ext); - StringRef Desc = getExtensionTypeDesc(Ext); - auto Pos = Ext.find_if(isDigit); - StringRef Name(Ext.substr(0, Pos)); - StringRef Vers(Ext.substr(Pos)); + if (Type.empty()) + return createStringError(errc::invalid_argument, + "invalid extension prefix '" + Ext + "'"); - if (Type.empty()) - return createStringError(errc::invalid_argument, - "invalid extension prefix '" + Ext + "'"); + // Check ISA extensions are specified in the canonical order. 
+ while (I != E && *I != Type) + ++I; - // Check ISA extensions are specified in the canonical order. - while (I != E && *I != Type) - ++I; + if (I == E) + return createStringError(errc::invalid_argument, + "%s not given in canonical order '%s'", + Desc.str().c_str(), Ext.str().c_str()); - if (I == E) - return createStringError(errc::invalid_argument, - "%s not given in canonical order '%s'", - Desc.str().c_str(), Ext.str().c_str()); - - if (Name.size() == Type.size()) { - return createStringError(errc::invalid_argument, - "%s name missing after '%s'", Desc.str().c_str(), - Type.str().c_str()); - } + if (Name.size() == Type.size()) { + return createStringError(errc::invalid_argument, + "%s name missing after '%s'", + Desc.str().c_str(), Type.str().c_str()); + } - unsigned Major, Minor, ConsumeLength; - if (auto E = getExtensionVersion(Name, Vers, Major, Minor, ConsumeLength, - EnableExperimentalExtension, - ExperimentalExtensionVersionCheck)) - return std::move(E); + unsigned Major, Minor, ConsumeLength; + if (auto E = getExtensionVersion(Name, Vers, Major, Minor, ConsumeLength, + EnableExperimentalExtension, + ExperimentalExtensionVersionCheck)) + return std::move(E); - // Check if duplicated extension. - if (llvm::is_contained(AllExts, Name)) - return createStringError(errc::invalid_argument, "duplicated %s '%s'", - Desc.str().c_str(), Name.str().c_str()); + // Check if duplicated extension. + if (llvm::is_contained(AllExts, Name)) + return createStringError(errc::invalid_argument, "duplicated %s '%s'", + Desc.str().c_str(), Name.str().c_str()); - ISAInfo->addExtension(Name, Major, Minor); - // Extension format is correct, keep parsing the extensions. - // TODO: Save Type, Name, Major, Minor to avoid parsing them later. - AllExts.push_back(Name); + ISAInfo->addExtension(Name, Major, Minor); + // Extension format is correct, keep parsing the extensions. + // TODO: Save Type, Name, Major, Minor to avoid parsing them later. 
+ AllExts.push_back(Name); + } } for (auto Ext : AllExts) { @@ -686,11 +655,83 @@ RISCVISAInfo::parseArchString(StringRef Arch, bool EnableExperimentalExtension, } } + ISAInfo->updateImplication(); ISAInfo->updateFLen(); + if (Error Result = ISAInfo->checkDependency()) + return std::move(Result); + return std::move(ISAInfo); } +Error RISCVISAInfo::checkDependency() { + bool IsRv32 = XLen == 32; + bool HasE = Exts.count("e") == 1; + bool HasD = Exts.count("d") == 1; + bool HasF = Exts.count("f") == 1; + + if (HasE && !IsRv32) + return createStringError( + errc::invalid_argument, + "standard user-level extension 'e' requires 'rv32'"); + + // It's illegal to specify the 'd' (double-precision floating point) + // extension without also specifying the 'f' (single precision + // floating-point) extension. + // TODO: This has been removed in later specs, which specify that D implies F + if (HasD && !HasF) + return createStringError(errc::invalid_argument, + "d requires f extension to also be specified"); + + // Additional dependency checks. + // TODO: The 'q' extension requires rv64. + // TODO: It is illegal to specify 'e' extensions with 'f' and 'd'. 
+ + return Error::success(); +} + +static const char *ImpliedExtsV[] = {"zvlsseg"}; +static const char *ImpliedExtsZfh[] = {"zfhmin"}; + +struct ImpliedExtsEntry { + StringLiteral Name; + ArrayRef<const char *> Exts; + + bool operator<(const ImpliedExtsEntry &Other) const { + return Name < Other.Name; + } + + bool operator<(StringRef Other) const { return Name < Other; } +}; + +static constexpr ImpliedExtsEntry ImpliedExts[] = { + {{"v"}, {ImpliedExtsV}}, + {{"zfh"}, {ImpliedExtsZfh}}, +}; + +void RISCVISAInfo::updateImplication() { + bool HasE = Exts.count("e") == 1; + bool HasI = Exts.count("i") == 1; + + // If not in e extension and i extension does not exist, i extension is + // implied + if (!HasE && !HasI) { + auto Version = findDefaultVersion("i"); + addExtension("i", Version->Major, Version->Minor); + } + + assert(llvm::is_sorted(ImpliedExts) && "Table not sorted by Name"); + for (auto &Ext : Exts) { + auto I = llvm::lower_bound(ImpliedExts, Ext.first); + if (I != std::end(ImpliedExts) && I->Name == Ext.first) { + for (auto &ImpliedExt : I->Exts) { + auto Version = findDefaultVersion(ImpliedExt); + addExtension(ImpliedExt, Version->Major, Version->Minor); + } + } + } +} + void RISCVISAInfo::updateFLen() { FLen = 0; // TODO: Handle q extension. diff --git a/llvm/lib/Support/ScopedPrinter.cpp b/llvm/lib/Support/ScopedPrinter.cpp index 779c6c45257d..ea90a24eaced 100644 --- a/llvm/lib/Support/ScopedPrinter.cpp +++ b/llvm/lib/Support/ScopedPrinter.cpp @@ -43,4 +43,14 @@ void ScopedPrinter::printBinaryImpl(StringRef Label, StringRef Str, } } +JSONScopedPrinter::JSONScopedPrinter( + raw_ostream &OS, bool PrettyPrint, + std::unique_ptr<DelimitedScope> &&OuterScope) + : ScopedPrinter(OS, ScopedPrinter::ScopedPrinterKind::JSON), + JOS(OS, /*Indent=*/PrettyPrint ? 
2 : 0), + OuterScope(std::move(OuterScope)) { + if (this->OuterScope) + this->OuterScope->setPrinter(*this); +} + } // namespace llvm diff --git a/llvm/lib/Support/Signals.cpp b/llvm/lib/Support/Signals.cpp index dd4dded4cd1d..c018dc92bf40 100644 --- a/llvm/lib/Support/Signals.cpp +++ b/llvm/lib/Support/Signals.cpp @@ -87,8 +87,7 @@ static CallbackAndCookie CallBacksToRun[MaxSignalHandlerCallbacks]; // Signal-safe. void sys::RunSignalHandlers() { - for (size_t I = 0; I < MaxSignalHandlerCallbacks; ++I) { - auto &RunMe = CallBacksToRun[I]; + for (CallbackAndCookie &RunMe : CallBacksToRun) { auto Expected = CallbackAndCookie::Status::Initialized; auto Desired = CallbackAndCookie::Status::Executing; if (!RunMe.Flag.compare_exchange_strong(Expected, Desired)) @@ -103,8 +102,7 @@ void sys::RunSignalHandlers() { // Signal-safe. static void insertSignalHandler(sys::SignalHandlerCallback FnPtr, void *Cookie) { - for (size_t I = 0; I < MaxSignalHandlerCallbacks; ++I) { - auto &SetMe = CallBacksToRun[I]; + for (CallbackAndCookie &SetMe : CallBacksToRun) { auto Expected = CallbackAndCookie::Status::Empty; auto Desired = CallbackAndCookie::Status::Initializing; if (!SetMe.Flag.compare_exchange_strong(Expected, Desired)) diff --git a/llvm/lib/Support/SourceMgr.cpp b/llvm/lib/Support/SourceMgr.cpp index 89b7dc939dfc..2eb2989b200b 100644 --- a/llvm/lib/Support/SourceMgr.cpp +++ b/llvm/lib/Support/SourceMgr.cpp @@ -292,8 +292,7 @@ SMDiagnostic SourceMgr::GetMessage(SMLoc Loc, SourceMgr::DiagKind Kind, // Convert any ranges to column ranges that only intersect the line of the // location. 
- for (unsigned i = 0, e = Ranges.size(); i != e; ++i) { - SMRange R = Ranges[i]; + for (SMRange R : Ranges) { if (!R.isValid()) continue; diff --git a/llvm/lib/Support/Statistic.cpp b/llvm/lib/Support/Statistic.cpp index d95c8642c16e..95ee885d2f8f 100644 --- a/llvm/lib/Support/Statistic.cpp +++ b/llvm/lib/Support/Statistic.cpp @@ -177,11 +177,10 @@ void llvm::PrintStatistics(raw_ostream &OS) { // Figure out how long the biggest Value and Name fields are. unsigned MaxDebugTypeLen = 0, MaxValLen = 0; - for (size_t i = 0, e = Stats.Stats.size(); i != e; ++i) { - MaxValLen = std::max(MaxValLen, - (unsigned)utostr(Stats.Stats[i]->getValue()).size()); - MaxDebugTypeLen = std::max(MaxDebugTypeLen, - (unsigned)std::strlen(Stats.Stats[i]->getDebugType())); + for (TrackingStatistic *Stat : Stats.Stats) { + MaxValLen = std::max(MaxValLen, (unsigned)utostr(Stat->getValue()).size()); + MaxDebugTypeLen = + std::max(MaxDebugTypeLen, (unsigned)std::strlen(Stat->getDebugType())); } Stats.sort(); @@ -192,11 +191,9 @@ void llvm::PrintStatistics(raw_ostream &OS) { << "===" << std::string(73, '-') << "===\n\n"; // Print all of the statistics. - for (size_t i = 0, e = Stats.Stats.size(); i != e; ++i) - OS << format("%*u %-*s - %s\n", - MaxValLen, Stats.Stats[i]->getValue(), - MaxDebugTypeLen, Stats.Stats[i]->getDebugType(), - Stats.Stats[i]->getDesc()); + for (TrackingStatistic *Stat : Stats.Stats) + OS << format("%*u %-*s - %s\n", MaxValLen, Stat->getValue(), + MaxDebugTypeLen, Stat->getDebugType(), Stat->getDesc()); OS << '\n'; // Flush the output stream. 
OS.flush(); diff --git a/llvm/lib/Support/TargetParser.cpp b/llvm/lib/Support/TargetParser.cpp index 4acc23dd455b..bc60bdea5f62 100644 --- a/llvm/lib/Support/TargetParser.cpp +++ b/llvm/lib/Support/TargetParser.cpp @@ -331,6 +331,21 @@ bool getCPUFeaturesExceptStdExt(CPUKind Kind, return true; } +StringRef computeDefaultABIFromArch(const llvm::RISCVISAInfo &ISAInfo) { + if (ISAInfo.getXLen() == 32) { + if (ISAInfo.hasExtension("d")) + return "ilp32d"; + if (ISAInfo.hasExtension("e")) + return "ilp32e"; + return "ilp32"; + } else if (ISAInfo.getXLen() == 64) { + if (ISAInfo.hasExtension("d")) + return "lp64d"; + return "lp64"; + } + llvm_unreachable("Invalid XLEN"); +} + } // namespace RISCV } // namespace llvm diff --git a/llvm/lib/Support/ThreadPool.cpp b/llvm/lib/Support/ThreadPool.cpp index c11e16d3cf98..54ea84d4bd6d 100644 --- a/llvm/lib/Support/ThreadPool.cpp +++ b/llvm/lib/Support/ThreadPool.cpp @@ -21,13 +21,17 @@ using namespace llvm; #if LLVM_ENABLE_THREADS ThreadPool::ThreadPool(ThreadPoolStrategy S) - : ThreadCount(S.compute_thread_count()) { - // Create ThreadCount threads that will loop forever, wait on QueueCondition - // for tasks to be queued or the Pool to be destroyed. - Threads.reserve(ThreadCount); - for (unsigned ThreadID = 0; ThreadID < ThreadCount; ++ThreadID) { - Threads.emplace_back([S, ThreadID, this] { - S.apply_thread_strategy(ThreadID); + : Strategy(S), MaxThreadCount(S.compute_thread_count()) {} + +void ThreadPool::grow(int requested) { + std::unique_lock<std::mutex> LockGuard(ThreadsLock); + if (Threads.size() >= MaxThreadCount) + return; // Already hit the max thread pool size. 
+ int newThreadCount = std::min<int>(requested, MaxThreadCount); + while (static_cast<int>(Threads.size()) < newThreadCount) { + int ThreadID = Threads.size(); + Threads.emplace_back([this, ThreadID] { + Strategy.apply_thread_strategy(ThreadID); while (true) { std::function<void()> Task; { @@ -73,6 +77,7 @@ void ThreadPool::wait() { } bool ThreadPool::isWorkerThread() const { + std::unique_lock<std::mutex> LockGuard(ThreadsLock); llvm::thread::id CurrentThreadId = llvm::this_thread::get_id(); for (const llvm::thread &Thread : Threads) if (CurrentThreadId == Thread.get_id()) @@ -87,6 +92,7 @@ ThreadPool::~ThreadPool() { EnableFlag = false; } QueueCondition.notify_all(); + std::unique_lock<std::mutex> LockGuard(ThreadsLock); for (auto &Worker : Threads) Worker.join(); } @@ -94,8 +100,8 @@ ThreadPool::~ThreadPool() { #else // LLVM_ENABLE_THREADS Disabled // No threads are launched, issue a warning if ThreadCount is not 0 -ThreadPool::ThreadPool(ThreadPoolStrategy S) - : ThreadCount(S.compute_thread_count()) { +ThreadPool::ThreadPool(ThreadPoolStrategy S) : MaxThreadCount(1) { + int ThreadCount = S.compute_thread_count(); if (ThreadCount != 1) { errs() << "Warning: request a ThreadPool with " << ThreadCount << " threads, but LLVM_ENABLE_THREADS has been turned off\n"; diff --git a/llvm/lib/Support/Triple.cpp b/llvm/lib/Support/Triple.cpp index b9a92e280576..2819dc0c139a 100644 --- a/llvm/lib/Support/Triple.cpp +++ b/llvm/lib/Support/Triple.cpp @@ -989,10 +989,9 @@ std::string Triple::normalize(StringRef Str) { } // Replace empty components with "unknown" value. - for (unsigned i = 0, e = Components.size(); i < e; ++i) { - if (Components[i].empty()) - Components[i] = "unknown"; - } + for (StringRef &C : Components) + if (C.empty()) + C = "unknown"; // Special case logic goes here. At this point Arch, Vendor and OS have the // correct values for the computed components. 
@@ -1091,53 +1090,22 @@ StringRef Triple::getOSAndEnvironmentName() const { return Tmp.split('-').second; // Strip second component } -static unsigned EatNumber(StringRef &Str) { - assert(!Str.empty() && isDigit(Str[0]) && "Not a number"); - unsigned Result = 0; - - do { - // Consume the leading digit. - Result = Result*10 + (Str[0] - '0'); - - // Eat the digit. - Str = Str.substr(1); - } while (!Str.empty() && isDigit(Str[0])); - - return Result; -} - -static void parseVersionFromName(StringRef Name, unsigned &Major, - unsigned &Minor, unsigned &Micro) { - // Any unset version defaults to 0. - Major = Minor = Micro = 0; - - // Parse up to three components. - unsigned *Components[3] = {&Major, &Minor, &Micro}; - for (unsigned i = 0; i != 3; ++i) { - if (Name.empty() || Name[0] < '0' || Name[0] > '9') - break; - - // Consume the leading number. - *Components[i] = EatNumber(Name); - - // Consume the separator, if present. - if (Name.startswith(".")) - Name = Name.substr(1); - } +static VersionTuple parseVersionFromName(StringRef Name) { + VersionTuple Version; + Version.tryParse(Name); + return Version.withoutBuild(); } -void Triple::getEnvironmentVersion(unsigned &Major, unsigned &Minor, - unsigned &Micro) const { +VersionTuple Triple::getEnvironmentVersion() const { StringRef EnvironmentName = getEnvironmentName(); StringRef EnvironmentTypeName = getEnvironmentTypeName(getEnvironment()); if (EnvironmentName.startswith(EnvironmentTypeName)) EnvironmentName = EnvironmentName.substr(EnvironmentTypeName.size()); - parseVersionFromName(EnvironmentName, Major, Minor, Micro); + return parseVersionFromName(EnvironmentName); } -void Triple::getOSVersion(unsigned &Major, unsigned &Minor, - unsigned &Micro) const { +VersionTuple Triple::getOSVersion() const { StringRef OSName = getOSName(); // Assume that the OS portion of the triple starts with the canonical name. 
StringRef OSTypeName = getOSTypeName(getOS()); @@ -1146,40 +1114,36 @@ void Triple::getOSVersion(unsigned &Major, unsigned &Minor, else if (getOS() == MacOSX) OSName.consume_front("macos"); - parseVersionFromName(OSName, Major, Minor, Micro); + return parseVersionFromName(OSName); } -bool Triple::getMacOSXVersion(unsigned &Major, unsigned &Minor, - unsigned &Micro) const { - getOSVersion(Major, Minor, Micro); +bool Triple::getMacOSXVersion(VersionTuple &Version) const { + Version = getOSVersion(); switch (getOS()) { default: llvm_unreachable("unexpected OS for Darwin triple"); case Darwin: // Default to darwin8, i.e., MacOSX 10.4. - if (Major == 0) - Major = 8; + if (Version.getMajor() == 0) + Version = VersionTuple(8); // Darwin version numbers are skewed from OS X versions. - if (Major < 4) + if (Version.getMajor() < 4) { return false; - if (Major <= 19) { - Micro = 0; - Minor = Major - 4; - Major = 10; + } + if (Version.getMajor() <= 19) { + Version = VersionTuple(10, Version.getMajor() - 4); } else { - Micro = 0; - Minor = 0; // darwin20+ corresponds to macOS 11+. - Major = 11 + Major - 20; + Version = VersionTuple(11 + Version.getMajor() - 20); } break; case MacOSX: // Default to 10.4. - if (Major == 0) { - Major = 10; - Minor = 4; - } else if (Major < 10) + if (Version.getMajor() == 0) { + Version = VersionTuple(10, 4); + } else if (Version.getMajor() < 10) { return false; + } break; case IOS: case TvOS: @@ -1188,16 +1152,13 @@ bool Triple::getMacOSXVersion(unsigned &Major, unsigned &Minor, // the clang driver combines OS X and IOS support into a common Darwin // toolchain that wants to know the OS X version number even when targeting // IOS. 
- Major = 10; - Minor = 4; - Micro = 0; + Version = VersionTuple(10, 4); break; } return true; } -void Triple::getiOSVersion(unsigned &Major, unsigned &Minor, - unsigned &Micro) const { +VersionTuple Triple::getiOSVersion() const { switch (getOS()) { default: llvm_unreachable("unexpected OS for Darwin triple"); case Darwin: @@ -1206,24 +1167,21 @@ void Triple::getiOSVersion(unsigned &Major, unsigned &Minor, // the clang driver combines OS X and IOS support into a common Darwin // toolchain that wants to know the iOS version number even when targeting // OS X. - Major = 5; - Minor = 0; - Micro = 0; - break; + return VersionTuple(5); case IOS: - case TvOS: - getOSVersion(Major, Minor, Micro); + case TvOS: { + VersionTuple Version = getOSVersion(); // Default to 5.0 (or 7.0 for arm64). - if (Major == 0) - Major = (getArch() == aarch64) ? 7 : 5; - break; + if (Version.getMajor() == 0) + return (getArch() == aarch64) ? VersionTuple(7) : VersionTuple(5); + return Version; + } case WatchOS: llvm_unreachable("conflicting triple info"); } } -void Triple::getWatchOSVersion(unsigned &Major, unsigned &Minor, - unsigned &Micro) const { +VersionTuple Triple::getWatchOSVersion() const { switch (getOS()) { default: llvm_unreachable("unexpected OS for Darwin triple"); case Darwin: @@ -1232,15 +1190,13 @@ void Triple::getWatchOSVersion(unsigned &Major, unsigned &Minor, // the clang driver combines OS X and IOS support into a common Darwin // toolchain that wants to know the iOS version number even when targeting // OS X. 
- Major = 2; - Minor = 0; - Micro = 0; - break; - case WatchOS: - getOSVersion(Major, Minor, Micro); - if (Major == 0) - Major = 2; - break; + return VersionTuple(2); + case WatchOS: { + VersionTuple Version = getOSVersion(); + if (Version.getMajor() == 0) + return VersionTuple(2); + return Version; + } case IOS: llvm_unreachable("conflicting triple info"); } diff --git a/llvm/lib/Support/Unix/Path.inc b/llvm/lib/Support/Unix/Path.inc index 19d89db55627..f5cb5895d95d 100644 --- a/llvm/lib/Support/Unix/Path.inc +++ b/llvm/lib/Support/Unix/Path.inc @@ -590,19 +590,6 @@ std::error_code rename(const Twine &from, const Twine &to) { } std::error_code resize_file(int FD, uint64_t Size) { -#if defined(HAVE_POSIX_FALLOCATE) - // If we have posix_fallocate use it. Unlike ftruncate it always allocates - // space, so we get an error if the disk is full. - if (int Err = ::posix_fallocate(FD, 0, Size)) { -#ifdef _AIX - constexpr int NotSupportedError = ENOTSUP; -#else - constexpr int NotSupportedError = EOPNOTSUPP; -#endif - if (Err != EINVAL && Err != NotSupportedError) - return std::error_code(Err, std::generic_category()); - } -#endif // Use ftruncate as a fallback. It may or may not allocate space. At least on // OS X with HFS+ it does. 
if (::ftruncate(FD, Size) == -1) diff --git a/llvm/lib/Support/VirtualFileSystem.cpp b/llvm/lib/Support/VirtualFileSystem.cpp index 9bf0384b5f1b..bec4e8dbe06c 100644 --- a/llvm/lib/Support/VirtualFileSystem.cpp +++ b/llvm/lib/Support/VirtualFileSystem.cpp @@ -75,6 +75,12 @@ Status::Status(const Twine &Name, UniqueID UID, sys::TimePoint<> MTime, : Name(Name.str()), UID(UID), MTime(MTime), User(User), Group(Group), Size(Size), Type(Type), Perms(Perms) {} +Status Status::copyWithNewSize(const Status &In, uint64_t NewSize) { + return Status(In.getName(), In.getUniqueID(), In.getLastModificationTime(), + In.getUser(), In.getGroup(), NewSize, In.getType(), + In.getPermissions()); +} + Status Status::copyWithNewName(const Status &In, const Twine &NewName) { return Status(NewName, In.getUniqueID(), In.getLastModificationTime(), In.getUser(), In.getGroup(), In.getSize(), In.getType(), diff --git a/llvm/lib/Support/YAMLParser.cpp b/llvm/lib/Support/YAMLParser.cpp index f68ba0d065c1..2adf37a511d1 100644 --- a/llvm/lib/Support/YAMLParser.cpp +++ b/llvm/lib/Support/YAMLParser.cpp @@ -1876,8 +1876,8 @@ document_iterator Stream::end() { } void Stream::skip() { - for (document_iterator i = begin(), e = end(); i != e; ++i) - i->skip(); + for (Document &Doc : *this) + Doc.skip(); } Node::Node(unsigned int Type, std::unique_ptr<Document> &D, StringRef A, diff --git a/llvm/lib/TableGen/StringMatcher.cpp b/llvm/lib/TableGen/StringMatcher.cpp index 7f30c7b60752..7474c5dfe885 100644 --- a/llvm/lib/TableGen/StringMatcher.cpp +++ b/llvm/lib/TableGen/StringMatcher.cpp @@ -32,8 +32,8 @@ FindFirstNonCommonLetter(const std::vector<const // Check to see if letter i is the same across the set. 
char Letter = Matches[0]->first[i]; - for (unsigned str = 0, e = Matches.size(); str != e; ++str) - if (Matches[str]->first[i] != Letter) + for (const StringMatcher::StringPair *Match : Matches) + if (Match->first[i] != Letter) return i; } @@ -75,9 +75,8 @@ bool StringMatcher::EmitStringMatcherForChar( // Bucket the matches by the character we are comparing. std::map<char, std::vector<const StringPair*>> MatchesByLetter; - for (unsigned i = 0, e = Matches.size(); i != e; ++i) - MatchesByLetter[Matches[i]->first[CharNo]].push_back(Matches[i]); - + for (const StringPair *Match : Matches) + MatchesByLetter[Match->first[CharNo]].push_back(Match); // If we have exactly one bucket to match, see how many characters are common // across the whole set and match all of them at once. @@ -135,8 +134,8 @@ void StringMatcher::Emit(unsigned Indent, bool IgnoreDuplicates) const { // First level categorization: group strings by length. std::map<unsigned, std::vector<const StringPair*>> MatchesByLength; - for (unsigned i = 0, e = Matches.size(); i != e; ++i) - MatchesByLength[Matches[i].first.size()].push_back(&Matches[i]); + for (const StringPair &Match : Matches) + MatchesByLength[Match.first.size()].push_back(&Match); // Output a switch statement on length and categorize the elements within each // bin. diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index 548e4e0c9389..cb17fd94c335 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -455,6 +455,9 @@ def FeatureEL2VMSA : SubtargetFeature<"el2vmsa", "HasEL2VMSA", "true", def FeatureEL3 : SubtargetFeature<"el3", "HasEL3", "true", "Enable Exception Level 3">; +def FeatureFixCortexA53_835769 : SubtargetFeature<"fix-cortex-a53-835769", + "FixCortexA53_835769", "true", "Mitigate Cortex-A53 Erratum 835769">; + //===----------------------------------------------------------------------===// // Architectures. 
// diff --git a/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp b/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp index 7fd51a98ad94..4cdf5f144437 100644 --- a/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp +++ b/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp @@ -15,6 +15,7 @@ //===----------------------------------------------------------------------===// #include "AArch64.h" +#include "AArch64Subtarget.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -116,8 +117,13 @@ INITIALIZE_PASS(AArch64A53Fix835769, "aarch64-fix-cortex-a53-835769-pass", bool AArch64A53Fix835769::runOnMachineFunction(MachineFunction &F) { LLVM_DEBUG(dbgs() << "***** AArch64A53Fix835769 *****\n"); + auto &STI = F.getSubtarget<AArch64Subtarget>(); + // Fix not requested, skip pass. + if (!STI.fixCortexA53_835769()) + return false; + bool Changed = false; - TII = F.getSubtarget().getInstrInfo(); + TII = STI.getInstrInfo(); for (auto &MBB : F) { Changed |= runOnBasicBlock(MBB); diff --git a/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp b/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp index cd67e058a9c1..9e31243cd696 100644 --- a/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp +++ b/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp @@ -398,8 +398,8 @@ bool AArch64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) { TII = mf.getSubtarget().getInstrInfo(); // Just check things on a one-block-at-a-time basis. 
- for (MachineFunction::iterator I = mf.begin(), E = mf.end(); I != E; ++I) - if (processMachineBasicBlock(&*I)) + for (MachineBasicBlock &MBB : mf) + if (processMachineBasicBlock(&MBB)) Changed = true; return Changed; } diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index aeebb49675b2..85a9c04a3fef 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -73,6 +73,7 @@ class AArch64AsmPrinter : public AsmPrinter { StackMaps SM; FaultMaps FM; const AArch64Subtarget *STI; + bool ShouldEmitWeakSwiftAsyncExtendedFramePointerFlags = false; public: AArch64AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) @@ -186,6 +187,10 @@ private: using MInstToMCSymbol = std::map<const MachineInstr *, MCSymbol *>; MInstToMCSymbol LOHInstToLabel; + + bool shouldEmitWeakSwiftAsyncExtendedFramePointerFlags() const override { + return ShouldEmitWeakSwiftAsyncExtendedFramePointerFlags; + } }; } // end anonymous namespace @@ -1132,6 +1137,15 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) { if (emitPseudoExpansionLowering(*OutStreamer, MI)) return; + if (MI->getOpcode() == AArch64::ADRP) { + for (auto &Opd : MI->operands()) { + if (Opd.isSymbol() && StringRef(Opd.getSymbolName()) == + "swift_async_extendedFramePointerFlags") { + ShouldEmitWeakSwiftAsyncExtendedFramePointerFlags = true; + } + } + } + if (AArch64FI->getLOHRelated().count(MI)) { // Generate a label for LOH related instruction MCSymbol *LOHLabel = createTempSymbol("loh"); diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index d2097f7e6ee3..1994e0eb7fb9 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -196,6 +196,13 @@ def mutate_anyext_to_zext : GICombineRule< (apply [{ applyMutateAnyExtToZExt(*${d}, MRI, B, Observer); }]) >; +def split_store_zero_128 : GICombineRule< + 
(defs root:$d), + (match (wip_match_opcode G_STORE):$d, + [{ return matchSplitStoreZero128(*${d}, MRI); }]), + (apply [{ applySplitStoreZero128(*${d}, MRI, B, Observer); }]) +>; + // Post-legalization combines which should happen at all optimization levels. // (E.g. ones that facilitate matching for the selector) For example, matching // pseudos. @@ -220,6 +227,7 @@ def AArch64PostLegalizerCombinerHelper icmp_to_true_false_known_bits, merge_unmerge, select_combines, fold_merge_to_zext, constant_fold, identity_combines, - ptr_add_immed_chain, overlapping_and]> { + ptr_add_immed_chain, overlapping_and, + split_store_zero_128]> { let DisableRuleOption = "aarch64postlegalizercombiner-disable-rule"; } diff --git a/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp b/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp index d98a5cfd4f50..4f324198f3dc 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp @@ -51,10 +51,9 @@ static bool tryToreplicateChunks(uint64_t UImm, ++Counts[getChunk(UImm, Idx)]; // Traverse the chunks to find one which occurs more than once. 
- for (CountMap::const_iterator Chunk = Counts.begin(), End = Counts.end(); - Chunk != End; ++Chunk) { - const uint64_t ChunkVal = Chunk->first; - const unsigned Count = Chunk->second; + for (const auto &Chunk : Counts) { + const uint64_t ChunkVal = Chunk.first; + const unsigned Count = Chunk.second; uint64_t Encoding = 0; diff --git a/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp b/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp index 209f9f7255a5..793663ef97d7 100644 --- a/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp +++ b/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp @@ -138,8 +138,8 @@ bool FalkorMarkStridedAccesses::run() { bool MadeChange = false; for (Loop *L : LI) - for (auto LIt = df_begin(L), LE = df_end(L); LIt != LE; ++LIt) - MadeChange |= runOnLoop(**LIt); + for (Loop *LIt : depth_first(L)) + MadeChange |= runOnLoop(*LIt); return MadeChange; } @@ -828,10 +828,10 @@ bool FalkorHWPFFix::runOnMachineFunction(MachineFunction &Fn) { Modified = false; for (MachineLoop *I : LI) - for (auto L = df_begin(I), LE = df_end(I); L != LE; ++L) + for (MachineLoop *L : depth_first(I)) // Only process inner-loops if (L->isInnermost()) - runOnLoop(**L, Fn); + runOnLoop(*L, Fn); return Modified; } diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index b630f4f0df5f..638e45b30d99 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -3041,10 +3041,21 @@ static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI, // Create a buffer of SVE objects to allocate and sort it. SmallVector<int, 8> ObjectsToAllocate; + // If we have a stack protector, and we've previously decided that we have SVE + // objects on the stack and thus need it to go in the SVE stack area, then it + // needs to go first. 
+ int StackProtectorFI = -1; + if (MFI.hasStackProtectorIndex()) { + StackProtectorFI = MFI.getStackProtectorIndex(); + if (MFI.getStackID(StackProtectorFI) == TargetStackID::ScalableVector) + ObjectsToAllocate.push_back(StackProtectorFI); + } for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) { unsigned StackID = MFI.getStackID(I); if (StackID != TargetStackID::ScalableVector) continue; + if (I == StackProtectorFI) + continue; if (MaxCSFrameIndex >= I && I >= MinCSFrameIndex) continue; if (MFI.isDeadObjectIndex(I)) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 72461aa1f772..e141179fb5c8 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -33,6 +33,7 @@ #include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -204,6 +205,8 @@ static bool isMergePassthruOpcode(unsigned Opc) { return false; case AArch64ISD::BITREVERSE_MERGE_PASSTHRU: case AArch64ISD::BSWAP_MERGE_PASSTHRU: + case AArch64ISD::REVH_MERGE_PASSTHRU: + case AArch64ISD::REVW_MERGE_PASSTHRU: case AArch64ISD::CTLZ_MERGE_PASSTHRU: case AArch64ISD::CTPOP_MERGE_PASSTHRU: case AArch64ISD::DUP_MERGE_PASSTHRU: @@ -2227,6 +2230,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::STNP) MAKE_CASE(AArch64ISD::BITREVERSE_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::BSWAP_MERGE_PASSTHRU) + MAKE_CASE(AArch64ISD::REVH_MERGE_PASSTHRU) + MAKE_CASE(AArch64ISD::REVW_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::CTLZ_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::CTPOP_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU) @@ -4213,6 +4218,12 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case 
Intrinsic::aarch64_sve_revb: return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); + case Intrinsic::aarch64_sve_revh: + return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, dl, Op.getValueType(), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); + case Intrinsic::aarch64_sve_revw: + return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, dl, Op.getValueType(), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); case Intrinsic::aarch64_sve_sxtb: return DAG.getNode( AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), @@ -10958,16 +10969,15 @@ SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, EVT InVT = Op.getOperand(1).getValueType(); unsigned Idx = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); - if (InVT.isScalableVector()) { - SDLoc DL(Op); - EVT VT = Op.getValueType(); + SDValue Vec0 = Op.getOperand(0); + SDValue Vec1 = Op.getOperand(1); + SDLoc DL(Op); + EVT VT = Op.getValueType(); + if (InVT.isScalableVector()) { if (!isTypeLegal(VT)) return SDValue(); - SDValue Vec0 = Op.getOperand(0); - SDValue Vec1 = Op.getOperand(1); - // Ensure the subvector is half the size of the main vector. if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2)) return SDValue(); @@ -10997,9 +11007,18 @@ SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, return SDValue(); } - // This will be matched by custom code during ISelDAGToDAG. - if (Idx == 0 && isPackedVectorType(InVT, DAG) && Op.getOperand(0).isUndef()) - return Op; + if (Idx == 0 && isPackedVectorType(VT, DAG)) { + // This will be matched by custom code during ISelDAGToDAG. 
+ if (Vec0.isUndef()) + return Op; + + unsigned int PredPattern = + getSVEPredPatternFromNumElements(InVT.getVectorNumElements()); + auto PredTy = VT.changeVectorElementType(MVT::i1); + SDValue PTrue = getPTrue(DAG, DL, PredTy, PredPattern); + SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1); + return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0); + } return SDValue(); } @@ -11794,6 +11813,9 @@ bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load, Base.getOperand(1).getOpcode() == ISD::SHL && Base.getOperand(1).hasOneUse() && Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) { + // It's unknown whether a scalable vector has a power-of-2 bitwidth. + if (Mem->getMemoryVT().isScalableVector()) + return false; // The shift can be combined if it matches the size of the value being // loaded (and so reducing the width would make it not match). uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1); @@ -15820,6 +15842,23 @@ static SDValue performVectorShiftCombine(SDNode *N, return SDValue(); } +static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG) { + // sunpklo(sext(pred)) -> sext(extract_low_half(pred)) + // This transform works in partnership with performSetCCPunpkCombine to + // remove unnecessary transfer of predicates into standard registers and back + if (N->getOperand(0).getOpcode() == ISD::SIGN_EXTEND && + N->getOperand(0)->getOperand(0)->getValueType(0).getScalarType() == + MVT::i1) { + SDValue CC = N->getOperand(0)->getOperand(0); + auto VT = CC->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext()); + SDValue Unpk = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, CC, + DAG.getVectorIdxConstant(0, SDLoc(N))); + return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), N->getValueType(0), Unpk); + } + + return SDValue(); +} + /// Target-specific DAG combine function for post-increment LD1 (lane) and /// post-increment LD1R. 
static SDValue performPostLD1Combine(SDNode *N, @@ -15982,7 +16021,9 @@ static SDValue performSTORECombine(SDNode *N, if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND && Value.getNode()->hasOneUse() && ST->isUnindexed() && Subtarget->useSVEForFixedLengthVectors() && - Value.getValueType().isFixedLengthVector()) + Value.getValueType().isFixedLengthVector() && + Value.getValueType().getFixedSizeInBits() >= + Subtarget->getMinSVEVectorSizeInBits()) return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0), Ptr, ST->getMemoryVT(), ST->getMemOperand()); @@ -16495,6 +16536,44 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) { + // setcc_merge_zero pred + // (sign_extend (extract_subvector (setcc_merge_zero ... pred ...))), 0, ne + // => extract_subvector (inner setcc_merge_zero) + SDValue Pred = N->getOperand(0); + SDValue LHS = N->getOperand(1); + SDValue RHS = N->getOperand(2); + ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get(); + + if (Cond != ISD::SETNE || !isZerosVector(RHS.getNode()) || + LHS->getOpcode() != ISD::SIGN_EXTEND) + return SDValue(); + + SDValue Extract = LHS->getOperand(0); + if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR || + Extract->getValueType(0) != N->getValueType(0) || + Extract->getConstantOperandVal(1) != 0) + return SDValue(); + + SDValue InnerSetCC = Extract->getOperand(0); + if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO) + return SDValue(); + + // By this point we've effectively got + // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive + // lanes are already zero then the trunc(sext()) sequence is redundant and we + // can operate on A directly. 
+ SDValue InnerPred = InnerSetCC.getOperand(0); + if (Pred.getOpcode() == AArch64ISD::PTRUE && + InnerPred.getOpcode() == AArch64ISD::PTRUE && + Pred.getConstantOperandVal(0) == InnerPred.getConstantOperandVal(0) && + Pred->getConstantOperandVal(0) >= AArch64SVEPredPattern::vl1 && + Pred->getConstantOperandVal(0) <= AArch64SVEPredPattern::vl256) + return Extract; + + return SDValue(); +} + static SDValue performSetccMergeZeroCombine(SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO && "Unexpected opcode!"); @@ -16513,6 +16592,9 @@ static SDValue performSetccMergeZeroCombine(SDNode *N, SelectionDAG &DAG) { LHS->getOperand(0)->getOperand(0) == Pred) return LHS->getOperand(0); + if (SDValue V = performSetCCPunpkCombine(N, DAG)) + return V; + return SDValue(); } @@ -17343,7 +17425,8 @@ SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG, // they can be split down into something legal. if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() && - VT.isFixedLengthVector()) { + VT.isFixedLengthVector() && + VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) { LoadSDNode *LN0 = cast<LoadSDNode>(N0); SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT, LN0->getChain(), LN0->getBasePtr(), @@ -17455,6 +17538,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, case AArch64ISD::VASHR: case AArch64ISD::VLSHR: return performVectorShiftCombine(N, *this, DCI); + case AArch64ISD::SUNPKLO: + return performSunpkloCombine(N, DAG); case ISD::INSERT_VECTOR_ELT: return performInsertVectorEltCombine(N, DCI); case ISD::EXTRACT_VECTOR_ELT: @@ -18570,7 +18655,25 @@ AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const { } void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const { - MF.getFrameInfo().computeMaxCallFrameSize(MF); + MachineFrameInfo &MFI = MF.getFrameInfo(); + // If we have any vulnerable SVE stack 
objects then the stack protector + // needs to be placed at the top of the SVE stack area, as the SVE locals + // are placed above the other locals, so we allocate it as if it were a + // scalable vector. + // FIXME: It may be worthwhile having a specific interface for this rather + // than doing it here in finalizeLowering. + if (MFI.hasStackProtectorIndex()) { + for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) { + if (MFI.getStackID(i) == TargetStackID::ScalableVector && + MFI.getObjectSSPLayout(i) != MachineFrameInfo::SSPLK_None) { + MFI.setStackID(MFI.getStackProtectorIndex(), + TargetStackID::ScalableVector); + MFI.setObjectAlignment(MFI.getStackProtectorIndex(), Align(16)); + break; + } + } + } + MFI.computeMaxCallFrameSize(MF); TargetLoweringBase::finalizeLowering(MF); } @@ -18855,10 +18958,7 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE( SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE( SDValue Op, SelectionDAG &DAG) const { - auto Store = cast<MaskedStoreSDNode>(Op); - - if (Store->isTruncatingStore()) - return SDValue(); + auto *Store = cast<MaskedStoreSDNode>(Op); SDLoc DL(Op); EVT VT = Store->getValue().getValueType(); @@ -19103,7 +19203,7 @@ SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op, if (isMergePassthruOpcode(NewOp)) Operands.push_back(DAG.getUNDEF(VT)); - return DAG.getNode(NewOp, DL, VT, Operands); + return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags()); } // If a fixed length vector operation has no side effects when applied to @@ -19498,6 +19598,94 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE( return convertFromScalableVector(DAG, VT, Op); } + for (unsigned LaneSize : {64U, 32U, 16U}) { + if (isREVMask(ShuffleMask, VT, LaneSize)) { + EVT NewVT = + getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), LaneSize)); + unsigned RevOp; + unsigned EltSz = VT.getScalarSizeInBits(); + if (EltSz == 8) + RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU; + 
else if (EltSz == 16) + RevOp = AArch64ISD::REVH_MERGE_PASSTHRU; + else + RevOp = AArch64ISD::REVW_MERGE_PASSTHRU; + + Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1); + Op = LowerToPredicatedOp(Op, DAG, RevOp); + Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op); + return convertFromScalableVector(DAG, VT, Op); + } + } + + unsigned WhichResult; + if (isZIPMask(ShuffleMask, VT, WhichResult) && WhichResult == 0) + return convertFromScalableVector( + DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op2)); + + if (isTRNMask(ShuffleMask, VT, WhichResult)) { + unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2; + return convertFromScalableVector( + DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2)); + } + + if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0) + return convertFromScalableVector( + DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op1)); + + if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) { + unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2; + return convertFromScalableVector( + DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1)); + } + + // Functions like isZIPMask return true when a ISD::VECTOR_SHUFFLE's mask + // represents the same logical operation as performed by a ZIP instruction. In + // isolation these functions do not mean the ISD::VECTOR_SHUFFLE is exactly + // equivalent to an AArch64 instruction. There's the extra component of + // ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these functions + // only operated on 64/128bit vector types that have a direct mapping to a + // target register and so an exact mapping is implied. + // However, when using SVE for fixed length vectors, most legal vector types + // are actually sub-vectors of a larger SVE register. When mapping + // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider + // how the mask's indices translate. 
Specifically, when the mapping requires + // an exact meaning for a specific vector index (e.g. Index X is the last + // vector element in the register) then such mappings are often only safe when + // the exact SVE register size is know. The main exception to this is when + // indices are logically relative to the first element of either + // ISD::VECTOR_SHUFFLE operand because these relative indices don't change + // when converting from fixed-length to scalable vector types (i.e. the start + // of a fixed length vector is always the start of a scalable vector). + unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits(); + unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits(); + if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) { + if (ShuffleVectorInst::isReverseMask(ShuffleMask) && Op2.isUndef()) { + Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1); + return convertFromScalableVector(DAG, VT, Op); + } + + if (isZIPMask(ShuffleMask, VT, WhichResult) && WhichResult != 0) + return convertFromScalableVector( + DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op2)); + + if (isUZPMask(ShuffleMask, VT, WhichResult)) { + unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; + return convertFromScalableVector( + DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2)); + } + + if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult != 0) + return convertFromScalableVector( + DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op1)); + + if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { + unsigned Opc = (WhichResult == 0) ? 
AArch64ISD::UZP1 : AArch64ISD::UZP2; + return convertFromScalableVector( + DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1)); + } + } + return SDValue(); } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index ea884cdccd28..367ba3039a0c 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -324,6 +324,8 @@ enum NodeType : unsigned { BITREVERSE_MERGE_PASSTHRU, BSWAP_MERGE_PASSTHRU, + REVH_MERGE_PASSTHRU, + REVW_MERGE_PASSTHRU, CTLZ_MERGE_PASSTHRU, CTPOP_MERGE_PASSTHRU, DUP_MERGE_PASSTHRU, diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index cd4bc8a61a8a..f8d492188744 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -387,16 +387,16 @@ def simm7s16 : Operand<i32> { let PrintMethod = "printImmScale<16>"; } -def am_sve_fi : ComplexPattern<i64, 2, "SelectAddrModeFrameIndexSVE", []>; +def am_sve_fi : ComplexPattern<iPTR, 2, "SelectAddrModeFrameIndexSVE", []>; -def am_indexed7s8 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S8", []>; -def am_indexed7s16 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S16", []>; -def am_indexed7s32 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S32", []>; -def am_indexed7s64 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S64", []>; -def am_indexed7s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S128", []>; +def am_indexed7s8 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexed7S8", []>; +def am_indexed7s16 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexed7S16", []>; +def am_indexed7s32 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexed7S32", []>; +def am_indexed7s64 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexed7S64", []>; +def am_indexed7s128 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexed7S128", []>; -def am_indexedu6s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexedU6S128", []>; -def 
am_indexeds9s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexedS9S128", []>; +def am_indexedu6s128 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexedU6S128", []>; +def am_indexeds9s128 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexedS9S128", []>; def UImmS1XForm : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i64); @@ -3177,18 +3177,18 @@ def maski16_or_more : Operand<i32>, // (unsigned immediate) // Indexed for 8-bit registers. offset is in range [0,4095]. -def am_indexed8 : ComplexPattern<i64, 2, "SelectAddrModeIndexed8", []>; -def am_indexed16 : ComplexPattern<i64, 2, "SelectAddrModeIndexed16", []>; -def am_indexed32 : ComplexPattern<i64, 2, "SelectAddrModeIndexed32", []>; -def am_indexed64 : ComplexPattern<i64, 2, "SelectAddrModeIndexed64", []>; -def am_indexed128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed128", []>; +def am_indexed8 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexed8", []>; +def am_indexed16 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexed16", []>; +def am_indexed32 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexed32", []>; +def am_indexed64 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexed64", []>; +def am_indexed128 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexed128", []>; // (unsigned immediate) // Indexed for 8-bit registers. offset is in range [0,63]. 
-def am_indexed8_6b : ComplexPattern<i64, 2, "SelectAddrModeIndexedUImm<1,63>", []>; -def am_indexed16_6b : ComplexPattern<i64, 2, "SelectAddrModeIndexedUImm<2,63>", []>; -def am_indexed32_6b : ComplexPattern<i64, 2, "SelectAddrModeIndexedUImm<4,63>", []>; -def am_indexed64_6b : ComplexPattern<i64, 2, "SelectAddrModeIndexedUImm<8,63>", []>; +def am_indexed8_6b : ComplexPattern<iPTR, 2, "SelectAddrModeIndexedUImm<1,63>", []>; +def am_indexed16_6b : ComplexPattern<iPTR, 2, "SelectAddrModeIndexedUImm<2,63>", []>; +def am_indexed32_6b : ComplexPattern<iPTR, 2, "SelectAddrModeIndexedUImm<4,63>", []>; +def am_indexed64_6b : ComplexPattern<iPTR, 2, "SelectAddrModeIndexedUImm<8,63>", []>; def gi_am_indexed8 : GIComplexOperandMatcher<s64, "selectAddrModeIndexed<8>">, @@ -3358,11 +3358,11 @@ class PrefetchLiteral<bits<2> opc, bit V, string asm, list<dag> pat> // Load/store register offset //--- -def ro_Xindexed8 : ComplexPattern<i64, 4, "SelectAddrModeXRO<8>", []>; -def ro_Xindexed16 : ComplexPattern<i64, 4, "SelectAddrModeXRO<16>", []>; -def ro_Xindexed32 : ComplexPattern<i64, 4, "SelectAddrModeXRO<32>", []>; -def ro_Xindexed64 : ComplexPattern<i64, 4, "SelectAddrModeXRO<64>", []>; -def ro_Xindexed128 : ComplexPattern<i64, 4, "SelectAddrModeXRO<128>", []>; +def ro_Xindexed8 : ComplexPattern<iPTR, 4, "SelectAddrModeXRO<8>", []>; +def ro_Xindexed16 : ComplexPattern<iPTR, 4, "SelectAddrModeXRO<16>", []>; +def ro_Xindexed32 : ComplexPattern<iPTR, 4, "SelectAddrModeXRO<32>", []>; +def ro_Xindexed64 : ComplexPattern<iPTR, 4, "SelectAddrModeXRO<64>", []>; +def ro_Xindexed128 : ComplexPattern<iPTR, 4, "SelectAddrModeXRO<128>", []>; def gi_ro_Xindexed8 : GIComplexOperandMatcher<s64, "selectAddrModeXRO<8>">, @@ -3380,11 +3380,11 @@ def gi_ro_Xindexed128 : GIComplexOperandMatcher<s64, "selectAddrModeXRO<128>">, GIComplexPatternEquiv<ro_Xindexed128>; -def ro_Windexed8 : ComplexPattern<i64, 4, "SelectAddrModeWRO<8>", []>; -def ro_Windexed16 : ComplexPattern<i64, 4, 
"SelectAddrModeWRO<16>", []>; -def ro_Windexed32 : ComplexPattern<i64, 4, "SelectAddrModeWRO<32>", []>; -def ro_Windexed64 : ComplexPattern<i64, 4, "SelectAddrModeWRO<64>", []>; -def ro_Windexed128 : ComplexPattern<i64, 4, "SelectAddrModeWRO<128>", []>; +def ro_Windexed8 : ComplexPattern<iPTR, 4, "SelectAddrModeWRO<8>", []>; +def ro_Windexed16 : ComplexPattern<iPTR, 4, "SelectAddrModeWRO<16>", []>; +def ro_Windexed32 : ComplexPattern<iPTR, 4, "SelectAddrModeWRO<32>", []>; +def ro_Windexed64 : ComplexPattern<iPTR, 4, "SelectAddrModeWRO<64>", []>; +def ro_Windexed128 : ComplexPattern<iPTR, 4, "SelectAddrModeWRO<128>", []>; def gi_ro_Windexed8 : GIComplexOperandMatcher<s64, "selectAddrModeWRO<8>">, @@ -3880,11 +3880,11 @@ multiclass PrefetchRO<bits<2> sz, bit V, bits<2> opc, string asm> { // Load/store unscaled immediate //--- -def am_unscaled8 : ComplexPattern<i64, 2, "SelectAddrModeUnscaled8", []>; -def am_unscaled16 : ComplexPattern<i64, 2, "SelectAddrModeUnscaled16", []>; -def am_unscaled32 : ComplexPattern<i64, 2, "SelectAddrModeUnscaled32", []>; -def am_unscaled64 : ComplexPattern<i64, 2, "SelectAddrModeUnscaled64", []>; -def am_unscaled128 :ComplexPattern<i64, 2, "SelectAddrModeUnscaled128", []>; +def am_unscaled8 : ComplexPattern<iPTR, 2, "SelectAddrModeUnscaled8", []>; +def am_unscaled16 : ComplexPattern<iPTR, 2, "SelectAddrModeUnscaled16", []>; +def am_unscaled32 : ComplexPattern<iPTR, 2, "SelectAddrModeUnscaled32", []>; +def am_unscaled64 : ComplexPattern<iPTR, 2, "SelectAddrModeUnscaled64", []>; +def am_unscaled128 :ComplexPattern<iPTR, 2, "SelectAddrModeUnscaled128", []>; def gi_am_unscaled8 : GIComplexOperandMatcher<s64, "selectAddrModeUnscaled8">, diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index f8f8ee3f1e6c..5fc5e4e5eb35 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -7055,6 +7055,8 @@ bool 
AArch64InstrInfo::isFunctionSafeToOutlineFrom( bool AArch64InstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB, unsigned &Flags) const { + if (!TargetInstrInfo::isMBBSafeToOutlineFrom(MBB, Flags)) + return false; // Check if LR is available through all of the MBB. If it's not, then set // a flag. assert(MBB.getParent()->getRegInfo().tracksLiveness() && diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index decee117d2d5..ebccc07edc7a 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -4174,6 +4174,21 @@ defm CMLT : SIMDCmpTwoVector<0, 0b01010, "cmlt", AArch64cmltz>; defm CNT : SIMDTwoVectorB<0, 0b00, 0b00101, "cnt", ctpop>; defm FABS : SIMDTwoVectorFP<0, 1, 0b01111, "fabs", fabs>; +def : Pat<(v8i8 (AArch64vashr (v8i8 V64:$Rn), (i32 7))), + (CMLTv8i8rz V64:$Rn)>; +def : Pat<(v4i16 (AArch64vashr (v4i16 V64:$Rn), (i32 15))), + (CMLTv4i16rz V64:$Rn)>; +def : Pat<(v2i32 (AArch64vashr (v2i32 V64:$Rn), (i32 31))), + (CMLTv2i32rz V64:$Rn)>; +def : Pat<(v16i8 (AArch64vashr (v16i8 V128:$Rn), (i32 7))), + (CMLTv16i8rz V128:$Rn)>; +def : Pat<(v8i16 (AArch64vashr (v8i16 V128:$Rn), (i32 15))), + (CMLTv8i16rz V128:$Rn)>; +def : Pat<(v4i32 (AArch64vashr (v4i32 V128:$Rn), (i32 31))), + (CMLTv4i32rz V128:$Rn)>; +def : Pat<(v2i64 (AArch64vashr (v2i64 V128:$Rn), (i32 63))), + (CMLTv2i64rz V128:$Rn)>; + defm FCMEQ : SIMDFPCmpTwoVector<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>; defm FCMGE : SIMDFPCmpTwoVector<1, 1, 0b01100, "fcmge", AArch64fcmgez>; defm FCMGT : SIMDFPCmpTwoVector<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>; @@ -4363,6 +4378,32 @@ def : Pat<(v4i16 (trunc (smax (smin (v4i32 V128:$Vn), (v4i32 VImm7FFF)), (v4i32 VImm8000)))), (SQXTNv4i16 V128:$Vn)>; +// concat_vectors(Vd, trunc(smin(smax Vm, -128), 127) ~> SQXTN2(Vd, Vn) +// with reversed min/max +def : Pat<(v16i8 (concat_vectors + (v8i8 V64:$Vd), + (v8i8 (trunc (smin (smax (v8i16 V128:$Vn), (v8i16 VImm80)), + (v8i16 
VImm7F)))))), + (SQXTNv16i8 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>; +def : Pat<(v16i8 (concat_vectors + (v8i8 V64:$Vd), + (v8i8 (trunc (smax (smin (v8i16 V128:$Vn), (v8i16 VImm7F)), + (v8i16 VImm80)))))), + (SQXTNv16i8 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>; + +// concat_vectors(Vd, trunc(smin(smax Vm, -32768), 32767) ~> SQXTN2(Vd, Vn) +// with reversed min/max +def : Pat<(v8i16 (concat_vectors + (v4i16 V64:$Vd), + (v4i16 (trunc (smin (smax (v4i32 V128:$Vn), (v4i32 VImm8000)), + (v4i32 VImm7FFF)))))), + (SQXTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>; +def : Pat<(v8i16 (concat_vectors + (v4i16 V64:$Vd), + (v4i16 (trunc (smax (smin (v4i32 V128:$Vn), (v4i32 VImm7FFF)), + (v4i32 VImm8000)))))), + (SQXTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>; + //===----------------------------------------------------------------------===// // Advanced SIMD three vector instructions. //===----------------------------------------------------------------------===// @@ -4825,6 +4866,9 @@ defm UQXTN : SIMDTwoScalarMixedBHS<1, 0b10100, "uqxtn", int_aarch64_neon_scalar defm USQADD : SIMDTwoScalarBHSDTied< 1, 0b00011, "usqadd", int_aarch64_neon_usqadd>; +def : Pat<(v1i64 (AArch64vashr (v1i64 V64:$Rn), (i32 63))), + (CMLTv1i64rz V64:$Rn)>; + def : Pat<(v1i64 (int_aarch64_neon_fcvtas (v1f64 FPR64:$Rn))), (FCVTASv1i64 FPR64:$Rn)>; def : Pat<(v1i64 (int_aarch64_neon_fcvtau (v1f64 FPR64:$Rn))), @@ -5288,6 +5332,29 @@ defm UZP2 : SIMDZipVector<0b101, "uzp2", AArch64uzp2>; defm ZIP1 : SIMDZipVector<0b011, "zip1", AArch64zip1>; defm ZIP2 : SIMDZipVector<0b111, "zip2", AArch64zip2>; +def : Pat<(v16i8 (concat_vectors (v8i8 (trunc (v8i16 V128:$Vn))), + (v8i8 (trunc (v8i16 V128:$Vm))))), + (UZP1v16i8 V128:$Vn, V128:$Vm)>; +def : Pat<(v8i16 (concat_vectors (v4i16 (trunc (v4i32 V128:$Vn))), + (v4i16 (trunc (v4i32 V128:$Vm))))), + (UZP1v8i16 V128:$Vn, V128:$Vm)>; +def : Pat<(v4i32 (concat_vectors (v2i32 (trunc (v2i64 
V128:$Vn))), + (v2i32 (trunc (v2i64 V128:$Vm))))), + (UZP1v4i32 V128:$Vn, V128:$Vm)>; + +def : Pat<(v16i8 (concat_vectors + (v8i8 (trunc (AArch64vlshr (v8i16 V128:$Vn), (i32 8)))), + (v8i8 (trunc (AArch64vlshr (v8i16 V128:$Vm), (i32 8)))))), + (UZP2v16i8 V128:$Vn, V128:$Vm)>; +def : Pat<(v8i16 (concat_vectors + (v4i16 (trunc (AArch64vlshr (v4i32 V128:$Vn), (i32 16)))), + (v4i16 (trunc (AArch64vlshr (v4i32 V128:$Vm), (i32 16)))))), + (UZP2v8i16 V128:$Vn, V128:$Vm)>; +def : Pat<(v4i32 (concat_vectors + (v2i32 (trunc (AArch64vlshr (v2i64 V128:$Vn), (i32 32)))), + (v2i32 (trunc (AArch64vlshr (v2i64 V128:$Vm), (i32 32)))))), + (UZP2v4i32 V128:$Vn, V128:$Vm)>; + //---------------------------------------------------------------------------- // AdvSIMD TBL/TBX instructions //---------------------------------------------------------------------------- @@ -6536,6 +6603,34 @@ defm USHR : SIMDVectorRShiftBHSD<1, 0b00000, "ushr", AArch64vlshr>; defm USRA : SIMDVectorRShiftBHSDTied<1, 0b00010, "usra", TriOpFrag<(add node:$LHS, (AArch64vlshr node:$MHS, node:$RHS))> >; +// RADDHN patterns for when RSHRN shifts by half the size of the vector element +def : Pat<(v8i8 (int_aarch64_neon_rshrn (v8i16 V128:$Vn), (i32 8))), + (RADDHNv8i16_v8i8 V128:$Vn, (v8i16 (MOVIv2d_ns (i32 0))))>; +def : Pat<(v4i16 (int_aarch64_neon_rshrn (v4i32 V128:$Vn), (i32 16))), + (RADDHNv4i32_v4i16 V128:$Vn, (v4i32 (MOVIv2d_ns (i32 0))))>; +def : Pat<(v2i32 (int_aarch64_neon_rshrn (v2i64 V128:$Vn), (i32 32))), + (RADDHNv2i64_v2i32 V128:$Vn, (v2i64 (MOVIv2d_ns (i32 0))))>; + +// RADDHN2 patterns for when RSHRN shifts by half the size of the vector element +def : Pat<(v16i8 (concat_vectors + (v8i8 V64:$Vd), + (v8i8 (int_aarch64_neon_rshrn (v8i16 V128:$Vn), (i32 8))))), + (RADDHNv8i16_v16i8 + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn, + (v8i16 (MOVIv2d_ns (i32 0))))>; +def : Pat<(v8i16 (concat_vectors + (v4i16 V64:$Vd), + (v4i16 (int_aarch64_neon_rshrn (v4i32 V128:$Vn), (i32 16))))), + 
(RADDHNv4i32_v8i16 + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn, + (v4i32 (MOVIv2d_ns (i32 0))))>; +def : Pat<(v4i32 (concat_vectors + (v2i32 V64:$Vd), + (v2i32 (int_aarch64_neon_rshrn (v2i64 V128:$Vn), (i32 32))))), + (RADDHNv2i64_v4i32 + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn, + (v2i64 (MOVIv2d_ns (i32 0))))>; + // SHRN patterns for when a logical right shift was used instead of arithmetic // (the immediate guarantees no sign bits actually end up in the result so it // doesn't matter). diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 25d53f4ab065..eb55a472a69a 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -136,15 +136,15 @@ def AArch64stnt1_scatter : SDNode<"AArch64ISD::SSTNT1_PRED", SDT_AArch64_SCATTER // // SVE CNT/INC/RDVL -def sve_rdvl_imm : ComplexPattern<i32, 1, "SelectRDVLImm<-32, 31, 16>">; -def sve_cnth_imm : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, 8>">; -def sve_cntw_imm : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, 4>">; -def sve_cntd_imm : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, 2>">; +def sve_rdvl_imm : ComplexPattern<i64, 1, "SelectRDVLImm<-32, 31, 16>">; +def sve_cnth_imm : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, 8>">; +def sve_cntw_imm : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, 4>">; +def sve_cntd_imm : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, 2>">; // SVE DEC -def sve_cnth_imm_neg : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, -8>">; -def sve_cntw_imm_neg : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, -4>">; -def sve_cntd_imm_neg : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, -2>">; +def sve_cnth_imm_neg : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, -8>">; +def sve_cntw_imm_neg : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, -4>">; +def sve_cntd_imm_neg : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, -2>">; def SDT_AArch64Reduce : SDTypeProfile<1, 2, 
[SDTCisVec<1>, SDTCisVec<2>]>; def AArch64faddv_p : SDNode<"AArch64ISD::FADDV_PRED", SDT_AArch64Reduce>; @@ -231,6 +231,8 @@ def AArch64fsqrt_mt : SDNode<"AArch64ISD::FSQRT_MERGE_PASSTHRU", SDT_AArch64Ari def AArch64frecpx_mt : SDNode<"AArch64ISD::FRECPX_MERGE_PASSTHRU", SDT_AArch64Arith>; def AArch64rbit_mt : SDNode<"AArch64ISD::BITREVERSE_MERGE_PASSTHRU", SDT_AArch64Arith>; def AArch64revb_mt : SDNode<"AArch64ISD::BSWAP_MERGE_PASSTHRU", SDT_AArch64Arith>; +def AArch64revh_mt : SDNode<"AArch64ISD::REVH_MERGE_PASSTHRU", SDT_AArch64Arith>; +def AArch64revw_mt : SDNode<"AArch64ISD::REVW_MERGE_PASSTHRU", SDT_AArch64Arith>; // These are like the above but we don't yet have need for ISD nodes. They allow // a single pattern to match intrinsic and ISD operand layouts. @@ -275,6 +277,11 @@ def AArch64mul_p_oneuse : PatFrag<(ops node:$pred, node:$src1, node:$src2), return N->hasOneUse(); }]>; +def AArch64fneg_mt_nsz : PatFrag<(ops node:$pred, node:$op, node:$pt), + (AArch64fneg_mt node:$pred, node:$op, node:$pt), [{ + return N->getFlags().hasNoSignedZeros(); +}]>; + def SDT_AArch64Arith_Unpred : SDTypeProfile<1, 2, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisSameAs<0,1>, SDTCisSameAs<1,2> @@ -536,7 +543,8 @@ let Predicates = [HasSVEorStreamingSVE] in { (!cast<Instruction>("FNMLA_ZPZZZ_UNDEF_"#Suffix) $P, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>; // Zd = -(Za + Zn * Zm) - def : Pat<(AArch64fneg_mt PredTy:$P, (AArch64fma_p PredTy:$P, Ty:$Zn, Ty:$Zm, Ty:$Za), (Ty (undef))), + // (with nsz neg.) 
+ def : Pat<(AArch64fneg_mt_nsz PredTy:$P, (AArch64fma_p PredTy:$P, Ty:$Zn, Ty:$Zm, Ty:$Za), (Ty (undef))), (!cast<Instruction>("FNMLA_ZPZZZ_UNDEF_"#Suffix) $P, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>; // Zda = Zda + Zn * Zm @@ -624,13 +632,13 @@ let Predicates = [HasSVEorStreamingSVE] in { def : Pat<(nxv8bf16 (AArch64dup (bf16 fpimm0))), (DUP_ZI_H 0, 0)>; // Duplicate Int immediate into all vector elements - def : Pat<(nxv16i8 (AArch64dup (i32 (SVE8BitLslImm i32:$a, i32:$b)))), + def : Pat<(nxv16i8 (AArch64dup (i32 (SVE8BitLslImm32 i32:$a, i32:$b)))), (DUP_ZI_B $a, $b)>; - def : Pat<(nxv8i16 (AArch64dup (i32 (SVE8BitLslImm i32:$a, i32:$b)))), + def : Pat<(nxv8i16 (AArch64dup (i32 (SVE8BitLslImm32 i32:$a, i32:$b)))), (DUP_ZI_H $a, $b)>; - def : Pat<(nxv4i32 (AArch64dup (i32 (SVE8BitLslImm i32:$a, i32:$b)))), + def : Pat<(nxv4i32 (AArch64dup (i32 (SVE8BitLslImm32 i32:$a, i32:$b)))), (DUP_ZI_S $a, $b)>; - def : Pat<(nxv2i64 (AArch64dup (i64 (SVE8BitLslImm i32:$a, i32:$b)))), + def : Pat<(nxv2i64 (AArch64dup (i64 (SVE8BitLslImm64 i32:$a, i32:$b)))), (DUP_ZI_D $a, $b)>; // Duplicate immediate FP into all vector elements. 
@@ -674,8 +682,8 @@ let Predicates = [HasSVEorStreamingSVE] in { defm RBIT_ZPmZ : sve_int_perm_rev_rbit<"rbit", AArch64rbit_mt>; defm REVB_ZPmZ : sve_int_perm_rev_revb<"revb", AArch64revb_mt>; - defm REVH_ZPmZ : sve_int_perm_rev_revh<"revh", int_aarch64_sve_revh>; - defm REVW_ZPmZ : sve_int_perm_rev_revw<"revw", int_aarch64_sve_revw>; + defm REVH_ZPmZ : sve_int_perm_rev_revh<"revh", AArch64revh_mt>; + defm REVW_ZPmZ : sve_int_perm_rev_revw<"revw", AArch64revw_mt>; defm REV_PP : sve_int_perm_reverse_p<"rev", vector_reverse>; defm REV_ZZ : sve_int_perm_reverse_z<"rev", vector_reverse>; @@ -2686,13 +2694,13 @@ let Predicates = [HasSVEorStreamingSVE] in { // Splice with lane bigger or equal to 0 def : Pat<(nxv16i8 (vector_splice (nxv16i8 ZPR:$Z1), (nxv16i8 ZPR:$Z2), (i64 (sve_ext_imm_0_255 i32:$index)))), - (EXT_ZZI ZPR:$Z1, ZPR:$Z2, sve_ext_imm_0_255:$index)>; + (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>; def : Pat<(nxv8i16 (vector_splice (nxv8i16 ZPR:$Z1), (nxv8i16 ZPR:$Z2), (i64 (sve_ext_imm_0_127 i32:$index)))), - (EXT_ZZI ZPR:$Z1, ZPR:$Z2, sve_ext_imm_0_127:$index)>; + (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>; def : Pat<(nxv4i32 (vector_splice (nxv4i32 ZPR:$Z1), (nxv4i32 ZPR:$Z2), (i64 (sve_ext_imm_0_63 i32:$index)))), - (EXT_ZZI ZPR:$Z1, ZPR:$Z2, sve_ext_imm_0_63:$index)>; + (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>; def : Pat<(nxv2i64 (vector_splice (nxv2i64 ZPR:$Z1), (nxv2i64 ZPR:$Z2), (i64 (sve_ext_imm_0_31 i32:$index)))), - (EXT_ZZI ZPR:$Z1, ZPR:$Z2, sve_ext_imm_0_31:$index)>; + (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>; } // End HasSVEorStreamingSVE diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp index 5cec4cb66339..566c7a16db23 100644 --- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp +++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp @@ -488,7 +488,7 @@ Instruction *AArch64StackTagging::insertBaseTaggedPointer( void AArch64StackTagging::alignAndPadAlloca(AllocaInfo &Info) { 
const Align NewAlignment = - max(MaybeAlign(Info.AI->getAlignment()), kTagGranuleSize); + max(MaybeAlign(Info.AI->getAlign()), kTagGranuleSize); Info.AI->setAlignment(NewAlignment); uint64_t Size = Info.AI->getAllocationSizeInBits(*DL).getValue() / 8; @@ -537,15 +537,14 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { SmallVector<Instruction *, 4> UnrecognizedLifetimes; for (auto &BB : *F) { - for (BasicBlock::iterator IT = BB.begin(); IT != BB.end(); ++IT) { - Instruction *I = &*IT; - if (auto *AI = dyn_cast<AllocaInst>(I)) { + for (Instruction &I : BB) { + if (auto *AI = dyn_cast<AllocaInst>(&I)) { Allocas[AI].AI = AI; Allocas[AI].OldAI = AI; continue; } - if (auto *DVI = dyn_cast<DbgVariableIntrinsic>(I)) { + if (auto *DVI = dyn_cast<DbgVariableIntrinsic>(&I)) { for (Value *V : DVI->location_ops()) if (auto *AI = dyn_cast_or_null<AllocaInst>(V)) if (Allocas[AI].DbgVariableIntrinsics.empty() || @@ -554,12 +553,12 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { continue; } - auto *II = dyn_cast<IntrinsicInst>(I); + auto *II = dyn_cast<IntrinsicInst>(&I); if (II && (II->getIntrinsicID() == Intrinsic::lifetime_start || II->getIntrinsicID() == Intrinsic::lifetime_end)) { AllocaInst *AI = findAllocaForValue(II->getArgOperand(1)); if (!AI) { - UnrecognizedLifetimes.push_back(I); + UnrecognizedLifetimes.push_back(&I); continue; } if (II->getIntrinsicID() == Intrinsic::lifetime_start) @@ -568,8 +567,8 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { Allocas[AI].LifetimeEnd.push_back(II); } - if (isa<ReturnInst>(I) || isa<ResumeInst>(I) || isa<CleanupReturnInst>(I)) - RetVec.push_back(I); + if (isa<ReturnInst, ResumeInst, CleanupReturnInst>(&I)) + RetVec.push_back(&I); } } diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index d782d6352cbe..f7d3dd0bc222 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -346,9 +346,7 @@ bool 
AArch64Subtarget::supportsAddressTopByteIgnored() const { return false; if (TargetTriple.isiOS()) { - unsigned Major, Minor, Micro; - TargetTriple.getiOSVersion(Major, Minor, Micro); - return Major >= 8; + return TargetTriple.getiOSVersion() >= VersionTuple(8); } return false; diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index 19db774ccd7b..b3cd5ebd5f65 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -116,6 +116,8 @@ protected: bool HasFP16FML = false; bool HasSPE = false; + bool FixCortexA53_835769 = false; + // ARMv8.1 extensions bool HasVH = false; bool HasPAN = false; @@ -571,6 +573,8 @@ public: bool hasEL2VMSA() const { return HasEL2VMSA; } bool hasEL3() const { return HasEL3; } + bool fixCortexA53_835769() const { return FixCortexA53_835769; } + bool addrSinkUsingGEPs() const override { // Keeping GEPs inbounds is important for exploiting AArch64 // addressing-modes in ILP32 mode. @@ -632,8 +636,7 @@ public: // extended frames should be flagged as present. 
const Triple &TT = getTargetTriple(); - unsigned Major, Minor, Micro; - TT.getOSVersion(Major, Minor, Micro); + unsigned Major = TT.getOSVersion().getMajor(); switch(TT.getOS()) { default: return false; diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index ce26c62af61a..4af28fc070dd 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -117,11 +117,6 @@ static cl::opt<bool> cl::init(true), cl::Hidden); static cl::opt<bool> -EnableA53Fix835769("aarch64-fix-cortex-a53-835769", cl::Hidden, - cl::desc("Work around Cortex-A53 erratum 835769"), - cl::init(false)); - -static cl::opt<bool> EnableGEPOpt("aarch64-enable-gep-opt", cl::Hidden, cl::desc("Enable optimizations on complex GEPs"), cl::init(false)); @@ -382,10 +377,9 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const { unsigned MaxSVEVectorSize = 0; Attribute VScaleRangeAttr = F.getFnAttribute(Attribute::VScaleRange); if (VScaleRangeAttr.isValid()) { - std::tie(MinSVEVectorSize, MaxSVEVectorSize) = - VScaleRangeAttr.getVScaleRangeArgs(); - MinSVEVectorSize *= 128; - MaxSVEVectorSize *= 128; + Optional<unsigned> VScaleMax = VScaleRangeAttr.getVScaleRangeMax(); + MinSVEVectorSize = VScaleRangeAttr.getVScaleRangeMin() * 128; + MaxSVEVectorSize = VScaleMax ? 
VScaleMax.getValue() * 128 : 0; } else { MinSVEVectorSize = SVEVectorBitsMinOpt; MaxSVEVectorSize = SVEVectorBitsMaxOpt; @@ -765,8 +759,7 @@ void AArch64PassConfig::addPreEmitPass() { if (TM->getOptLevel() >= CodeGenOpt::Aggressive && EnableLoadStoreOpt) addPass(createAArch64LoadStoreOptimizationPass()); - if (EnableA53Fix835769) - addPass(createAArch64A53Fix835769()); + addPass(createAArch64A53Fix835769()); if (EnableBranchTargets) addPass(createAArch64BranchTargetsPass()); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 34015d2dbd49..d21854e38f5a 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -30,6 +30,12 @@ using namespace llvm::PatternMatch; static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", cl::init(true), cl::Hidden); +static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10), + cl::Hidden); + +static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead", + cl::init(10), cl::Hidden); + bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, const Function *Callee) const { const TargetMachine &TM = getTLI()->getTargetMachine(); @@ -725,6 +731,22 @@ static Optional<Instruction *> instCombineSVEVectorFMLA(InstCombiner &IC, return IC.replaceInstUsesWith(II, FMLA); } +static bool isAllActivePredicate(Value *Pred) { + // Look through convert.from.svbool(convert.to.svbool(...) chain. + Value *UncastedPred; + if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>( + m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>( + m_Value(UncastedPred))))) + // If the predicate has the same or less lanes than the uncasted + // predicate then we know the casting has no effect. 
+ if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <= + cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements()) + Pred = UncastedPred; + + return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>( + m_ConstantInt<AArch64SVEPredPattern::all>())); +} + static Optional<Instruction *> instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { IRBuilder<> Builder(II.getContext()); @@ -735,8 +757,7 @@ instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { Type *VecTy = II.getType(); Value *VecPtr = Builder.CreateBitCast(PtrOp, VecTy->getPointerTo()); - if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>( - m_ConstantInt<AArch64SVEPredPattern::all>()))) { + if (isAllActivePredicate(Pred)) { LoadInst *Load = Builder.CreateLoad(VecTy, VecPtr); return IC.replaceInstUsesWith(II, Load); } @@ -758,8 +779,7 @@ instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { Value *VecPtr = Builder.CreateBitCast(PtrOp, VecOp->getType()->getPointerTo()); - if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>( - m_ConstantInt<AArch64SVEPredPattern::all>()))) { + if (isAllActivePredicate(Pred)) { Builder.CreateStore(VecOp, VecPtr); return IC.eraseInstFromFunction(II); } @@ -1008,6 +1028,40 @@ static Optional<Instruction *> instCombineST1ScatterIndex(InstCombiner &IC, return None; } +static Optional<Instruction *> instCombineSVESDIV(InstCombiner &IC, + IntrinsicInst &II) { + IRBuilder<> Builder(II.getContext()); + Builder.SetInsertPoint(&II); + Type *Int32Ty = Builder.getInt32Ty(); + Value *Pred = II.getOperand(0); + Value *Vec = II.getOperand(1); + Value *DivVec = II.getOperand(2); + + Value *SplatValue = getSplatValue(DivVec); + ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue); + if (!SplatConstantInt) + return None; + APInt Divisor = SplatConstantInt->getValue(); + + if (Divisor.isPowerOf2()) { + Constant *DivisorLog2 = ConstantInt::get(Int32Ty, 
Divisor.logBase2()); + auto ASRD = Builder.CreateIntrinsic( + Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2}); + return IC.replaceInstUsesWith(II, ASRD); + } + if (Divisor.isNegatedPowerOf2()) { + Divisor.negate(); + Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2()); + auto ASRD = Builder.CreateIntrinsic( + Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2}); + auto NEG = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_neg, + {ASRD->getType()}, {ASRD, Pred, ASRD}); + return IC.replaceInstUsesWith(II, NEG); + } + + return None; +} + Optional<Instruction *> AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { @@ -1068,6 +1122,8 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, return instCombineSVELD1(IC, II, DL); case Intrinsic::aarch64_sve_st1: return instCombineSVEST1(IC, II, DL); + case Intrinsic::aarch64_sve_sdiv: + return instCombineSVESDIV(IC, II); } return None; @@ -1746,7 +1802,7 @@ InstructionCost AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind) { - if (!isa<ScalableVectorType>(Src)) + if (useNeonVector(Src)) return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind); auto LT = TLI->getTypeLegalizationCost(DL, Src); @@ -1763,6 +1819,10 @@ AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, return LT.first * 2; } +static unsigned getSVEGatherScatterOverhead(unsigned Opcode) { + return Opcode == Instruction::Load ? 
SVEGatherOverhead : SVEScatterOverhead; +} + InstructionCost AArch64TTIImpl::getGatherScatterOpCost( unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { @@ -1785,6 +1845,10 @@ InstructionCost AArch64TTIImpl::getGatherScatterOpCost( ElementCount LegalVF = LT.second.getVectorElementCount(); InstructionCost MemOpCost = getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I); + // Add on an overhead cost for using gathers/scatters. + // TODO: At the moment this is applied unilaterally for all CPUs, but at some + // point we may want a per-CPU overhead. + MemOpCost *= getSVEGatherScatterOverhead(Opcode); return LT.first * MemOpCost * getMaxNumElements(LegalVF); } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index d1e8cd204b3a..c3e1735cd4cd 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -309,6 +309,8 @@ public: bool supportsScalableVectors() const { return ST->hasSVE(); } + bool enableScalableVectorization() const { return ST->hasSVE(); } + bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, ElementCount VF) const; diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 6d3aea2721de..62038b10fccd 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -1031,12 +1031,7 @@ public: if (DarwinRefKind != MCSymbolRefExpr::VK_None) return false; - for (unsigned i = 0; i != AllowedModifiers.size(); ++i) { - if (ELFRefKind == AllowedModifiers[i]) - return true; - } - - return false; + return llvm::is_contained(AllowedModifiers, ELFRefKind); } bool isMovWSymbolG3() const { diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp 
b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 1524aa5eb0ec..e8894e7933d6 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -785,6 +785,11 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .libcallFor({s128}) .minScalar(0, MinFPScalar); + // TODO: Vector types. + getActionDefinitionsBuilder({G_FMAXIMUM, G_FMINIMUM}) + .legalFor({MinFPScalar, s32, s64}) + .minScalar(0, MinFPScalar); + // TODO: Libcall support for s128. // TODO: s16 should be legal with full FP16 support. getActionDefinitionsBuilder({G_LROUND, G_LLROUND}) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp index a9b3792e0118..3dec980a819a 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp @@ -289,6 +289,44 @@ static void applyMutateAnyExtToZExt(MachineInstr &MI, MachineRegisterInfo &MRI, Observer.changedInstr(MI); } +/// Match a 128b store of zero and split it into two 64 bit stores, for +/// size/performance reasons. +static bool matchSplitStoreZero128(MachineInstr &MI, MachineRegisterInfo &MRI) { + GStore &Store = cast<GStore>(MI); + if (!Store.isSimple()) + return false; + LLT ValTy = MRI.getType(Store.getValueReg()); + if (!ValTy.isVector() || ValTy.getSizeInBits() != 128) + return false; + if (ValTy.getSizeInBits() != Store.getMemSizeInBits()) + return false; // Don't split truncating stores. 
+ if (!MRI.hasOneNonDBGUse(Store.getValueReg())) + return false; + auto MaybeCst = isConstantOrConstantSplatVector( + *MRI.getVRegDef(Store.getValueReg()), MRI); + return MaybeCst && MaybeCst->isZero(); +} + +static void applySplitStoreZero128(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B, + GISelChangeObserver &Observer) { + B.setInstrAndDebugLoc(MI); + GStore &Store = cast<GStore>(MI); + assert(MRI.getType(Store.getValueReg()).isVector() && + "Expected a vector store value"); + LLT NewTy = LLT::scalar(64); + Register PtrReg = Store.getPointerReg(); + auto Zero = B.buildConstant(NewTy, 0); + auto HighPtr = B.buildPtrAdd(MRI.getType(PtrReg), PtrReg, + B.buildConstant(LLT::scalar(64), 8)); + auto &MF = *MI.getMF(); + auto *LowMMO = MF.getMachineMemOperand(&Store.getMMO(), 0, NewTy); + auto *HighMMO = MF.getMachineMemOperand(&Store.getMMO(), 8, NewTy); + B.buildStore(Zero, PtrReg, *LowMMO); + B.buildStore(Zero, HighPtr, *HighMMO); + Store.eraseFromParent(); +} + #define AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS #include "AArch64GenPostLegalizeGICombiner.inc" #undef AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp index 40ddf6a94f73..515a5c63a559 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp @@ -430,6 +430,8 @@ static bool isPreISelGenericFloatingPointOpcode(unsigned Opc) { case TargetOpcode::G_INTRINSIC_ROUND: case TargetOpcode::G_FMAXNUM: case TargetOpcode::G_FMINNUM: + case TargetOpcode::G_FMAXIMUM: + case TargetOpcode::G_FMINIMUM: return true; } return false; @@ -600,6 +602,8 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case TargetOpcode::G_FSUB: case TargetOpcode::G_FMUL: case TargetOpcode::G_FDIV: + case TargetOpcode::G_FMAXIMUM: + case TargetOpcode::G_FMINIMUM: return 
getSameKindOfOperandsMapping(MI); case TargetOpcode::G_FPEXT: { LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp index 90688f1a3e83..c1186ae804d2 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp @@ -239,8 +239,8 @@ void AArch64_MC::initLLVMToCVRegMapping(MCRegisterInfo *MRI) { {codeview::RegisterId::ARM64_Q31, AArch64::Q31}, }; - for (unsigned I = 0; I < array_lengthof(RegMap); ++I) - MRI->mapLLVMRegToCVReg(RegMap[I].Reg, static_cast<int>(RegMap[I].CVReg)); + for (const auto &I : RegMap) + MRI->mapLLVMRegToCVReg(I.Reg, static_cast<int>(I.CVReg)); } static MCRegisterInfo *createAArch64MCRegisterInfo(const Triple &Triple) { diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 010ffa1502de..bb488cd7da32 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -197,34 +197,42 @@ def addsub_imm8_opt_lsl_i64 : imm8_opt_lsl<64, "uint64_t", SVEAddSubImmOperand64 def SVEAddSubImm8Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i8>", []>; def SVEAddSubImm16Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i16>", []>; def SVEAddSubImm32Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i32>", []>; -def SVEAddSubImm64Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i64>", []>; +def SVEAddSubImm64Pat : ComplexPattern<i64, 2, "SelectSVEAddSubImm<MVT::i64>", []>; -def SVELogicalImm8Pat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i8>", []>; -def SVELogicalImm16Pat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i16>", []>; -def SVELogicalImm32Pat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i32>", []>; +def SVELogicalImm8Pat : ComplexPattern<i32, 1, "SelectSVELogicalImm<MVT::i8>", []>; +def SVELogicalImm16Pat : 
ComplexPattern<i32, 1, "SelectSVELogicalImm<MVT::i16>", []>; +def SVELogicalImm32Pat : ComplexPattern<i32, 1, "SelectSVELogicalImm<MVT::i32>", []>; def SVELogicalImm64Pat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i64>", []>; -def SVELogicalImm8NotPat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i8, true>", []>; -def SVELogicalImm16NotPat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i16, true>", []>; -def SVELogicalImm32NotPat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i32, true>", []>; +def SVELogicalImm8NotPat : ComplexPattern<i32, 1, "SelectSVELogicalImm<MVT::i8, true>", []>; +def SVELogicalImm16NotPat : ComplexPattern<i32, 1, "SelectSVELogicalImm<MVT::i16, true>", []>; +def SVELogicalImm32NotPat : ComplexPattern<i32, 1, "SelectSVELogicalImm<MVT::i32, true>", []>; def SVELogicalImm64NotPat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i64, true>", []>; -def SVE8BitLslImm : ComplexPattern<i32, 2, "SelectSVE8BitLslImm", [imm]>; +def SVE8BitLslImm32 : ComplexPattern<i32, 2, "SelectSVE8BitLslImm", [imm]>; +def SVE8BitLslImm64 : ComplexPattern<i64, 2, "SelectSVE8BitLslImm", [imm]>; +class SVE8BitLslImm<ValueType ty> { + ComplexPattern Pat = !cond( + !eq(ty, i32): SVE8BitLslImm32, + !eq(ty, i64): SVE8BitLslImm64); +} def SVEArithUImm8Pat : ComplexPattern<i32, 1, "SelectSVEArithImm<MVT::i8>", []>; def SVEArithUImm16Pat : ComplexPattern<i32, 1, "SelectSVEArithImm<MVT::i16>", []>; def SVEArithUImm32Pat : ComplexPattern<i32, 1, "SelectSVEArithImm<MVT::i32>", []>; -def SVEArithUImm64Pat : ComplexPattern<i32, 1, "SelectSVEArithImm<MVT::i64>", []>; -def SVEArithSImmPat : ComplexPattern<i32, 1, "SelectSVESignedArithImm", []>; +def SVEArithUImm64Pat : ComplexPattern<i64, 1, "SelectSVEArithImm<MVT::i64>", []>; + +def SVEArithSImmPat32 : ComplexPattern<i32, 1, "SelectSVESignedArithImm", []>; +def SVEArithSImmPat64 : ComplexPattern<i64, 1, "SelectSVESignedArithImm", []>; def SVEShiftImmL8 : ComplexPattern<i32, 1, "SelectSVEShiftImm<0, 7>", []>; 
def SVEShiftImmL16 : ComplexPattern<i32, 1, "SelectSVEShiftImm<0, 15>", []>; def SVEShiftImmL32 : ComplexPattern<i32, 1, "SelectSVEShiftImm<0, 31>", []>; -def SVEShiftImmL64 : ComplexPattern<i32, 1, "SelectSVEShiftImm<0, 63>", []>; +def SVEShiftImmL64 : ComplexPattern<i64, 1, "SelectSVEShiftImm<0, 63>", []>; def SVEShiftImmR8 : ComplexPattern<i32, 1, "SelectSVEShiftImm<1, 8, true>", []>; def SVEShiftImmR16 : ComplexPattern<i32, 1, "SelectSVEShiftImm<1, 16, true>", []>; def SVEShiftImmR32 : ComplexPattern<i32, 1, "SelectSVEShiftImm<1, 32, true>", []>; -def SVEShiftImmR64 : ComplexPattern<i32, 1, "SelectSVEShiftImm<1, 64, true>", []>; +def SVEShiftImmR64 : ComplexPattern<i64, 1, "SelectSVEShiftImm<1, 64, true>", []>; def SVEAllActive : ComplexPattern<untyped, 0, "SelectAllActivePredicate", []>; @@ -260,14 +268,14 @@ def sve_incdec_imm : Operand<i32>, TImmLeaf<i32, [{ } // This allows i32 immediate extraction from i64 based arithmetic. -def sve_cnt_mul_imm : ComplexPattern<i32, 1, "SelectCntImm<1, 16, 1, false>">; -def sve_cnt_shl_imm : ComplexPattern<i32, 1, "SelectCntImm<1, 16, 1, true>">; - +def sve_cnt_mul_imm_i32 : ComplexPattern<i32, 1, "SelectCntImm<1, 16, 1, false>">; +def sve_cnt_mul_imm_i64 : ComplexPattern<i64, 1, "SelectCntImm<1, 16, 1, false>">; +def sve_cnt_shl_imm : ComplexPattern<i64, 1, "SelectCntImm<1, 16, 1, true>">; -def sve_ext_imm_0_31 : ComplexPattern<i32, 1, "SelectEXTImm<31, 8>">; -def sve_ext_imm_0_63 : ComplexPattern<i32, 1, "SelectEXTImm<63, 4>">; -def sve_ext_imm_0_127 : ComplexPattern<i32, 1, "SelectEXTImm<127, 2>">; -def sve_ext_imm_0_255 : ComplexPattern<i32, 1, "SelectEXTImm<255, 1>">; +def sve_ext_imm_0_31 : ComplexPattern<i64, 1, "SelectEXTImm<31, 8>">; +def sve_ext_imm_0_63 : ComplexPattern<i64, 1, "SelectEXTImm<63, 4>">; +def sve_ext_imm_0_127 : ComplexPattern<i64, 1, "SelectEXTImm<127, 2>">; +def sve_ext_imm_0_255 : ComplexPattern<i64, 1, "SelectEXTImm<255, 1>">; def int_aarch64_sve_cntp_oneuse : PatFrag<(ops node:$pred, 
node:$src2), (int_aarch64_sve_cntp node:$pred, node:$src2), [{ @@ -435,8 +443,8 @@ class SVE_4_Op_Imm_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1, : Pat<(vtd (op vt1:$Op1, vt2:$Op2, vt3:$Op3, (vt4 ImmTy:$Op4))), (inst $Op1, $Op2, $Op3, ImmTy:$Op4)>; -def SVEDup0 : ComplexPattern<i64, 0, "SelectDupZero", []>; -def SVEDup0Undef : ComplexPattern<i64, 0, "SelectDupZeroOrUndef", []>; +def SVEDup0 : ComplexPattern<vAny, 0, "SelectDupZero", []>; +def SVEDup0Undef : ComplexPattern<vAny, 0, "SelectDupZeroOrUndef", []>; let AddedComplexity = 1 in { class SVE_3_Op_Pat_SelZero<ValueType vtd, SDPatternOperator op, ValueType vt1, @@ -868,10 +876,10 @@ multiclass sve_int_count<bits<3> opc, string asm, SDPatternOperator op> { def : InstAlias<asm # "\t$Rd", (!cast<Instruction>(NAME) GPR64:$Rd, 0b11111, 1), 2>; - def : Pat<(i64 (mul (op sve_pred_enum:$pattern), (sve_cnt_mul_imm i32:$imm))), + def : Pat<(i64 (mul (op sve_pred_enum:$pattern), (sve_cnt_mul_imm_i64 i32:$imm))), (!cast<Instruction>(NAME) sve_pred_enum:$pattern, sve_incdec_imm:$imm)>; - def : Pat<(i64 (shl (op sve_pred_enum:$pattern), (i64 (sve_cnt_shl_imm i32:$imm)))), + def : Pat<(i64 (shl (op sve_pred_enum:$pattern), (sve_cnt_shl_imm i32:$imm))), (!cast<Instruction>(NAME) sve_pred_enum:$pattern, sve_incdec_imm:$imm)>; def : Pat<(i64 (op sve_pred_enum:$pattern)), @@ -951,10 +959,10 @@ multiclass sve_int_pred_pattern_a<bits<3> opc, string asm, def : Pat<(i64 (op GPR64:$Rdn, (opcnt sve_pred_enum:$pattern))), (!cast<Instruction>(NAME) GPR64:$Rdn, sve_pred_enum:$pattern, 1)>; - def : Pat<(i64 (op GPR64:$Rdn, (mul (opcnt sve_pred_enum:$pattern), (sve_cnt_mul_imm i32:$imm)))), + def : Pat<(i64 (op GPR64:$Rdn, (mul (opcnt sve_pred_enum:$pattern), (sve_cnt_mul_imm_i64 i32:$imm)))), (!cast<Instruction>(NAME) GPR64:$Rdn, sve_pred_enum:$pattern, $imm)>; - def : Pat<(i64 (op GPR64:$Rdn, (shl (opcnt sve_pred_enum:$pattern), (i64 (sve_cnt_shl_imm i32:$imm))))), + def : Pat<(i64 (op GPR64:$Rdn, (shl (opcnt 
sve_pred_enum:$pattern), (sve_cnt_shl_imm i32:$imm)))), (!cast<Instruction>(NAME) GPR64:$Rdn, sve_pred_enum:$pattern, $imm)>; def : Pat<(i32 (op GPR32:$Rdn, (i32 (trunc (opcnt (sve_pred_enum:$pattern)))))), @@ -962,12 +970,12 @@ multiclass sve_int_pred_pattern_a<bits<3> opc, string asm, GPR32:$Rdn, sub_32), sve_pred_enum:$pattern, 1), sub_32))>; - def : Pat<(i32 (op GPR32:$Rdn, (mul (i32 (trunc (opcnt (sve_pred_enum:$pattern)))), (sve_cnt_mul_imm i32:$imm)))), + def : Pat<(i32 (op GPR32:$Rdn, (mul (i32 (trunc (opcnt (sve_pred_enum:$pattern)))), (sve_cnt_mul_imm_i32 i32:$imm)))), (i32 (EXTRACT_SUBREG (!cast<Instruction>(NAME) (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rdn, sub_32), sve_pred_enum:$pattern, $imm), sub_32))>; - def : Pat<(i32 (op GPR32:$Rdn, (shl (i32 (trunc (opcnt (sve_pred_enum:$pattern)))), (i64 (sve_cnt_shl_imm i32:$imm))))), + def : Pat<(i32 (op GPR32:$Rdn, (shl (i32 (trunc (opcnt (sve_pred_enum:$pattern)))), (sve_cnt_shl_imm i32:$imm)))), (i32 (EXTRACT_SUBREG (!cast<Instruction>(NAME) (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rdn, sub_32), sve_pred_enum:$pattern, $imm), sub_32))>; @@ -4324,10 +4332,10 @@ multiclass sve_int_arith_imm1<bits<2> opc, string asm, SDPatternOperator op> { def _S : sve_int_arith_imm<0b10, { 0b1010, opc }, asm, ZPR32, simm8>; def _D : sve_int_arith_imm<0b11, { 0b1010, opc }, asm, ZPR64, simm8>; - def : SVE_1_Op_Imm_Arith_All_Active<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _B)>; - def : SVE_1_Op_Imm_Arith_All_Active<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _H)>; - def : SVE_1_Op_Imm_Arith_All_Active<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _S)>; - def : SVE_1_Op_Imm_Arith_All_Active<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithSImmPat, !cast<Instruction>(NAME # _D)>; + def : SVE_1_Op_Imm_Arith_All_Active<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithSImmPat32, !cast<Instruction>(NAME # _B)>; + def : 
SVE_1_Op_Imm_Arith_All_Active<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithSImmPat32, !cast<Instruction>(NAME # _H)>; + def : SVE_1_Op_Imm_Arith_All_Active<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithSImmPat32, !cast<Instruction>(NAME # _S)>; + def : SVE_1_Op_Imm_Arith_All_Active<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithSImmPat64, !cast<Instruction>(NAME # _D)>; } multiclass sve_int_arith_imm1_unsigned<bits<2> opc, string asm, SDPatternOperator op> { @@ -4348,10 +4356,10 @@ multiclass sve_int_arith_imm2<string asm, SDPatternOperator op> { def _S : sve_int_arith_imm<0b10, 0b110000, asm, ZPR32, simm8>; def _D : sve_int_arith_imm<0b11, 0b110000, asm, ZPR64, simm8>; - def : SVE_1_Op_Imm_Arith_All_Active<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _B)>; - def : SVE_1_Op_Imm_Arith_All_Active<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _H)>; - def : SVE_1_Op_Imm_Arith_All_Active<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _S)>; - def : SVE_1_Op_Imm_Arith_All_Active<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithSImmPat, !cast<Instruction>(NAME # _D)>; + def : SVE_1_Op_Imm_Arith_All_Active<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithSImmPat32, !cast<Instruction>(NAME # _B)>; + def : SVE_1_Op_Imm_Arith_All_Active<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithSImmPat32, !cast<Instruction>(NAME # _H)>; + def : SVE_1_Op_Imm_Arith_All_Active<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithSImmPat32, !cast<Instruction>(NAME # _S)>; + def : SVE_1_Op_Imm_Arith_All_Active<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithSImmPat64, !cast<Instruction>(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -4542,7 +4550,7 @@ multiclass sve_int_dup_imm_pred_merge_inst< (!cast<Instruction>(NAME) zprty:$Zd, PPRAny:$Pg, cpyimm:$imm), 1>; def : Pat<(intty (vselect predty:$Pg, - (intty (AArch64dup (scalarty (SVE8BitLslImm i32:$imm, i32:$shift)))), + (intty (AArch64dup (scalarty 
(SVE8BitLslImm<scalarty>.Pat i32:$imm, i32:$shift)))), intty:$Zd)), (!cast<Instruction>(NAME) zprty:$Zd, $Pg, i32:$imm, i32:$shift)>; } @@ -4580,7 +4588,7 @@ multiclass sve_int_dup_imm_pred_zero_inst< (!cast<Instruction>(NAME) PPRAny:$Ps1, 1, 0)>; def : Pat<(intty (vselect predty:$Pg, - (intty (AArch64dup (scalarty (SVE8BitLslImm i32:$imm, i32:$shift)))), + (intty (AArch64dup (scalarty (SVE8BitLslImm<scalarty>.Pat i32:$imm, i32:$shift)))), (intty (AArch64dup (scalarty 0))))), (!cast<Instruction>(NAME) $Pg, i32:$imm, i32:$shift)>; } @@ -6476,14 +6484,14 @@ multiclass sve_int_perm_rev_revh<string asm, SDPatternOperator op> { def _S : sve_int_perm_rev<0b10, 0b01, asm, ZPR32>; def _D : sve_int_perm_rev<0b11, 0b01, asm, ZPR64>; - def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>; - def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>; + def : SVE_1_Op_Passthru_Pat<nxv4i32, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>; + def : SVE_1_Op_Passthru_Pat<nxv2i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>; } multiclass sve_int_perm_rev_revw<string asm, SDPatternOperator op> { def _D : sve_int_perm_rev<0b11, 0b10, asm, ZPR64>; - def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>; + def : SVE_1_Op_Passthru_Pat<nxv2i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>; } class sve_int_perm_cpy_r<bits<2> sz8_64, string asm, ZPRRegOp zprty, @@ -8377,13 +8385,13 @@ multiclass sve_int_perm_bin_perm_128_zz<bits<2> opc, bit P, string asm, SDPatter } /// Addressing modes -def am_sve_indexed_s4 :ComplexPattern<i64, 2, "SelectAddrModeIndexedSVE<-8,7>", [], [SDNPWantRoot]>; -def am_sve_indexed_s6 :ComplexPattern<i64, 2, "SelectAddrModeIndexedSVE<-32,31>", [], [SDNPWantRoot]>; +def am_sve_indexed_s4 :ComplexPattern<iPTR, 2, "SelectAddrModeIndexedSVE<-8,7>", [], [SDNPWantRoot]>; +def am_sve_indexed_s6 :ComplexPattern<iPTR, 2, "SelectAddrModeIndexedSVE<-32,31>", 
[], [SDNPWantRoot]>; -def am_sve_regreg_lsl0 : ComplexPattern<i64, 2, "SelectSVERegRegAddrMode<0>", []>; -def am_sve_regreg_lsl1 : ComplexPattern<i64, 2, "SelectSVERegRegAddrMode<1>", []>; -def am_sve_regreg_lsl2 : ComplexPattern<i64, 2, "SelectSVERegRegAddrMode<2>", []>; -def am_sve_regreg_lsl3 : ComplexPattern<i64, 2, "SelectSVERegRegAddrMode<3>", []>; +def am_sve_regreg_lsl0 : ComplexPattern<iPTR, 2, "SelectSVERegRegAddrMode<0>", []>; +def am_sve_regreg_lsl1 : ComplexPattern<iPTR, 2, "SelectSVERegRegAddrMode<1>", []>; +def am_sve_regreg_lsl2 : ComplexPattern<iPTR, 2, "SelectSVERegRegAddrMode<2>", []>; +def am_sve_regreg_lsl3 : ComplexPattern<iPTR, 2, "SelectSVERegRegAddrMode<3>", []>; // Predicated pseudo floating point two operand instructions. multiclass sve_fp_bin_pred_hfd<SDPatternOperator op> { diff --git a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp index e72dccdc4b78..642080a0d40d 100644 --- a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp +++ b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp @@ -152,7 +152,7 @@ bool SVEIntrinsicOpts::coalescePTrueIntrinsicCalls( // Remove the most encompassing ptrue, as well as any promoted ptrues, leaving // behind only the ptrues to be coalesced. PTrues.remove(MostEncompassingPTrue); - PTrues.remove_if([](auto *PTrue) { return isPTruePromoted(PTrue); }); + PTrues.remove_if(isPTruePromoted); // Hoist MostEncompassingPTrue to the start of the basic block. 
It is always // safe to do this, since ptrue intrinsic calls are guaranteed to have no @@ -287,10 +287,10 @@ bool SVEIntrinsicOpts::optimizePredicateStore(Instruction *I) { if (!Attr.isValid()) return false; - unsigned MinVScale, MaxVScale; - std::tie(MinVScale, MaxVScale) = Attr.getVScaleRangeArgs(); + unsigned MinVScale = Attr.getVScaleRangeMin(); + Optional<unsigned> MaxVScale = Attr.getVScaleRangeMax(); // The transform needs to know the exact runtime length of scalable vectors - if (MinVScale != MaxVScale || MinVScale == 0) + if (!MaxVScale || MinVScale != MaxVScale) return false; auto *PredType = @@ -351,10 +351,10 @@ bool SVEIntrinsicOpts::optimizePredicateLoad(Instruction *I) { if (!Attr.isValid()) return false; - unsigned MinVScale, MaxVScale; - std::tie(MinVScale, MaxVScale) = Attr.getVScaleRangeArgs(); + unsigned MinVScale = Attr.getVScaleRangeMin(); + Optional<unsigned> MaxVScale = Attr.getVScaleRangeMax(); // The transform needs to know the exact runtime length of scalable vectors - if (MinVScale != MaxVScale || MinVScale == 0) + if (!MaxVScale || MinVScale != MaxVScale) return false; auto *PredType = diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp index aab76d27ef11..d28f38e42430 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp @@ -173,14 +173,7 @@ constexpr AMDGPUFunctionArgInfo AMDGPUFunctionArgInfo::fixedABILayout() { const AMDGPUFunctionArgInfo & AMDGPUArgumentUsageInfo::lookupFuncArgInfo(const Function &F) const { auto I = ArgInfoMap.find(&F); - if (I == ArgInfoMap.end()) { - if (AMDGPUTargetMachine::EnableFixedFunctionABI) - return FixedABIFunctionInfo; - - // Without the fixed ABI, we assume no function has special inputs. 
- assert(F.isDeclaration()); - return ExternFunctionInfo; - } - + if (I == ArgInfoMap.end()) + return FixedABIFunctionInfo; return I->second; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index f0aadab3302f..b4ebc7d7d75f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -112,6 +112,17 @@ static bool isDSAddress(const Constant *C) { return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS; } +/// Returns true if the function requires the implicit argument be passed +/// regardless of the function contents. +static bool funcRequiresImplicitArgPtr(const Function &F) { + // Sanitizers require the hostcall buffer passed in the implicit arguments. + return F.hasFnAttribute(Attribute::SanitizeAddress) || + F.hasFnAttribute(Attribute::SanitizeThread) || + F.hasFnAttribute(Attribute::SanitizeMemory) || + F.hasFnAttribute(Attribute::SanitizeHWAddress) || + F.hasFnAttribute(Attribute::SanitizeMemTag); +} + namespace { class AMDGPUInformationCache : public InformationCache { public: @@ -296,7 +307,7 @@ struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize { bool AllCallSitesKnown = true; if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown)) - indicatePessimisticFixpoint(); + return indicatePessimisticFixpoint(); return Change; } @@ -339,7 +350,17 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { void initialize(Attributor &A) override { Function *F = getAssociatedFunction(); + + // If the function requires the implicit arg pointer due to sanitizers, + // assume it's needed even if explicitly marked as not requiring it. 
+ const bool NeedsImplicit = funcRequiresImplicitArgPtr(*F); + if (NeedsImplicit) + removeAssumedBits(IMPLICIT_ARG_PTR); + for (auto Attr : ImplicitAttrs) { + if (NeedsImplicit && Attr.first == IMPLICIT_ARG_PTR) + continue; + if (F->hasFnAttribute(Attr.second)) addKnownBits(Attr.first); } @@ -500,6 +521,9 @@ struct AAAMDFlatWorkGroupSize std::tie(MinGroupSize, MaxGroupSize) = InfoCache.getFlatWorkGroupSizes(*F); intersectKnown( ConstantRange(APInt(32, MinGroupSize), APInt(32, MaxGroupSize + 1))); + + if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) + indicatePessimisticFixpoint(); } ChangeStatus updateImpl(Attributor &A) override { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index 43928d7c2a09..2f1e7823f65c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -652,8 +652,8 @@ bool AMDGPUCallLowering::lowerFormalArguments( ++PSInputNum; if (SkipArg) { - for (int I = 0, E = VRegs[Idx].size(); I != E; ++I) - B.buildUndef(VRegs[Idx][I]); + for (Register R : VRegs[Idx]) + B.buildUndef(R); ++Idx; continue; @@ -715,10 +715,9 @@ bool AMDGPUCallLowering::lowerFormalArguments( if (!MBB.empty()) B.setInstr(*MBB.begin()); - if (!IsEntryFunc) { + if (!IsEntryFunc && !IsGraphics) { // For the fixed ABI, pass workitem IDs in the last argument register. - if (AMDGPUTargetMachine::EnableFixedFunctionABI) - TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info); + TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info); } IncomingValueAssigner Assigner(AssignFn); @@ -731,11 +730,6 @@ bool AMDGPUCallLowering::lowerFormalArguments( uint64_t StackOffset = Assigner.StackOffset; - if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) { - // Special inputs come after user arguments. - TLI.allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info); - } - // Start adding system SGPRs. 
if (IsEntryFunc) { TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsGraphics); @@ -829,9 +823,12 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder, if (IncomingArg) { LI->loadInputValue(InputReg, MIRBuilder, IncomingArg, ArgRC, ArgTy); - } else { - assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); + } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) { LI->getImplicitArgPtr(InputReg, MRI, MIRBuilder); + } else { + // We may have proven the input wasn't needed, although the ABI is + // requiring it. We just need to allocate the register appropriately. + MIRBuilder.buildUndef(InputReg); } if (OutgoingArg->isRegister()) { @@ -1233,8 +1230,7 @@ bool AMDGPUCallLowering::lowerTailCall( // after the ordinary user argument registers. SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs; - if (AMDGPUTargetMachine::EnableFixedFunctionABI && - Info.CallConv != CallingConv::AMDGPU_Gfx) { + if (Info.CallConv != CallingConv::AMDGPU_Gfx) { // With a fixed ABI, allocate fixed registers before user arguments. if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info)) return false; @@ -1300,12 +1296,6 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const SITargetLowering &TLI = *getTLI<SITargetLowering>(); const DataLayout &DL = F.getParent()->getDataLayout(); - if (!AMDGPUTargetMachine::EnableFixedFunctionABI && - Info.CallConv != CallingConv::AMDGPU_Gfx) { - LLVM_DEBUG(dbgs() << "Variable function ABI not implemented\n"); - return false; - } - SmallVector<ArgInfo, 8> OutArgs; for (auto &OrigArg : Info.OrigArgs) splitToValueTypes(OrigArg, OutArgs, DL, Info.CallConv); @@ -1359,8 +1349,7 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // after the ordinary user argument registers. 
SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs; - if (AMDGPUTargetMachine::EnableFixedFunctionABI && - Info.CallConv != CallingConv::AMDGPU_Gfx) { + if (Info.CallConv != CallingConv::AMDGPU_Gfx) { // With a fixed ABI, allocate fixed registers before user arguments. if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info)) return false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index c7c5ff7bcbe7..2415fdfecaae 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -64,6 +64,30 @@ def int_minmax_to_med3 : GICombineRule< [{ return RegBankHelper.matchIntMinMaxToMed3(*${min_or_max}, ${matchinfo}); }]), (apply [{ RegBankHelper.applyMed3(*${min_or_max}, ${matchinfo}); }])>; +def fp_minmax_to_med3 : GICombineRule< + (defs root:$min_or_max, med3_matchdata:$matchinfo), + (match (wip_match_opcode G_FMAXNUM, + G_FMINNUM, + G_FMAXNUM_IEEE, + G_FMINNUM_IEEE):$min_or_max, + [{ return RegBankHelper.matchFPMinMaxToMed3(*${min_or_max}, ${matchinfo}); }]), + (apply [{ RegBankHelper.applyMed3(*${min_or_max}, ${matchinfo}); }])>; + +def fp_minmax_to_clamp : GICombineRule< + (defs root:$min_or_max, register_matchinfo:$matchinfo), + (match (wip_match_opcode G_FMAXNUM, + G_FMINNUM, + G_FMAXNUM_IEEE, + G_FMINNUM_IEEE):$min_or_max, + [{ return RegBankHelper.matchFPMinMaxToClamp(*${min_or_max}, ${matchinfo}); }]), + (apply [{ RegBankHelper.applyClamp(*${min_or_max}, ${matchinfo}); }])>; + +def fmed3_intrinsic_to_clamp : GICombineRule< + (defs root:$fmed3, register_matchinfo:$matchinfo), + (match (wip_match_opcode G_INTRINSIC):$fmed3, + [{ return RegBankHelper.matchFPMed3ToClamp(*${fmed3}, ${matchinfo}); }]), + (apply [{ RegBankHelper.applyClamp(*${fmed3}, ${matchinfo}); }])>; + def remove_fcanonicalize_matchinfo : GIDefMatchData<"Register">; def remove_fcanonicalize : GICombineRule< @@ -102,7 +126,9 @@ def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper< } def 
AMDGPURegBankCombinerHelper : GICombinerHelper< - "AMDGPUGenRegBankCombinerHelper", [zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain]> { + "AMDGPUGenRegBankCombinerHelper", + [zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain, + fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp]> { let DisableRuleOption = "amdgpuregbankcombiner-disable-rule"; let StateClass = "AMDGPURegBankCombinerHelperState"; let AdditionalArguments = []; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp index 301e6f6d6f42..e79ff9b597c9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp @@ -378,5 +378,4 @@ void AMDGPUCombinerHelper::applyFoldableFneg(MachineInstr &MI, } MI.eraseFromParent(); - return; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 12cef2774aaf..7fd94a977be7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -172,6 +172,8 @@ def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE3, AMDGPUcvt_f32_ubyte3>; def : GINodeEquiv<G_AMDGPU_CVT_PK_I16_I32, AMDGPUpk_i16_i32_impl>; def : GINodeEquiv<G_AMDGPU_SMED3, AMDGPUsmed3>; def : GINodeEquiv<G_AMDGPU_UMED3, AMDGPUumed3>; +def : GINodeEquiv<G_AMDGPU_FMED3, AMDGPUfmed3_impl>; +def : GINodeEquiv<G_AMDGPU_CLAMP, AMDGPUclamp>; def : GINodeEquiv<G_AMDGPU_ATOMIC_CMPXCHG, AMDGPUatomic_cmp_swap>; def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD, SIbuffer_load>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index b9c59f4c615a..699c6c479455 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -280,11 +280,12 @@ void MetadataStreamerV2::emitKernelAttrs(const Function &Func) { } } -void MetadataStreamerV2::emitKernelArgs(const Function &Func) { +void 
MetadataStreamerV2::emitKernelArgs(const Function &Func, + const GCNSubtarget &ST) { for (auto &Arg : Func.args()) emitKernelArg(Arg); - emitHiddenKernelArgs(Func); + emitHiddenKernelArgs(Func, ST); } void MetadataStreamerV2::emitKernelArg(const Argument &Arg) { @@ -381,10 +382,9 @@ void MetadataStreamerV2::emitKernelArg(const DataLayout &DL, Type *Ty, } } -void MetadataStreamerV2::emitHiddenKernelArgs(const Function &Func) { - int HiddenArgNumBytes = - getIntegerAttribute(Func, "amdgpu-implicitarg-num-bytes", 0); - +void MetadataStreamerV2::emitHiddenKernelArgs(const Function &Func, + const GCNSubtarget &ST) { + unsigned HiddenArgNumBytes = ST.getImplicitArgNumBytes(Func); if (!HiddenArgNumBytes) return; @@ -465,11 +465,12 @@ void MetadataStreamerV2::emitKernel(const MachineFunction &MF, HSAMetadata.mKernels.push_back(Kernel::Metadata()); auto &Kernel = HSAMetadata.mKernels.back(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); Kernel.mName = std::string(Func.getName()); Kernel.mSymbolName = (Twine(Func.getName()) + Twine("@kd")).str(); emitKernelLanguage(Func); emitKernelAttrs(Func); - emitKernelArgs(Func); + emitKernelArgs(Func, ST); HSAMetadata.mKernels.back().mCodeProps = CodeProps; HSAMetadata.mKernels.back().mDebugProps = DebugProps; } @@ -673,13 +674,14 @@ void MetadataStreamerV3::emitKernelAttrs(const Function &Func, } void MetadataStreamerV3::emitKernelArgs(const Function &Func, + const GCNSubtarget &ST, msgpack::MapDocNode Kern) { unsigned Offset = 0; auto Args = HSAMetadataDoc->getArrayNode(); for (auto &Arg : Func.args()) emitKernelArg(Arg, Offset, Args); - emitHiddenKernelArgs(Func, Offset, Args); + emitHiddenKernelArgs(Func, ST, Offset, Args); Kern[".args"] = Args; } @@ -791,11 +793,10 @@ void MetadataStreamerV3::emitKernelArg( } void MetadataStreamerV3::emitHiddenKernelArgs(const Function &Func, + const GCNSubtarget &ST, unsigned &Offset, msgpack::ArrayDocNode Args) { - int HiddenArgNumBytes = - getIntegerAttribute(Func, 
"amdgpu-implicitarg-num-bytes", 0); - + unsigned HiddenArgNumBytes = ST.getImplicitArgNumBytes(Func); if (!HiddenArgNumBytes) return; @@ -912,6 +913,7 @@ void MetadataStreamerV3::emitKernel(const MachineFunction &MF, const SIProgramInfo &ProgramInfo) { auto &Func = MF.getFunction(); auto Kern = getHSAKernelProps(MF, ProgramInfo); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); assert(Func.getCallingConv() == CallingConv::AMDGPU_KERNEL || Func.getCallingConv() == CallingConv::SPIR_KERNEL); @@ -925,7 +927,7 @@ void MetadataStreamerV3::emitKernel(const MachineFunction &MF, (Twine(Func.getName()) + Twine(".kd")).str(), /*Copy=*/true); emitKernelLanguage(Func, Kern); emitKernelAttrs(Func, Kern); - emitKernelArgs(Func, Kern); + emitKernelArgs(Func, ST, Kern); } Kernels.push_back(Kern); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h index af5dae1cd8c0..54ed0afbba6d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h @@ -30,6 +30,7 @@ class MDNode; class Module; struct SIProgramInfo; class Type; +class GCNSubtarget; namespace AMDGPU { @@ -86,7 +87,8 @@ protected: void emitKernelAttrs(const Function &Func, msgpack::MapDocNode Kern); - void emitKernelArgs(const Function &Func, msgpack::MapDocNode Kern); + void emitKernelArgs(const Function &Func, const GCNSubtarget &ST, + msgpack::MapDocNode Kern); void emitKernelArg(const Argument &Arg, unsigned &Offset, msgpack::ArrayDocNode Args); @@ -98,8 +100,8 @@ protected: StringRef BaseTypeName = "", StringRef AccQual = "", StringRef TypeQual = ""); - void emitHiddenKernelArgs(const Function &Func, unsigned &Offset, - msgpack::ArrayDocNode Args); + void emitHiddenKernelArgs(const Function &Func, const GCNSubtarget &ST, + unsigned &Offset, msgpack::ArrayDocNode Args); msgpack::DocNode &getRootMetadata(StringRef Key) { return HSAMetadataDoc->getRoot().getMap(/*Convert=*/true)[Key]; @@ 
-173,7 +175,7 @@ private: void emitKernelAttrs(const Function &Func); - void emitKernelArgs(const Function &Func); + void emitKernelArgs(const Function &Func, const GCNSubtarget &ST); void emitKernelArg(const Argument &Arg); @@ -183,7 +185,7 @@ private: StringRef BaseTypeName = "", StringRef AccQual = "", StringRef TypeQual = ""); - void emitHiddenKernelArgs(const Function &Func); + void emitHiddenKernelArgs(const Function &Func, const GCNSubtarget &ST); const Metadata &getHSAMetadata() const { return HSAMetadata; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 88b4ec53a2a0..db84b8766924 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -892,6 +892,15 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { } break; } + case Intrinsic::amdgcn_is_shared: + case Intrinsic::amdgcn_is_private: { + if (isa<UndefValue>(II.getArgOperand(0))) + return IC.replaceInstUsesWith(II, UndefValue::get(II.getType())); + + if (isa<ConstantPointerNull>(II.getArgOperand(0))) + return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType())); + break; + } default: { if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 1f898f2ba8b3..5046daaed977 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -533,7 +533,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) .legalFor({S32, S16, V2S16}) .minScalar(0, S16) - .clampMaxNumElements(0, S16, 2) + .clampMaxNumElementsStrict(0, S16, 2) .widenScalarToNextMultipleOf(0, 32) .maxScalar(0, S32) .scalarize(0); @@ -541,7 +541,7 @@ 
AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) .legalFor({S32, S16, V2S16}) // Clamp modifier .minScalarOrElt(0, S16) - .clampMaxNumElements(0, S16, 2) + .clampMaxNumElementsStrict(0, S16, 2) .scalarize(0) .widenScalarToNextPow2(0, 32) .lower(); @@ -712,7 +712,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, } if (ST.hasVOP3PInsts()) - FPOpActions.clampMaxNumElements(0, S16, 2); + FPOpActions.clampMaxNumElementsStrict(0, S16, 2); FPOpActions .scalarize(0) @@ -728,7 +728,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, getActionDefinitionsBuilder({G_FNEG, G_FABS}) .legalFor(FPTypesPK16) - .clampMaxNumElements(0, S16, 2) + .clampMaxNumElementsStrict(0, S16, 2) .scalarize(0) .clampScalar(0, S16, S64); @@ -965,7 +965,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, if (ST.has16BitInsts()) { getActionDefinitionsBuilder(G_BSWAP) .legalFor({S16, S32, V2S16}) - .clampMaxNumElements(0, S16, 2) + .clampMaxNumElementsStrict(0, S16, 2) // FIXME: Fixing non-power-of-2 before clamp is workaround for // narrowScalar limitation. .widenScalarToNextPow2(0) @@ -1052,10 +1052,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // Split vector extloads. 
unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); - unsigned AlignBits = Query.MMODescrs[0].AlignInBits; - - if (MemSize < DstTy.getSizeInBits()) - MemSize = std::max(MemSize, AlignBits); if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) return true; @@ -1077,12 +1073,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, return true; } - if (AlignBits < MemSize) { - const SITargetLowering *TLI = ST.getTargetLowering(); - return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, - Align(AlignBits / 8)); - } - return false; }; @@ -1176,20 +1166,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, if (DstSize > MemSize) return std::make_pair(0, LLT::scalar(MemSize)); - if (!isPowerOf2_32(DstSize)) { - // We're probably decomposing an odd sized store. Try to split - // to the widest type. TODO: Account for alignment. As-is it - // should be OK, since the new parts will be further legalized. - unsigned FloorSize = PowerOf2Floor(DstSize); - return std::make_pair(0, LLT::scalar(FloorSize)); - } - - if (DstSize > 32 && (DstSize % 32 != 0)) { - // FIXME: Need a way to specify non-extload of larger size if - // suitably aligned. - return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); - } - unsigned MaxSize = maxSizeForAddrSpace(ST, PtrTy.getAddressSpace(), Op == G_LOAD); @@ -1257,14 +1233,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, ElementCount::getFixed(FloorSize / EltSize), EltTy)); } - // Need to split because of alignment. - unsigned Align = Query.MMODescrs[0].AlignInBits; - if (EltSize > Align && - (EltSize / Align < DstTy.getNumElements())) { - return std::make_pair( - 0, LLT::fixed_vector(EltSize / Align, EltTy)); - } - // May need relegalization for the scalars. return std::make_pair(0, EltTy); }) @@ -1457,6 +1425,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // FIXME: Doesn't handle extract of illegal sizes. 
getActionDefinitionsBuilder(Op) .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) + .lowerIf([=](const LegalityQuery &Query) { + // Sub-vector(or single element) insert and extract. + // TODO: verify immediate offset here since lower only works with + // whole elements. + const LLT BigTy = Query.Types[BigTyIdx]; + return BigTy.isVector(); + }) // FIXME: Multiples of 16 should not be legal. .legalIf([=](const LegalityQuery &Query) { const LLT BigTy = Query.Types[BigTyIdx]; @@ -1615,7 +1590,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // Prefer to reduce vector widths for 16-bit vectors before lowering, to // get more vector shift opportunities, since we'll get those when // expanded. - .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)); + .clampMaxNumElementsStrict(0, S16, 2); } else if (ST.has16BitInsts()) { SextInReg.lowerFor({{S32}, {S64}, {S16}}); } else { @@ -1637,14 +1612,14 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, getActionDefinitionsBuilder(G_FSHR) .legalFor({{S32, S32}}) .lowerFor({{V2S16, V2S16}}) - .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)) + .clampMaxNumElementsStrict(0, S16, 2) .scalarize(0) .lower(); if (ST.hasVOP3PInsts()) { getActionDefinitionsBuilder(G_FSHL) .lowerFor({{V2S16, V2S16}}) - .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)) + .clampMaxNumElementsStrict(0, S16, 2) .scalarize(0) .lower(); } else { @@ -2567,10 +2542,8 @@ bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, } else { // For cases where the widened type isn't a nice register value, unmerge // from a widened register (e.g. 
<3 x s16> -> <4 x s16>) - B.setInsertPt(B.getMBB(), ++B.getInsertPt()); - WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); - B.setInsertPt(B.getMBB(), MI.getIterator()); - B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); + WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); + B.buildDeleteTrailingVectorElements(ValReg, WideLoad); } } @@ -3843,6 +3816,10 @@ Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, llvm_unreachable("invalid data type"); } + if (StoreVT == LLT::fixed_vector(3, S16)) { + Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg) + .getReg(0); + } return Reg; } @@ -4237,8 +4214,17 @@ static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || (I >= Intr->CoordStart && !IsA16)) { // Handle any gradient or coordinate operands that should not be packed - AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); - PackedAddrs.push_back(AddrReg); + if ((I < Intr->GradientStart) && IsA16 && + (B.getMRI()->getType(AddrReg) == S16)) { + // Special handling of bias when A16 is on. Bias is of type half but + // occupies full 32-bit. + PackedAddrs.push_back( + B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) + .getReg(0)); + } else { + AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); + PackedAddrs.push_back(AddrReg); + } } else { // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, // derivatives dx/dh and dx/dv are packed with undef. @@ -4676,9 +4662,23 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( // Deal with the one annoying legal case. 
const LLT V3S16 = LLT::fixed_vector(3, 16); if (Ty == V3S16) { - padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); - auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs); - B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); + if (IsTFE) { + if (ResultRegs.size() == 1) { + NewResultReg = ResultRegs[0]; + } else if (ResultRegs.size() == 2) { + LLT V4S16 = LLT::fixed_vector(4, 16); + NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0); + } else { + return false; + } + } + + if (MRI->getType(DstReg).getNumElements() < + MRI->getType(NewResultReg).getNumElements()) { + B.buildDeleteTrailingVectorElements(DstReg, NewResultReg); + } else { + B.buildPadVectorWithUndefElements(DstReg, NewResultReg); + } return true; } @@ -4869,8 +4869,8 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, } Ops.push_back(RayExtent); - auto packLanes = [&Ops, &S32, &B] (Register Src) { - auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src); + auto packLanes = [&Ops, &S32, &B](Register Src) { + auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); Ops.push_back(Unmerge.getReg(0)); Ops.push_back(Unmerge.getReg(1)); Ops.push_back(Unmerge.getReg(2)); @@ -4878,8 +4878,8 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, packLanes(RayOrigin); if (IsA16) { - auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir); - auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir); + auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); + auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); Register R1 = MRI.createGenericVirtualRegister(S32); Register R2 = MRI.createGenericVirtualRegister(S32); Register R3 = MRI.createGenericVirtualRegister(S32); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index 12d6d35a6917..6e2b5dc471bc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp 
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -24,13 +24,6 @@ // A possible future refinement is to specialise the structure per-kernel, so // that fields can be elided based on more expensive analysis. // -// NOTE: Since this pass will directly pack LDS (assume large LDS) into a struct -// type which would cause allocating huge memory for struct instance within -// every kernel. Hence, before running this pass, it is advisable to run the -// pass "amdgpu-replace-lds-use-with-pointer" which will replace LDS uses within -// non-kernel functions by pointers and thereby minimizes the unnecessary per -// kernel allocation of LDS memory. -// //===----------------------------------------------------------------------===// #include "AMDGPU.h" @@ -62,6 +55,20 @@ static cl::opt<bool> SuperAlignLDSGlobals( namespace { +SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M) { + SmallPtrSet<GlobalValue *, 32> UsedList; + + SmallVector<GlobalValue *, 32> TmpVec; + collectUsedGlobalVariables(M, TmpVec, true); + UsedList.insert(TmpVec.begin(), TmpVec.end()); + + TmpVec.clear(); + collectUsedGlobalVariables(M, TmpVec, false); + UsedList.insert(TmpVec.begin(), TmpVec.end()); + + return UsedList; +} + class AMDGPULowerModuleLDS : public ModulePass { static void removeFromUsedList(Module &M, StringRef Name, @@ -105,11 +112,9 @@ class AMDGPULowerModuleLDS : public ModulePass { removeFromUsedLists(Module &M, const std::vector<GlobalVariable *> &LocalVars) { SmallPtrSet<Constant *, 32> LocalVarsSet; - for (size_t I = 0; I < LocalVars.size(); I++) { - if (Constant *C = dyn_cast<Constant>(LocalVars[I]->stripPointerCasts())) { + for (GlobalVariable *LocalVar : LocalVars) + if (Constant *C = dyn_cast<Constant>(LocalVar->stripPointerCasts())) LocalVarsSet.insert(C); - } - } removeFromUsedList(M, "llvm.used", LocalVarsSet); removeFromUsedList(M, "llvm.compiler.used", LocalVarsSet); } @@ -158,9 +163,9 @@ public: } bool runOnModule(Module &M) override { - UsedList = 
AMDGPU::getUsedList(M); - - bool Changed = processUsedLDS(M); + UsedList = getUsedList(M); + bool Changed = superAlignLDSGlobals(M); + Changed |= processUsedLDS(M); for (Function &F : M.functions()) { if (F.isDeclaration()) @@ -177,6 +182,50 @@ public: } private: + // Increase the alignment of LDS globals if necessary to maximise the chance + // that we can use aligned LDS instructions to access them. + static bool superAlignLDSGlobals(Module &M) { + const DataLayout &DL = M.getDataLayout(); + bool Changed = false; + if (!SuperAlignLDSGlobals) { + return Changed; + } + + for (auto &GV : M.globals()) { + if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) { + // Only changing alignment of LDS variables + continue; + } + if (!GV.hasInitializer()) { + // cuda/hip extern __shared__ variable, leave alignment alone + continue; + } + + Align Alignment = AMDGPU::getAlign(DL, &GV); + TypeSize GVSize = DL.getTypeAllocSize(GV.getValueType()); + + if (GVSize > 8) { + // We might want to use a b96 or b128 load/store + Alignment = std::max(Alignment, Align(16)); + } else if (GVSize > 4) { + // We might want to use a b64 load/store + Alignment = std::max(Alignment, Align(8)); + } else if (GVSize > 2) { + // We might want to use a b32 load/store + Alignment = std::max(Alignment, Align(4)); + } else if (GVSize > 1) { + // We might want to use a b16 load/store + Alignment = std::max(Alignment, Align(2)); + } + + if (Alignment != AMDGPU::getAlign(DL, &GV)) { + Changed = true; + GV.setAlignment(Alignment); + } + } + return Changed; + } + bool processUsedLDS(Module &M, Function *F = nullptr) { LLVMContext &Ctx = M.getContext(); const DataLayout &DL = M.getDataLayout(); @@ -190,31 +239,6 @@ private: return false; } - // Increase the alignment of LDS globals if necessary to maximise the chance - // that we can use aligned LDS instructions to access them. 
- if (SuperAlignLDSGlobals) { - for (auto *GV : FoundLocalVars) { - Align Alignment = AMDGPU::getAlign(DL, GV); - TypeSize GVSize = DL.getTypeAllocSize(GV->getValueType()); - - if (GVSize > 8) { - // We might want to use a b96 or b128 load/store - Alignment = std::max(Alignment, Align(16)); - } else if (GVSize > 4) { - // We might want to use a b64 load/store - Alignment = std::max(Alignment, Align(8)); - } else if (GVSize > 2) { - // We might want to use a b32 load/store - Alignment = std::max(Alignment, Align(4)); - } else if (GVSize > 1) { - // We might want to use a b16 load/store - Alignment = std::max(Alignment, Align(2)); - } - - GV->setAlignment(Alignment); - } - } - SmallVector<OptimizedStructLayoutField, 8> LayoutFields; LayoutFields.reserve(FoundLocalVars.size()); for (GlobalVariable *GV : FoundLocalVars) { @@ -343,20 +367,14 @@ private: refineUsesAlignmentAndAA(GEP, A, DL, AliasScope, NoAlias); } - // Mark kernels with asm that reads the address of the allocated structure - // This is not necessary for lowering. This lets other passes, specifically - // PromoteAlloca, accurately calculate how much LDS will be used by the - // kernel after lowering. + // This ensures the variable is allocated when called functions access it. + // It also lets other passes, specifically PromoteAlloca, accurately + // calculate how much LDS will be used by the kernel after lowering. 
if (!F) { IRBuilder<> Builder(Ctx); - SmallPtrSet<Function *, 32> Kernels; for (Function &Func : M.functions()) { - if (Func.isDeclaration()) - continue; - - if (AMDGPU::isKernelCC(&Func) && !Kernels.contains(&Func)) { + if (!Func.isDeclaration() && AMDGPU::isKernelCC(&Func)) { markUsedByKernel(Builder, &Func, SGV); - Kernels.insert(&Func); } } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp index 5d4b007f11e6..4e2f98d2a5db 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp @@ -2786,12 +2786,8 @@ AMDGPUMachineCFGStructurizer::initializeSelectRegisters(MRT *MRT, unsigned Selec // Fixme: Move linearization creation to the original spot createLinearizedRegion(Region, SelectOut); - for (auto CI = Region->getChildren()->begin(), - CE = Region->getChildren()->end(); - CI != CE; ++CI) { - InnerSelectOut = - initializeSelectRegisters((*CI), InnerSelectOut, MRI, TII); - } + for (auto *CI : *Region->getChildren()) + InnerSelectOut = initializeSelectRegisters(CI, InnerSelectOut, MRI, TII); MRT->setBBSelectRegIn(InnerSelectOut); return InnerSelectOut; } else { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp index 2aa02299ecdc..8ad344816ad2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp @@ -119,31 +119,27 @@ private: bool isConstantAddr(const Value *V) const; }; -static const Value *getMemoryInstrPtr(const Instruction *Inst) { - if (auto LI = dyn_cast<LoadInst>(Inst)) { - return LI->getPointerOperand(); - } - if (auto SI = dyn_cast<StoreInst>(Inst)) { - return SI->getPointerOperand(); - } - if (auto AI = dyn_cast<AtomicCmpXchgInst>(Inst)) { - return AI->getPointerOperand(); - } - if (auto AI = dyn_cast<AtomicRMWInst>(Inst)) { - return AI->getPointerOperand(); - } - if (auto MI = 
dyn_cast<AnyMemIntrinsic>(Inst)) { - return MI->getRawDest(); - } +static std::pair<const Value *, const Type *> getMemoryInstrPtrAndType( + const Instruction *Inst) { + if (auto LI = dyn_cast<LoadInst>(Inst)) + return {LI->getPointerOperand(), LI->getType()}; + if (auto SI = dyn_cast<StoreInst>(Inst)) + return {SI->getPointerOperand(), SI->getValueOperand()->getType()}; + if (auto AI = dyn_cast<AtomicCmpXchgInst>(Inst)) + return {AI->getPointerOperand(), AI->getCompareOperand()->getType()}; + if (auto AI = dyn_cast<AtomicRMWInst>(Inst)) + return {AI->getPointerOperand(), AI->getValOperand()->getType()}; + if (auto MI = dyn_cast<AnyMemIntrinsic>(Inst)) + return {MI->getRawDest(), Type::getInt8Ty(MI->getContext())}; - return nullptr; + return {nullptr, nullptr}; } bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const { LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n'); SmallSet<const Value *, 32> WorkSet; SmallSet<const Value *, 32> Visited; - if (const Value *MO = getMemoryInstrPtr(Inst)) { + if (const Value *MO = getMemoryInstrPtrAndType(Inst).first) { if (isGlobalAddr(MO)) WorkSet.insert(MO); } @@ -209,10 +205,8 @@ AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) { for (auto &B : F) { LastAccess = MemAccessInfo(); for (auto &I : B) { - if (const Value *Ptr = getMemoryInstrPtr(&I)) { - unsigned Size = divideCeil( - Ptr->getType()->getPointerElementType()->getPrimitiveSizeInBits(), - 32); + if (const Type *Ty = getMemoryInstrPtrAndType(&I).second) { + unsigned Size = divideCeil(Ty->getPrimitiveSizeInBits(), 32); if (isIndirectAccess(&I)) FI.IAMInstCost += Size; if (isLargeStride(&I)) @@ -326,7 +320,7 @@ bool AMDGPUPerfHint::isLargeStride(const Instruction *Inst) { AMDGPUPerfHint::MemAccessInfo AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const { MemAccessInfo MAI; - const Value *MO = getMemoryInstrPtr(Inst); + const Value *MO = getMemoryInstrPtrAndType(Inst).first; LLVM_DEBUG(dbgs() << "[isLargeStride] MO: 
" << *MO << '\n'); // Do not treat local-addr memory access as large stride. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 3ec5dd7e0eff..f9a9fe403ff6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -939,7 +939,7 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) { GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS); GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); - GV->setAlignment(MaybeAlign(I.getAlignment())); + GV->setAlignment(I.getAlign()); Value *TCntY, *TCntZ; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp index 12b5830ef930..3ce67a733c10 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp @@ -16,6 +16,7 @@ #include "AMDGPURegisterBankInfo.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/GlobalISel/Combiner.h" #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" #include "llvm/CodeGen/GlobalISel/CombinerInfo.h" @@ -23,6 +24,7 @@ #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/Target/TargetMachine.h" #define DEBUG_TYPE "amdgpu-regbank-combiner" @@ -36,13 +38,15 @@ protected: MachineRegisterInfo &MRI; const RegisterBankInfo &RBI; const TargetRegisterInfo &TRI; + const SIInstrInfo &TII; CombinerHelper &Helper; public: AMDGPURegBankCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper) : B(B), MF(B.getMF()), MRI(*B.getMRI()), RBI(*MF.getSubtarget().getRegBankInfo()), - TRI(*MF.getSubtarget().getRegisterInfo()), Helper(Helper){}; + TRI(*MF.getSubtarget().getRegisterInfo()), + TII(*MF.getSubtarget<GCNSubtarget>().getInstrInfo()), 
Helper(Helper){}; bool isVgprRegBank(Register Reg); Register getAsVgpr(Register Reg); @@ -63,7 +67,19 @@ public: Register &Val, CstTy &K0, CstTy &K1); bool matchIntMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo); + bool matchFPMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo); + bool matchFPMinMaxToClamp(MachineInstr &MI, Register &Reg); + bool matchFPMed3ToClamp(MachineInstr &MI, Register &Reg); void applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo); + void applyClamp(MachineInstr &MI, Register &Reg); + +private: + AMDGPU::SIModeRegisterDefaults getMode(); + bool getIEEE(); + bool getDX10Clamp(); + bool isFminnumIeee(const MachineInstr &MI); + bool isFCst(MachineInstr *MI); + bool isClampZeroToOne(MachineInstr *K0, MachineInstr *K1); }; bool AMDGPURegBankCombinerHelper::isVgprRegBank(Register Reg) { @@ -98,6 +114,13 @@ AMDGPURegBankCombinerHelper::getMinMaxPair(unsigned Opc) { case AMDGPU::G_UMAX: case AMDGPU::G_UMIN: return {AMDGPU::G_UMIN, AMDGPU::G_UMAX, AMDGPU::G_AMDGPU_UMED3}; + case AMDGPU::G_FMAXNUM: + case AMDGPU::G_FMINNUM: + return {AMDGPU::G_FMINNUM, AMDGPU::G_FMAXNUM, AMDGPU::G_AMDGPU_FMED3}; + case AMDGPU::G_FMAXNUM_IEEE: + case AMDGPU::G_FMINNUM_IEEE: + return {AMDGPU::G_FMINNUM_IEEE, AMDGPU::G_FMAXNUM_IEEE, + AMDGPU::G_AMDGPU_FMED3}; } } @@ -148,6 +171,146 @@ bool AMDGPURegBankCombinerHelper::matchIntMinMaxToMed3( return true; } +// fmed3(NaN, K0, K1) = min(min(NaN, K0), K1) +// ieee = true : min/max(SNaN, K) = QNaN, min/max(QNaN, K) = K +// ieee = false : min/max(NaN, K) = K +// clamp(NaN) = dx10_clamp ? 0.0 : NaN +// Consider values of min(max(Val, K0), K1) and max(min(Val, K1), K0) as input. +// Other operand commutes (see matchMed) give same result since min and max are +// commutative. + +// Try to replace fp min(max(Val, K0), K1) or max(min(Val, K1), K0), KO<=K1 +// with fmed3(Val, K0, K1) or clamp(Val). Clamp requires K0 = 0.0 and K1 = 1.0. 
+// Val = SNaN only for ieee = true +// fmed3(SNaN, K0, K1) = min(min(SNaN, K0), K1) = min(QNaN, K1) = K1 +// min(max(SNaN, K0), K1) = min(QNaN, K1) = K1 +// max(min(SNaN, K1), K0) = max(K1, K0) = K1 +// Val = NaN,ieee = false or Val = QNaN,ieee = true +// fmed3(NaN, K0, K1) = min(min(NaN, K0), K1) = min(K0, K1) = K0 +// min(max(NaN, K0), K1) = min(K0, K1) = K0 (can clamp when dx10_clamp = true) +// max(min(NaN, K1), K0) = max(K1, K0) = K1 != K0 +bool AMDGPURegBankCombinerHelper::matchFPMinMaxToMed3( + MachineInstr &MI, Med3MatchInfo &MatchInfo) { + Register Dst = MI.getOperand(0).getReg(); + LLT Ty = MRI.getType(Dst); + if (Ty != LLT::scalar(16) && Ty != LLT::scalar(32)) + return false; + + auto OpcodeTriple = getMinMaxPair(MI.getOpcode()); + + Register Val; + Optional<FPValueAndVReg> K0, K1; + // Match min(max(Val, K0), K1) or max(min(Val, K1), K0). Then see if K0 <= K1. + if (!matchMed<GFCstAndRegMatch>(MI, MRI, OpcodeTriple, Val, K0, K1)) + return false; + + if (K0->Value > K1->Value) + return false; + + // For IEEE=false perform combine only when it's safe to assume that there are + // no NaN inputs. Most often MI is marked with nnan fast math flag. + // For IEEE=true consider NaN inputs. fmed3(NaN, K0, K1) is equivalent to + // min(min(NaN, K0), K1). Safe to fold for min(max(Val, K0), K1) since inner + // nodes(max/min) have same behavior when one input is NaN and other isn't. + // Don't consider max(min(SNaN, K1), K0) since there is no isKnownNeverQNaN, + // also post-legalizer inputs to min/max are fcanonicalized (never SNaN). + if ((getIEEE() && isFminnumIeee(MI)) || isKnownNeverNaN(Dst, MRI)) { + // Don't fold single use constant that can't be inlined. 
+ if ((!MRI.hasOneNonDBGUse(K0->VReg) || TII.isInlineConstant(K0->Value)) && + (!MRI.hasOneNonDBGUse(K1->VReg) || TII.isInlineConstant(K1->Value))) { + MatchInfo = {OpcodeTriple.Med, Val, K0->VReg, K1->VReg}; + return true; + } + } + + return false; +} + +bool AMDGPURegBankCombinerHelper::matchFPMinMaxToClamp(MachineInstr &MI, + Register &Reg) { + // Clamp is available on all types after regbankselect (f16, f32, f64, v2f16). + auto OpcodeTriple = getMinMaxPair(MI.getOpcode()); + Register Val; + Optional<FPValueAndVReg> K0, K1; + // Match min(max(Val, K0), K1) or max(min(Val, K1), K0). + if (!matchMed<GFCstOrSplatGFCstMatch>(MI, MRI, OpcodeTriple, Val, K0, K1)) + return false; + + if (!K0->Value.isExactlyValue(0.0) || !K1->Value.isExactlyValue(1.0)) + return false; + + // For IEEE=false perform combine only when it's safe to assume that there are + // no NaN inputs. Most often MI is marked with nnan fast math flag. + // For IEEE=true consider NaN inputs. Only min(max(QNaN, 0.0), 1.0) evaluates + // to 0.0 requires dx10_clamp = true. + if ((getIEEE() && getDX10Clamp() && isFminnumIeee(MI) && + isKnownNeverSNaN(Val, MRI)) || + isKnownNeverNaN(MI.getOperand(0).getReg(), MRI)) { + Reg = Val; + return true; + } + + return false; +} + +// Replacing fmed3(NaN, 0.0, 1.0) with clamp. Requires dx10_clamp = true. +// Val = SNaN only for ieee = true. It is important which operand is NaN. 
+// min(min(SNaN, 0.0), 1.0) = min(QNaN, 1.0) = 1.0 +// min(min(SNaN, 1.0), 0.0) = min(QNaN, 0.0) = 0.0 +// min(min(0.0, 1.0), SNaN) = min(0.0, SNaN) = QNaN +// Val = NaN,ieee = false or Val = QNaN,ieee = true +// min(min(NaN, 0.0), 1.0) = min(0.0, 1.0) = 0.0 +// min(min(NaN, 1.0), 0.0) = min(1.0, 0.0) = 0.0 +// min(min(0.0, 1.0), NaN) = min(0.0, NaN) = 0.0 +bool AMDGPURegBankCombinerHelper::matchFPMed3ToClamp(MachineInstr &MI, + Register &Reg) { + if (MI.getIntrinsicID() != Intrinsic::amdgcn_fmed3) + return false; + + // In llvm-ir, clamp is often represented as an intrinsic call to + // @llvm.amdgcn.fmed3.f32(%Val, 0.0, 1.0). Check for other operand orders. + MachineInstr *Src0 = getDefIgnoringCopies(MI.getOperand(2).getReg(), MRI); + MachineInstr *Src1 = getDefIgnoringCopies(MI.getOperand(3).getReg(), MRI); + MachineInstr *Src2 = getDefIgnoringCopies(MI.getOperand(4).getReg(), MRI); + + if (isFCst(Src0) && !isFCst(Src1)) + std::swap(Src0, Src1); + if (isFCst(Src1) && !isFCst(Src2)) + std::swap(Src1, Src2); + if (isFCst(Src0) && !isFCst(Src1)) + std::swap(Src0, Src1); + if (!isClampZeroToOne(Src1, Src2)) + return false; + + Register Val = Src0->getOperand(0).getReg(); + + auto isOp3Zero = [&]() { + MachineInstr *Op3 = getDefIgnoringCopies(MI.getOperand(4).getReg(), MRI); + if (Op3->getOpcode() == TargetOpcode::G_FCONSTANT) + return Op3->getOperand(1).getFPImm()->isExactlyValue(0.0); + return false; + }; + // For IEEE=false perform combine only when it's safe to assume that there are + // no NaN inputs. Most often MI is marked with nnan fast math flag. + // For IEEE=true consider NaN inputs. Requires dx10_clamp = true. Safe to fold + // when Val could be QNaN. If Val can also be SNaN third input should be 0.0. 
+ if (isKnownNeverNaN(MI.getOperand(0).getReg(), MRI) || + (getIEEE() && getDX10Clamp() && + (isKnownNeverSNaN(Val, MRI) || isOp3Zero()))) { + Reg = Val; + return true; + } + + return false; +} + +void AMDGPURegBankCombinerHelper::applyClamp(MachineInstr &MI, Register &Reg) { + B.setInstrAndDebugLoc(MI); + B.buildInstr(AMDGPU::G_AMDGPU_CLAMP, {MI.getOperand(0)}, {Reg}, + MI.getFlags()); + MI.eraseFromParent(); +} + void AMDGPURegBankCombinerHelper::applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) { B.setInstrAndDebugLoc(MI); @@ -158,6 +321,33 @@ void AMDGPURegBankCombinerHelper::applyMed3(MachineInstr &MI, MI.eraseFromParent(); } +AMDGPU::SIModeRegisterDefaults AMDGPURegBankCombinerHelper::getMode() { + return MF.getInfo<SIMachineFunctionInfo>()->getMode(); +} + +bool AMDGPURegBankCombinerHelper::getIEEE() { return getMode().IEEE; } + +bool AMDGPURegBankCombinerHelper::getDX10Clamp() { return getMode().DX10Clamp; } + +bool AMDGPURegBankCombinerHelper::isFminnumIeee(const MachineInstr &MI) { + return MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE; +} + +bool AMDGPURegBankCombinerHelper::isFCst(MachineInstr *MI) { + return MI->getOpcode() == AMDGPU::G_FCONSTANT; +} + +bool AMDGPURegBankCombinerHelper::isClampZeroToOne(MachineInstr *K0, + MachineInstr *K1) { + if (isFCst(K0) && isFCst(K1)) { + const ConstantFP *KO_FPImm = K0->getOperand(1).getFPImm(); + const ConstantFP *K1_FPImm = K1->getOperand(1).getFPImm(); + return (KO_FPImm->isExactlyValue(0.0) && K1_FPImm->isExactlyValue(1.0)) || + (KO_FPImm->isExactlyValue(1.0) && K1_FPImm->isExactlyValue(0.0)); + } + return false; +} + class AMDGPURegBankCombinerHelperState { protected: CombinerHelper &Helper; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 5988403c0a29..c60012bcfe2e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -707,9 +707,6 @@ bool 
AMDGPURegisterBankInfo::executeInWaterfallLoop( iterator_range<MachineBasicBlock::iterator> Range, SmallSet<Register, 4> &SGPROperandRegs, MachineRegisterInfo &MRI) const { - SmallVector<Register, 4> ResultRegs; - SmallVector<Register, 4> InitResultRegs; - SmallVector<Register, 4> PhiRegs; // Track use registers which have already been expanded with a readfirstlane // sequence. This may have multiple uses if moving a sequence. @@ -774,15 +771,6 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( .addReg(NewExec) .addMBB(LoopBB); - for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) { - B.buildInstr(TargetOpcode::G_PHI) - .addDef(std::get<2>(Result)) - .addReg(std::get<0>(Result)) // Initial value / implicit_def - .addMBB(&MBB) - .addReg(std::get<1>(Result)) // Mid-loop value. - .addMBB(LoopBB); - } - const DebugLoc &DL = B.getDL(); MachineInstr &FirstInst = *Range.begin(); @@ -1174,18 +1162,25 @@ bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI, // 96-bit loads are only available for vector loads. We need to split this // into a 64-bit part, and 32 (unless we can widen to a 128-bit load). 
if (MMO->getAlign() < Align(16)) { + MachineFunction *MF = MI.getParent()->getParent(); + ApplyRegBankMapping ApplyBank(*this, MRI, DstBank); + MachineIRBuilder B(MI, ApplyBank); + LegalizerHelper Helper(*MF, ApplyBank, B); LLT Part64, Part32; std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64); - auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0); - auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8); - - auto Undef = B.buildUndef(LoadTy); - auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0); - B.buildInsert(MI.getOperand(0), Ins0, Load1, 64); + if (Helper.reduceLoadStoreWidth(cast<GAnyLoad>(MI), 0, Part64) != + LegalizerHelper::Legalized) + return false; + return true; } else { LLT WiderTy = widen96To128(LoadTy); auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0); - B.buildExtract(MI.getOperand(0), WideLoad, 0); + if (WiderTy.isScalar()) + B.buildTrunc(MI.getOperand(0), WideLoad); + else { + B.buildDeleteTrailingVectorElements(MI.getOperand(0).getReg(), + WideLoad); + } } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp index d55bf3917e9c..2475b44b42a3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp @@ -87,6 +87,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetOperations.h" +#include "llvm/Analysis/CallGraph.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" @@ -110,6 +111,18 @@ using namespace llvm; namespace { +namespace AMDGPU { +/// Collect all the instructions where user \p U belongs to. \p U could be +/// instruction itself or it could be a constant expression which is used within +/// an instruction. If \p CollectKernelInsts is true, collect instructions only +/// from kernels, otherwise collect instructions only from non-kernel functions. 
+DenseMap<Function *, SmallPtrSet<Instruction *, 8>> +getFunctionToInstsMap(User *U, bool CollectKernelInsts); + +SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV); + +} // namespace AMDGPU + class ReplaceLDSUseImpl { Module &M; LLVMContext &Ctx; @@ -127,7 +140,8 @@ class ReplaceLDSUseImpl { // Collect LDS which requires their uses to be replaced by pointer. std::vector<GlobalVariable *> collectLDSRequiringPointerReplace() { // Collect LDS which requires module lowering. - std::vector<GlobalVariable *> LDSGlobals = AMDGPU::findVariablesToLower(M); + std::vector<GlobalVariable *> LDSGlobals = + llvm::AMDGPU::findVariablesToLower(M); // Remove LDS which don't qualify for replacement. llvm::erase_if(LDSGlobals, [&](GlobalVariable *GV) { @@ -172,7 +186,7 @@ class ReplaceLDSUseImpl { AMDGPUAS::LOCAL_ADDRESS); LDSPointer->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); - LDSPointer->setAlignment(AMDGPU::getAlign(DL, LDSPointer)); + LDSPointer->setAlignment(llvm::AMDGPU::getAlign(DL, LDSPointer)); // Mark that an associated LDS pointer is created for LDS. LDSToPointer[GV] = LDSPointer; @@ -245,10 +259,9 @@ class ReplaceLDSUseImpl { auto FunctionToInsts = AMDGPU::getFunctionToInstsMap(U, false /*=CollectKernelInsts*/); - for (auto FI = FunctionToInsts.begin(), FE = FunctionToInsts.end(); - FI != FE; ++FI) { - Function *F = FI->first; - auto &Insts = FI->second; + for (const auto &FunctionToInst : FunctionToInsts) { + Function *F = FunctionToInst.first; + auto &Insts = FunctionToInst.second; for (auto *I : Insts) { // If `U` is a constant expression, then we need to break the // associated instruction into a set of separate instructions by @@ -341,10 +354,9 @@ bool ReplaceLDSUseImpl::replaceLDSUse(GlobalVariable *GV) { // Traverse through each kernel K, check and if required, initialize the // LDS pointer to point to LDS within K. 
- for (auto KI = KernelToCallees.begin(), KE = KernelToCallees.end(); KI != KE; - ++KI) { - Function *K = KI->first; - SmallPtrSet<Function *, 8> Callees = KI->second; + for (const auto &KernelToCallee : KernelToCallees) { + Function *K = KernelToCallee.first; + SmallPtrSet<Function *, 8> Callees = KernelToCallee.second; // Compute reachable and LDS used callees for kernel K. set_intersect(Callees, LDSAccessors); @@ -378,6 +390,184 @@ bool ReplaceLDSUseImpl::replaceLDSUse(GlobalVariable *GV) { return true; } +namespace AMDGPU { + +// An helper class for collecting all reachable callees for each kernel defined +// within the module. +class CollectReachableCallees { + Module &M; + CallGraph CG; + SmallPtrSet<CallGraphNode *, 8> AddressTakenFunctions; + + // Collect all address taken functions within the module. + void collectAddressTakenFunctions() { + auto *ECNode = CG.getExternalCallingNode(); + + for (const auto &GI : *ECNode) { + auto *CGN = GI.second; + auto *F = CGN->getFunction(); + if (!F || F->isDeclaration() || llvm::AMDGPU::isKernelCC(F)) + continue; + AddressTakenFunctions.insert(CGN); + } + } + + // For given kernel, collect all its reachable non-kernel functions. + SmallPtrSet<Function *, 8> collectReachableCallees(Function *K) { + SmallPtrSet<Function *, 8> ReachableCallees; + + // Call graph node which represents this kernel. + auto *KCGN = CG[K]; + + // Go through all call graph nodes reachable from the node representing this + // kernel, visit all their call sites, if the call site is direct, add + // corresponding callee to reachable callee set, if it is indirect, resolve + // the indirect call site to potential reachable callees, add them to + // reachable callee set, and repeat the process for the newly added + // potential callee nodes. + // + // FIXME: Need to handle bit-casted function pointers. 
+ // + SmallVector<CallGraphNode *, 8> CGNStack(depth_first(KCGN)); + SmallPtrSet<CallGraphNode *, 8> VisitedCGNodes; + while (!CGNStack.empty()) { + auto *CGN = CGNStack.pop_back_val(); + + if (!VisitedCGNodes.insert(CGN).second) + continue; + + // Ignore call graph node which does not have associated function or + // associated function is not a definition. + if (!CGN->getFunction() || CGN->getFunction()->isDeclaration()) + continue; + + for (const auto &GI : *CGN) { + auto *RCB = cast<CallBase>(GI.first.getValue()); + auto *RCGN = GI.second; + + if (auto *DCallee = RCGN->getFunction()) { + ReachableCallees.insert(DCallee); + } else if (RCB->isIndirectCall()) { + auto *RCBFTy = RCB->getFunctionType(); + for (auto *ACGN : AddressTakenFunctions) { + auto *ACallee = ACGN->getFunction(); + if (ACallee->getFunctionType() == RCBFTy) { + ReachableCallees.insert(ACallee); + CGNStack.append(df_begin(ACGN), df_end(ACGN)); + } + } + } + } + } + + return ReachableCallees; + } + +public: + explicit CollectReachableCallees(Module &M) : M(M), CG(CallGraph(M)) { + // Collect address taken functions. + collectAddressTakenFunctions(); + } + + void collectReachableCallees( + DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) { + // Collect reachable callee set for each kernel defined in the module. + for (Function &F : M.functions()) { + if (!llvm::AMDGPU::isKernelCC(&F)) + continue; + Function *K = &F; + KernelToCallees[K] = collectReachableCallees(K); + } + } +}; + +/// Collect reachable callees for each kernel defined in the module \p M and +/// return collected callees at \p KernelToCallees. 
+void collectReachableCallees( + Module &M, + DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) { + CollectReachableCallees CRC{M}; + CRC.collectReachableCallees(KernelToCallees); +} + +/// For the given LDS global \p GV, visit all its users and collect all +/// non-kernel functions within which \p GV is used and return collected list of +/// such non-kernel functions. +SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV) { + SmallPtrSet<Function *, 8> LDSAccessors; + SmallVector<User *, 8> UserStack(GV->users()); + SmallPtrSet<User *, 8> VisitedUsers; + + while (!UserStack.empty()) { + auto *U = UserStack.pop_back_val(); + + // `U` is already visited? continue to next one. + if (!VisitedUsers.insert(U).second) + continue; + + // `U` is a global variable which is initialized with LDS. Ignore LDS. + if (isa<GlobalValue>(U)) + return SmallPtrSet<Function *, 8>(); + + // Recursively explore constant users. + if (isa<Constant>(U)) { + append_range(UserStack, U->users()); + continue; + } + + // `U` should be an instruction, if it belongs to a non-kernel function F, + // then collect F. 
+ Function *F = cast<Instruction>(U)->getFunction(); + if (!llvm::AMDGPU::isKernelCC(F)) + LDSAccessors.insert(F); + } + + return LDSAccessors; +} + +DenseMap<Function *, SmallPtrSet<Instruction *, 8>> +getFunctionToInstsMap(User *U, bool CollectKernelInsts) { + DenseMap<Function *, SmallPtrSet<Instruction *, 8>> FunctionToInsts; + SmallVector<User *, 8> UserStack; + SmallPtrSet<User *, 8> VisitedUsers; + + UserStack.push_back(U); + + while (!UserStack.empty()) { + auto *UU = UserStack.pop_back_val(); + + if (!VisitedUsers.insert(UU).second) + continue; + + if (isa<GlobalValue>(UU)) + continue; + + if (isa<Constant>(UU)) { + append_range(UserStack, UU->users()); + continue; + } + + auto *I = cast<Instruction>(UU); + Function *F = I->getFunction(); + if (CollectKernelInsts) { + if (!llvm::AMDGPU::isKernelCC(F)) { + continue; + } + } else { + if (llvm::AMDGPU::isKernelCC(F)) { + continue; + } + } + + FunctionToInsts.insert(std::make_pair(F, SmallPtrSet<Instruction *, 8>())); + FunctionToInsts[F].insert(I); + } + + return FunctionToInsts; +} + +} // namespace AMDGPU + // Entry-point function which interface ReplaceLDSUseImpl with outside of the // class. bool ReplaceLDSUseImpl::replaceLDSUse() { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 0655b4342ba1..cd05797fdbdb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -413,21 +413,21 @@ bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const { case AMDGPU::V_MAX_I16_e32: case AMDGPU::V_MIN_I16_e64: case AMDGPU::V_MIN_I16_e32: + case AMDGPU::V_MAD_F16_e64: + case AMDGPU::V_MAD_U16_e64: + case AMDGPU::V_MAD_I16_e64: + case AMDGPU::V_FMA_F16_e64: + case AMDGPU::V_DIV_FIXUP_F16_e64: // On gfx10, all 16-bit instructions preserve the high bits. 
return getGeneration() <= AMDGPUSubtarget::GFX9; - case AMDGPU::V_MAD_F16_e64: case AMDGPU::V_MADAK_F16: case AMDGPU::V_MADMK_F16: case AMDGPU::V_MAC_F16_e64: case AMDGPU::V_MAC_F16_e32: case AMDGPU::V_FMAMK_F16: case AMDGPU::V_FMAAK_F16: - case AMDGPU::V_MAD_U16_e64: - case AMDGPU::V_MAD_I16_e64: - case AMDGPU::V_FMA_F16_e64: case AMDGPU::V_FMAC_F16_e64: case AMDGPU::V_FMAC_F16_e32: - case AMDGPU::V_DIV_FIXUP_F16_e64: // In gfx9, the preferred handling of the unused high 16-bits changed. Most // instructions maintain the legacy behavior of 0ing. Some instructions // changed to preserving the high bits. @@ -648,9 +648,18 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const { } unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const { + assert(AMDGPU::isKernel(F.getCallingConv())); + + // We don't allocate the segment if we know the implicit arguments weren't + // used, even if the ABI implies we need them. + if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr")) + return 0; + if (isMesaKernel(F)) return 16; - return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0); + + // Assume all implicit inputs are used by default + return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 56); } uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index de11676279f2..a2c61f9da8da 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -231,13 +231,6 @@ static cl::opt<bool, true> LateCFGStructurize( cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG), cl::Hidden); -static cl::opt<bool, true> EnableAMDGPUFixedFunctionABIOpt( - "amdgpu-fixed-function-abi", - cl::desc("Enable all implicit function arguments"), - cl::location(AMDGPUTargetMachine::EnableFixedFunctionABI), - cl::init(false), - cl::Hidden); - // Enable lib calls 
simplifications static cl::opt<bool> EnableLibCallSimplify( "amdgpu-simplify-libcall", @@ -505,7 +498,6 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false; bool AMDGPUTargetMachine::EnableFunctionCalls = false; -bool AMDGPUTargetMachine::EnableFixedFunctionABI = false; bool AMDGPUTargetMachine::EnableLowerModuleLDS = true; AMDGPUTargetMachine::~AMDGPUTargetMachine() = default; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 0ff2db2a52d9..226646a96953 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -37,7 +37,6 @@ protected: public: static bool EnableLateStructurizeCFG; static bool EnableFunctionCalls; - static bool EnableFixedFunctionABI; static bool EnableLowerModuleLDS; AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index ecdbdf613a53..09c5eb192e1f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -519,57 +519,6 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost( TTI::OperandValueProperties Opd1PropInfo, TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args, const Instruction *CxtI) { - EVT OrigTy = TLI->getValueType(DL, Ty); - if (!OrigTy.isSimple()) { - // FIXME: We're having to query the throughput cost so that the basic - // implementation tries to generate legalize and scalarization costs. Maybe - // we could hoist the scalarization code here? - if (CostKind != TTI::TCK_CodeSize) - return BaseT::getArithmeticInstrCost(Opcode, Ty, TTI::TCK_RecipThroughput, - Opd1Info, Opd2Info, Opd1PropInfo, - Opd2PropInfo, Args, CxtI); - // Scalarization - - // Check if any of the operands are vector operands. 
- int ISD = TLI->InstructionOpcodeToISD(Opcode); - assert(ISD && "Invalid opcode"); - - std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); - - bool IsFloat = Ty->isFPOrFPVectorTy(); - // Assume that floating point arithmetic operations cost twice as much as - // integer operations. - unsigned OpCost = (IsFloat ? 2 : 1); - - if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { - // The operation is legal. Assume it costs 1. - // TODO: Once we have extract/insert subvector cost we need to use them. - return LT.first * OpCost; - } - - if (!TLI->isOperationExpand(ISD, LT.second)) { - // If the operation is custom lowered, then assume that the code is twice - // as expensive. - return LT.first * 2 * OpCost; - } - - // Else, assume that we need to scalarize this op. - // TODO: If one of the types get legalized by splitting, handle this - // similarly to what getCastInstrCost() does. - if (auto *VTy = dyn_cast<VectorType>(Ty)) { - unsigned Num = cast<FixedVectorType>(VTy)->getNumElements(); - InstructionCost Cost = getArithmeticInstrCost( - Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info, - Opd1PropInfo, Opd2PropInfo, Args, CxtI); - // Return the cost of multiple scalar invocation plus the cost of - // inserting and extracting the values. - SmallVector<Type *> Tys(Args.size(), Ty); - return getScalarizationOverhead(VTy, Args, Tys) + Num * Cost; - } - - // We don't know anything about this scalar instruction. - return OpCost; - } // Legalize the type. std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); @@ -742,40 +691,6 @@ GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, return BaseT::getIntrinsicInstrCost(ICA, CostKind); Type *RetTy = ICA.getReturnType(); - EVT OrigTy = TLI->getValueType(DL, RetTy); - if (!OrigTy.isSimple()) { - if (CostKind != TTI::TCK_CodeSize) - return BaseT::getIntrinsicInstrCost(ICA, CostKind); - - // TODO: Combine these two logic paths. 
- if (ICA.isTypeBasedOnly()) - return getTypeBasedIntrinsicInstrCost(ICA, CostKind); - - unsigned RetVF = - (RetTy->isVectorTy() ? cast<FixedVectorType>(RetTy)->getNumElements() - : 1); - const IntrinsicInst *I = ICA.getInst(); - const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); - FastMathFlags FMF = ICA.getFlags(); - // Assume that we need to scalarize this intrinsic. - - // Compute the scalarization overhead based on Args for a vector - // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while - // CostModel will pass a vector RetTy and VF is 1. - InstructionCost ScalarizationCost = InstructionCost::getInvalid(); - if (RetVF > 1) { - ScalarizationCost = 0; - if (!RetTy->isVoidTy()) - ScalarizationCost += - getScalarizationOverhead(cast<VectorType>(RetTy), true, false); - ScalarizationCost += - getOperandsScalarizationOverhead(Args, ICA.getArgTypes()); - } - - IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, ICA.getArgTypes(), FMF, I, - ScalarizationCost); - return getIntrinsicInstrCost(Attrs, CostKind); - } // Legalize the type. 
std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy); diff --git a/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp index 712f6dece911..1736c078eb83 100644 --- a/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp @@ -173,10 +173,8 @@ protected: } static void PrintLoopinfo(const MachineLoopInfo &LoopInfo) { - for (MachineLoop::iterator iter = LoopInfo.begin(), - iterEnd = LoopInfo.end(); iter != iterEnd; ++iter) { - (*iter)->print(dbgs()); - } + for (const MachineLoop *L : LoopInfo) + L->print(dbgs()); } // UTILITY FUNCTIONS @@ -691,9 +689,7 @@ bool AMDGPUCFGStructurizer::prepare() { SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> RetBlks; // Add an ExitBlk to loop that don't have one - for (MachineLoopInfo::iterator It = MLI->begin(), - E = MLI->end(); It != E; ++It) { - MachineLoop *LoopRep = (*It); + for (MachineLoop *LoopRep : *MLI) { MBBVector ExitingMBBs; LoopRep->getExitingBlocks(ExitingMBBs); @@ -827,14 +823,13 @@ bool AMDGPUCFGStructurizer::run() { wrapup(*GraphTraits<MachineFunction *>::nodes_begin(FuncRep)); // Detach retired Block, release memory. - for (MBBInfoMap::iterator It = BlockInfoMap.begin(), E = BlockInfoMap.end(); - It != E; ++It) { - if ((*It).second && (*It).second->IsRetired) { - assert(((*It).first)->getNumber() != -1); - LLVM_DEBUG(dbgs() << "Erase BB" << ((*It).first)->getNumber() << "\n";); - (*It).first->eraseFromParent(); //Remove from the parent Function. + for (auto &It : BlockInfoMap) { + if (It.second && It.second->IsRetired) { + assert((It.first)->getNumber() != -1); + LLVM_DEBUG(dbgs() << "Erase BB" << (It.first)->getNumber() << "\n";); + It.first->eraseFromParent(); // Remove from the parent Function. 
} - delete (*It).second; + delete It.second; } BlockInfoMap.clear(); LLInfoMap.clear(); diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 4acd77a9d5d2..2bb59086f391 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -246,8 +246,12 @@ public: return isRegKind() && !hasModifiers(); } + bool isRegOrInline(unsigned RCID, MVT type) const { + return isRegClass(RCID) || isInlinableImm(type); + } + bool isRegOrImmWithInputMods(unsigned RCID, MVT type) const { - return isRegClass(RCID) || isInlinableImm(type) || isLiteralImm(type); + return isRegOrInline(RCID, type) || isLiteralImm(type); } bool isRegOrImmWithInt16InputMods() const { @@ -372,7 +376,7 @@ public: bool isInlineValue() const; bool isRegOrInlineNoMods(unsigned RCID, MVT type) const { - return (isRegClass(RCID) || isInlinableImm(type)) && !hasModifiers(); + return isRegOrInline(RCID, type) && !hasModifiers(); } bool isSCSrcB16() const { diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index d3644db7cf8b..a535c8cc0918 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -6,11 +6,11 @@ // //===----------------------------------------------------------------------===// -def MUBUFAddr64 : ComplexPattern<i64, 4, "SelectMUBUFAddr64">; -def MUBUFOffset : ComplexPattern<i64, 3, "SelectMUBUFOffset">; +def MUBUFAddr64 : ComplexPattern<iPTR, 4, "SelectMUBUFAddr64">; +def MUBUFOffset : ComplexPattern<iPTR, 3, "SelectMUBUFOffset">; -def MUBUFScratchOffen : ComplexPattern<i64, 4, "SelectMUBUFScratchOffen", [], [SDNPWantParent]>; -def MUBUFScratchOffset : ComplexPattern<i64, 3, "SelectMUBUFScratchOffset", [], [SDNPWantParent], 20>; +def MUBUFScratchOffen : ComplexPattern<iPTR, 4, "SelectMUBUFScratchOffen", [], [SDNPWantParent]>; +def MUBUFScratchOffset : ComplexPattern<iPTR, 
3, "SelectMUBUFScratchOffset", [], [SDNPWantParent], 20>; def BUFAddrKind { int Offset = 0; diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index bb0aa648ff90..c7ec5308e6d0 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -6,12 +6,12 @@ // //===----------------------------------------------------------------------===// -def FlatOffset : ComplexPattern<i64, 2, "SelectFlatOffset", [], [SDNPWantRoot], -10>; -def GlobalOffset : ComplexPattern<i64, 2, "SelectGlobalOffset", [], [SDNPWantRoot], -10>; -def ScratchOffset : ComplexPattern<i32, 2, "SelectScratchOffset", [], [SDNPWantRoot], -10>; +def FlatOffset : ComplexPattern<iPTR, 2, "SelectFlatOffset", [], [SDNPWantRoot], -10>; +def GlobalOffset : ComplexPattern<iPTR, 2, "SelectGlobalOffset", [], [SDNPWantRoot], -10>; +def ScratchOffset : ComplexPattern<iPTR, 2, "SelectScratchOffset", [], [SDNPWantRoot], -10>; -def GlobalSAddr : ComplexPattern<i64, 3, "SelectGlobalSAddr", [], [SDNPWantRoot], -10>; -def ScratchSAddr : ComplexPattern<i32, 2, "SelectScratchSAddr", [], [SDNPWantRoot], -10>; +def GlobalSAddr : ComplexPattern<iPTR, 3, "SelectGlobalSAddr", [], [SDNPWantRoot], -10>; +def ScratchSAddr : ComplexPattern<iPTR, 2, "SelectScratchSAddr", [], [SDNPWantRoot], -10>; //===----------------------------------------------------------------------===// // FLAT classes diff --git a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp index f3f664f7972a..912bcc792e4d 100644 --- a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp +++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp @@ -120,8 +120,7 @@ unsigned AMDGPUCustomBehaviour::handleWaitCnt(ArrayRef<InstRef> IssuedInst, // We will now look at each of the currently executing instructions // to find out if this wait instruction still needs to wait. 
- for (auto I = IssuedInst.begin(), E = IssuedInst.end(); I != E; I++) { - const InstRef &PrevIR = *I; + for (const InstRef &PrevIR : IssuedInst) { const Instruction &PrevInst = *PrevIR.getInstruction(); const unsigned PrevInstIndex = PrevIR.getSourceIndex() % SrcMgr.size(); const WaitCntInfo &PrevInstWaitInfo = InstrWaitCntInfo[PrevInstIndex]; diff --git a/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp index 29c37c706138..8a48a67b829c 100644 --- a/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp +++ b/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp @@ -440,9 +440,8 @@ private: CounterPropagateAddr(*Clause.first, CfCount); MachineBasicBlock *BB = Clause.first->getParent(); BuildMI(BB, DL, TII->get(R600::FETCH_CLAUSE)).addImm(CfCount); - for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) { - BB->splice(InsertPos, BB, Clause.second[i]); - } + for (MachineInstr *MI : Clause.second) + BB->splice(InsertPos, BB, MI); CfCount += 2 * Clause.second.size(); } @@ -452,9 +451,8 @@ private: CounterPropagateAddr(*Clause.first, CfCount); MachineBasicBlock *BB = Clause.first->getParent(); BuildMI(BB, DL, TII->get(R600::ALU_CLAUSE)).addImm(CfCount); - for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) { - BB->splice(InsertPos, BB, Clause.second[i]); - } + for (MachineInstr *MI : Clause.second) + BB->splice(InsertPos, BB, MI); CfCount += Clause.second.size(); } @@ -635,10 +633,10 @@ public: CfCount++; } MI->eraseFromParent(); - for (unsigned i = 0, e = FetchClauses.size(); i < e; i++) - EmitFetchClause(I, DL, FetchClauses[i], CfCount); - for (unsigned i = 0, e = AluClauses.size(); i < e; i++) - EmitALUClause(I, DL, AluClauses[i], CfCount); + for (ClauseFile &CF : FetchClauses) + EmitFetchClause(I, DL, CF, CfCount); + for (ClauseFile &CF : AluClauses) + EmitALUClause(I, DL, CF, CfCount); break; } default: @@ -649,8 +647,7 @@ public: break; } } - for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) { 
- MachineInstr *Alu = ToPopAfter[i]; + for (MachineInstr *Alu : ToPopAfter) { BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu), TII->get(R600::CF_ALU_POP_AFTER)) .addImm(Alu->getOperand(0).getImm()) diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp index a7ebf72315cb..aec8b1ae4837 100644 --- a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -268,17 +268,15 @@ R600InstrInfo::getSrcs(MachineInstr &MI) const { {R600::OpName::src1_W, R600::OpName::src1_sel_W}, }; - for (unsigned j = 0; j < 8; j++) { - MachineOperand &MO = - MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][0])); + for (const auto &Op : OpTable) { + MachineOperand &MO = MI.getOperand(getOperandIdx(MI.getOpcode(), Op[0])); Register Reg = MO.getReg(); if (Reg == R600::ALU_CONST) { MachineOperand &Sel = - MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][1])); + MI.getOperand(getOperandIdx(MI.getOpcode(), Op[1])); Result.push_back(std::make_pair(&MO, Sel.getImm())); continue; } - } return Result; } @@ -289,15 +287,14 @@ R600InstrInfo::getSrcs(MachineInstr &MI) const { {R600::OpName::src2, R600::OpName::src2_sel}, }; - for (unsigned j = 0; j < 3; j++) { - int SrcIdx = getOperandIdx(MI.getOpcode(), OpTable[j][0]); + for (const auto &Op : OpTable) { + int SrcIdx = getOperandIdx(MI.getOpcode(), Op[0]); if (SrcIdx < 0) break; MachineOperand &MO = MI.getOperand(SrcIdx); Register Reg = MO.getReg(); if (Reg == R600::ALU_CONST) { - MachineOperand &Sel = - MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][1])); + MachineOperand &Sel = MI.getOperand(getOperandIdx(MI.getOpcode(), Op[1])); Result.push_back(std::make_pair(&MO, Sel.getImm())); continue; } @@ -521,12 +518,11 @@ R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG, ValidSwizzle.clear(); unsigned ConstCount; BankSwizzle TransBS = ALU_VEC_012_SCL_210; - for (unsigned i = 0, e = IG.size(); i < e; ++i) { - 
IGSrcs.push_back(ExtractSrcs(*IG[i], PV, ConstCount)); - unsigned Op = getOperandIdx(IG[i]->getOpcode(), - R600::OpName::bank_swizzle); - ValidSwizzle.push_back( (R600InstrInfo::BankSwizzle) - IG[i]->getOperand(Op).getImm()); + for (MachineInstr *MI : IG) { + IGSrcs.push_back(ExtractSrcs(*MI, PV, ConstCount)); + unsigned Op = getOperandIdx(MI->getOpcode(), R600::OpName::bank_swizzle); + ValidSwizzle.push_back( + (R600InstrInfo::BankSwizzle)MI->getOperand(Op).getImm()); } std::vector<std::pair<int, unsigned>> TransOps; if (!isLastAluTrans) @@ -542,8 +538,7 @@ R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG, ALU_VEC_120_SCL_212, ALU_VEC_102_SCL_221 }; - for (unsigned i = 0; i < 4; i++) { - TransBS = TransSwz[i]; + for (R600InstrInfo::BankSwizzle TransBS : TransSwz) { if (!isConstCompatible(TransBS, TransOps, ConstCount)) continue; bool Result = FindSwizzleForVectorSlot(IGSrcs, ValidSwizzle, TransOps, @@ -562,9 +557,9 @@ R600InstrInfo::fitsConstReadLimitations(const std::vector<unsigned> &Consts) const { assert (Consts.size() <= 12 && "Too many operands in instructions group"); unsigned Pair1 = 0, Pair2 = 0; - for (unsigned i = 0, n = Consts.size(); i < n; ++i) { - unsigned ReadConstHalf = Consts[i] & 2; - unsigned ReadConstIndex = Consts[i] & (~3); + for (unsigned Const : Consts) { + unsigned ReadConstHalf = Const & 2; + unsigned ReadConstIndex = Const & (~3); unsigned ReadHalfConst = ReadConstIndex | ReadConstHalf; if (!Pair1) { Pair1 = ReadHalfConst; @@ -587,12 +582,11 @@ R600InstrInfo::fitsConstReadLimitations(const std::vector<MachineInstr *> &MIs) const { std::vector<unsigned> Consts; SmallSet<int64_t, 4> Literals; - for (unsigned i = 0, n = MIs.size(); i < n; i++) { - MachineInstr &MI = *MIs[i]; - if (!isALUInstr(MI.getOpcode())) + for (MachineInstr *MI : MIs) { + if (!isALUInstr(MI->getOpcode())) continue; - for (const auto &Src : getSrcs(MI)) { + for (const auto &Src : getSrcs(*MI)) { if (Src.first->getReg() == 
R600::ALU_LITERAL_X) Literals.insert(Src.second); if (Literals.size() > 4) @@ -1330,11 +1324,11 @@ MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction( MIB->getOperand(getOperandIdx(Opcode, R600::OpName::pred_sel)) .setReg(MO.getReg()); - for (unsigned i = 0; i < 14; i++) { + for (unsigned Operand : Operands) { MachineOperand &MO = MI->getOperand( - getOperandIdx(MI->getOpcode(), getSlotedOps(Operands[i], Slot))); + getOperandIdx(MI->getOpcode(), getSlotedOps(Operand, Slot))); assert (MO.isImm()); - setImmOperand(*MIB, Operands[i], MO.getImm()); + setImmOperand(*MIB, Operand, MO.getImm()); } MIB->getOperand(20).setImm(0); return MIB; diff --git a/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp b/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp index 6aee2f591b56..d26879ed8d60 100644 --- a/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp @@ -328,9 +328,9 @@ SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) { void R600SchedStrategy::LoadAlu() { std::vector<SUnit *> &QSrc = Pending[IDAlu]; - for (unsigned i = 0, e = QSrc.size(); i < e; ++i) { - AluKind AK = getAluKind(QSrc[i]); - AvailableAlus[AK].push_back(QSrc[i]); + for (SUnit *SU : QSrc) { + AluKind AK = getAluKind(SU); + AvailableAlus[AK].push_back(SU); } QSrc.clear(); } diff --git a/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp b/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp index ac6a3581e255..aa156190b7ae 100644 --- a/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp +++ b/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp @@ -307,8 +307,8 @@ class R600OpenCLImageTypeLoweringPass : public ModulePass { // Build new MDNode. 
SmallVector<Metadata *, 6> KernelMDArgs; KernelMDArgs.push_back(ConstantAsMetadata::get(NewF)); - for (unsigned i = 0; i < NumKernelArgMDNodes; ++i) - KernelMDArgs.push_back(MDNode::get(*Context, NewArgMDs.ArgVector[i])); + for (const MDVector &MDV : NewArgMDs.ArgVector) + KernelMDArgs.push_back(MDNode::get(*Context, MDV)); MDNode *NewMDNode = MDNode::get(*Context, KernelMDArgs); return std::make_tuple(NewF, NewMDNode); diff --git a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp index 72cf48c04e7f..795bc898a7bf 100644 --- a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp +++ b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp @@ -150,19 +150,18 @@ bool R600VectorRegMerger::tryMergeVector(const RegSeqInfo *Untouched, RegSeqInfo *ToMerge, std::vector< std::pair<unsigned, unsigned>> &Remap) const { unsigned CurrentUndexIdx = 0; - for (DenseMap<Register, unsigned>::iterator It = ToMerge->RegToChan.begin(), - E = ToMerge->RegToChan.end(); It != E; ++It) { + for (auto &It : ToMerge->RegToChan) { DenseMap<Register, unsigned>::const_iterator PosInUntouched = - Untouched->RegToChan.find((*It).first); + Untouched->RegToChan.find(It.first); if (PosInUntouched != Untouched->RegToChan.end()) { - Remap.push_back(std::pair<unsigned, unsigned> - ((*It).second, (*PosInUntouched).second)); + Remap.push_back( + std::pair<unsigned, unsigned>(It.second, (*PosInUntouched).second)); continue; } if (CurrentUndexIdx >= Untouched->UndefReg.size()) return false; - Remap.push_back(std::pair<unsigned, unsigned> - ((*It).second, Untouched->UndefReg[CurrentUndexIdx++])); + Remap.push_back(std::pair<unsigned, unsigned>( + It.second, Untouched->UndefReg[CurrentUndexIdx++])); } return true; @@ -172,9 +171,9 @@ static unsigned getReassignedChan( const std::vector<std::pair<unsigned, unsigned>> &RemapChan, unsigned Chan) { - for (unsigned j = 0, je = RemapChan.size(); j < je; j++) { - if (RemapChan[j].first == Chan) - return 
RemapChan[j].second; + for (const auto &J : RemapChan) { + if (J.first == Chan) + return J.second; } llvm_unreachable("Chan wasn't reassigned"); } @@ -190,11 +189,10 @@ MachineInstr *R600VectorRegMerger::RebuildVector( Register SrcVec = BaseRSI->Instr->getOperand(0).getReg(); DenseMap<Register, unsigned> UpdatedRegToChan = BaseRSI->RegToChan; std::vector<Register> UpdatedUndef = BaseRSI->UndefReg; - for (DenseMap<Register, unsigned>::iterator It = RSI->RegToChan.begin(), - E = RSI->RegToChan.end(); It != E; ++It) { + for (const auto &It : RSI->RegToChan) { Register DstReg = MRI->createVirtualRegister(&R600::R600_Reg128RegClass); - unsigned SubReg = (*It).first; - unsigned Swizzle = (*It).second; + unsigned SubReg = It.first; + unsigned Swizzle = It.second; unsigned Chan = getReassignedChan(RemapChan, Swizzle); MachineInstr *Tmp = BuildMI(MBB, Pos, DL, TII->get(R600::INSERT_SUBREG), @@ -234,14 +232,12 @@ MachineInstr *R600VectorRegMerger::RebuildVector( } void R600VectorRegMerger::RemoveMI(MachineInstr *MI) { - for (InstructionSetMap::iterator It = PreviousRegSeqByReg.begin(), - E = PreviousRegSeqByReg.end(); It != E; ++It) { - std::vector<MachineInstr *> &MIs = (*It).second; + for (auto &It : PreviousRegSeqByReg) { + std::vector<MachineInstr *> &MIs = It.second; MIs.erase(llvm::find(MIs, MI), MIs.end()); } - for (InstructionSetMap::iterator It = PreviousRegSeqByUndefCount.begin(), - E = PreviousRegSeqByUndefCount.end(); It != E; ++It) { - std::vector<MachineInstr *> &MIs = (*It).second; + for (auto &It : PreviousRegSeqByUndefCount) { + std::vector<MachineInstr *> &MIs = It.second; MIs.erase(llvm::find(MIs, MI), MIs.end()); } } @@ -255,9 +251,9 @@ void R600VectorRegMerger::SwizzleInput(MachineInstr &MI, Offset = 3; for (unsigned i = 0; i < 4; i++) { unsigned Swizzle = MI.getOperand(i + Offset).getImm() + 1; - for (unsigned j = 0, e = RemapChan.size(); j < e; j++) { - if (RemapChan[j].first == Swizzle) { - MI.getOperand(i + Offset).setImm(RemapChan[j].second - 1); + 
for (const auto &J : RemapChan) { + if (J.first == Swizzle) { + MI.getOperand(i + Offset).setImm(J.second - 1); break; } } diff --git a/llvm/lib/Target/AMDGPU/R600Packetizer.cpp b/llvm/lib/Target/AMDGPU/R600Packetizer.cpp index beb0aad86e89..fbe2a1cd9fba 100644 --- a/llvm/lib/Target/AMDGPU/R600Packetizer.cpp +++ b/llvm/lib/Target/AMDGPU/R600Packetizer.cpp @@ -127,8 +127,8 @@ private: R600::OpName::src1, R600::OpName::src2 }; - for (unsigned i = 0; i < 3; i++) { - int OperandIdx = TII->getOperandIdx(MI.getOpcode(), Ops[i]); + for (unsigned Op : Ops) { + int OperandIdx = TII->getOperandIdx(MI.getOpcode(), Op); if (OperandIdx < 0) continue; Register Src = MI.getOperand(OperandIdx).getReg(); diff --git a/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp b/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp index 99a1a8e9871a..c329bae50f92 100644 --- a/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp @@ -54,10 +54,8 @@ BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, R600::PRED_SEL_ONE); reserveRegisterTuples(Reserved, R600::INDIRECT_BASE_ADDR); - for (TargetRegisterClass::iterator I = R600::R600_AddrRegClass.begin(), - E = R600::R600_AddrRegClass.end(); I != E; ++I) { - reserveRegisterTuples(Reserved, *I); - } + for (MCPhysReg R : R600::R600_AddrRegClass) + reserveRegisterTuples(Reserved, R); TII->reserveIndirectRegisters(Reserved, MF, *this); diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 200e00ee5521..1f93284fc7ee 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1620,7 +1620,7 @@ bool SIFoldOperands::tryFoldRegSequence(MachineInstr &MI) { // Erase the REG_SEQUENCE eagerly, unless we followed a chain of COPY users, // in which case we can erase them all later in runOnMachineFunction. 
 if (MRI->use_nodbg_empty(MI.getOperand(0).getReg())) - MI.eraseFromParentAndMarkDBGValuesForRemoval(); + MI.eraseFromParent(); return true; } @@ -1821,7 +1821,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) { auto &SrcOp = InstToErase->getOperand(1); auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register(); - InstToErase->eraseFromParentAndMarkDBGValuesForRemoval(); + InstToErase->eraseFromParent(); InstToErase = nullptr; if (!SrcReg || SrcReg.isPhysical()) break; @@ -1831,7 +1831,7 @@ } if (InstToErase && InstToErase->isRegSequence() && MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) - InstToErase->eraseFromParentAndMarkDBGValuesForRemoval(); + InstToErase->eraseFromParent(); } } return true; } diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 4706c74be721..d4fe74ecb96e 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -1167,11 +1167,13 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( if (SpillVGPRToAGPR) { // To track the spill frame indices handled in this pass. BitVector SpillFIs(MFI.getObjectIndexEnd(), false); + BitVector NonVGPRSpillFIs(MFI.getObjectIndexEnd(), false); bool SeenDbgInstr = false; for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) { + int FrameIndex; if (MI.isDebugInstr()) SeenDbgInstr = true; @@ -1191,10 +1193,18 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( SpillFIs.set(FI); continue; } - } + } else if (TII->isStoreToStackSlot(MI, FrameIndex) || + TII->isLoadFromStackSlot(MI, FrameIndex)) + NonVGPRSpillFIs.set(FrameIndex); } } + // Stack slot coloring may assign different objects to the same stack slot. + // If not, then the VGPR to AGPR spill slot is dead. 
+ for (unsigned FI : SpillFIs.set_bits()) + if (!NonVGPRSpillFIs.test(FI)) + FuncInfo->setVGPRToAGPRSpillDead(FI); + for (MachineBasicBlock &MBB : MF) { for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs()) MBB.addLiveIn(Reg); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 35b72f5d201b..9f138136e6e9 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -24,6 +24,7 @@ #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/IR/DiagnosticInfo.h" @@ -2062,33 +2063,30 @@ void SITargetLowering::allocateSpecialInputSGPRs( SIMachineFunctionInfo &Info) const { auto &ArgInfo = Info.getArgInfo(); - // We need to allocate these in place regardless of their use. - const bool IsFixed = AMDGPUTargetMachine::EnableFixedFunctionABI; - // TODO: Unify handling with private memory pointers. - if (IsFixed || Info.hasDispatchPtr()) + if (Info.hasDispatchPtr()) allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr); - if (IsFixed || Info.hasQueuePtr()) + if (Info.hasQueuePtr()) allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr); // Implicit arg ptr takes the place of the kernarg segment pointer. This is a // constant offset from the kernarg segment. - if (IsFixed || Info.hasImplicitArgPtr()) + if (Info.hasImplicitArgPtr()) allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr); - if (IsFixed || Info.hasDispatchID()) + if (Info.hasDispatchID()) allocateSGPR64Input(CCInfo, ArgInfo.DispatchID); // flat_scratch_init is not applicable for non-kernel functions. 
- if (IsFixed || Info.hasWorkGroupIDX()) + if (Info.hasWorkGroupIDX()) allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX); - if (IsFixed || Info.hasWorkGroupIDY()) + if (Info.hasWorkGroupIDY()) allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY); - if (IsFixed || Info.hasWorkGroupIDZ()) + if (Info.hasWorkGroupIDZ()) allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ); } @@ -2419,10 +2417,9 @@ SDValue SITargetLowering::LowerFormalArguments( if (IsEntryFunc) { allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info); allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info); - } else { + } else if (!IsGraphics) { // For the fixed ABI, pass workitem IDs in the last argument register. - if (AMDGPUTargetMachine::EnableFixedFunctionABI) - allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info); + allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info); } if (IsKernel) { @@ -2549,17 +2546,13 @@ SDValue SITargetLowering::LowerFormalArguments( InVals.push_back(Val); } - if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) { - // Special inputs come after user arguments. - allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info); - } - // Start adding system SGPRs. if (IsEntryFunc) { allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics); } else { CCInfo.AllocateReg(Info->getScratchRSrcReg()); - allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info); + if (!IsGraphics) + allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info); } auto &ArgUsageInfo = @@ -3123,8 +3116,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg); - if (AMDGPUTargetMachine::EnableFixedFunctionABI && - CallConv != CallingConv::AMDGPU_Gfx) { + if (CallConv != CallingConv::AMDGPU_Gfx) { // With a fixed ABI, allocate fixed registers before user arguments. 
passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain); } @@ -3263,12 +3255,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, } } - if (!AMDGPUTargetMachine::EnableFixedFunctionABI && - CallConv != CallingConv::AMDGPU_Gfx) { - // Copy special input registers after user input arguments. - passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain); - } - if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); @@ -6282,10 +6268,6 @@ SDValue SITargetLowering::lowerImage(SDValue Op, } } - // Push back extra arguments. - for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) - VAddrs.push_back(Op.getOperand(ArgOffset + I)); - // Check for 16 bit addresses or derivatives and pack if true. MVT VAddrVT = Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType(); @@ -6298,6 +6280,17 @@ SDValue SITargetLowering::lowerImage(SDValue Op, MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16; IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16; + // Push back extra arguments. + for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) { + if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) { + // Special handling of bias when A16 is on. Bias is of type half but + // occupies full 32-bit. 
+ SDValue bias = DAG.getBuildVector( MVT::v2f16, DL, {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)}); + VAddrs.push_back(bias); + } else + VAddrs.push_back(Op.getOperand(ArgOffset + I)); + } + if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) { // 16 bit gradients are supported, but are tied to the A16 control // so both gradients and addresses must be 16 bit @@ -7502,8 +7495,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, assert(NodePtr.getValueType() == MVT::i32 || NodePtr.getValueType() == MVT::i64); - assert(RayDir.getValueType() == MVT::v4f16 || - RayDir.getValueType() == MVT::v4f32); + assert(RayDir.getValueType() == MVT::v3f16 || + RayDir.getValueType() == MVT::v3f32); if (!Subtarget->hasGFX10_AEncoding()) { emitRemovedIntrinsicError(DAG, DL, Op.getValueType()); @@ -9837,11 +9830,13 @@ bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF, if (Opcode == AMDGPU::G_FCANONICALIZE) return true; - if (Opcode == AMDGPU::G_FCONSTANT) { - auto F = MI->getOperand(1).getFPImm()->getValueAPF(); - if (F.isNaN() && F.isSignaling()) + Optional<FPValueAndVReg> FCR; + // Constant splat (can be padded with undef) or scalar constant. + if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) { + if (FCR->Value.isSignaling()) return false; - return !F.isDenormal() || denormalsEnabledForType(MRI.getType(Reg), MF); + return !FCR->Value.isDenormal() || + denormalsEnabledForType(MRI.getType(FCR->VReg), MF); } if (MaxDepth == 0) @@ -11514,7 +11509,7 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, // Prefer VGPRs over AGPRs in mAI instructions where possible. // This saves a chain-copy of registers and better ballance register // use between vgpr and agpr as agpr tuples tend to be big. 
- if (const MCOperandInfo *OpInfo = MI.getDesc().OpInfo) { + if (MI.getDesc().OpInfo) { unsigned Opc = MI.getOpcode(); const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); for (auto I : { AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), @@ -12477,6 +12472,6 @@ SITargetLowering::getTypeLegalizationCost(const DataLayout &DL, if (Size <= 256) return Cost; - Cost.first = (Size + 255) / 256; + Cost.first += (Size + 255) / 256; return Cost; } diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index c9d9dd1fb82c..6fbe5d45ce0a 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -30,6 +30,7 @@ #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/Sequence.h" #include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/InitializePasses.h" #include "llvm/Support/DebugCounter.h" @@ -51,26 +52,6 @@ static cl::opt<bool> ForceEmitZeroFlag( cl::init(false), cl::Hidden); namespace { - -template <typename EnumT> -class enum_iterator - : public iterator_facade_base<enum_iterator<EnumT>, - std::forward_iterator_tag, const EnumT> { - EnumT Value; -public: - enum_iterator() = default; - enum_iterator(EnumT Value) : Value(Value) {} - - enum_iterator &operator++() { - Value = static_cast<EnumT>(Value + 1); - return *this; - } - - bool operator==(const enum_iterator &RHS) const { return Value == RHS.Value; } - - EnumT operator*() const { return Value; } -}; - // Class of object that encapsulates latest instruction counter score // associated with the operand. Used for determining whether // s_waitcnt instruction needs to be emitted. 
@@ -78,27 +59,32 @@ public: #define CNT_MASK(t) (1u << (t)) enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS }; +} // namespace -iterator_range<enum_iterator<InstCounterType>> inst_counter_types() { - return make_range(enum_iterator<InstCounterType>(VM_CNT), - enum_iterator<InstCounterType>(NUM_INST_CNTS)); -} +namespace llvm { +template <> struct enum_iteration_traits<InstCounterType> { + static constexpr bool is_iterable = true; +}; +} // namespace llvm + +namespace { +auto inst_counter_types() { return enum_seq(VM_CNT, NUM_INST_CNTS); } using RegInterval = std::pair<int, int>; -struct { +struct HardwareLimits { unsigned VmcntMax; unsigned ExpcntMax; unsigned LgkmcntMax; unsigned VscntMax; -} HardwareLimits; +}; -struct { +struct RegisterEncoding { unsigned VGPR0; unsigned VGPRL; unsigned SGPR0; unsigned SGPRL; -} RegisterEncoding; +}; enum WaitEventType { VMEM_ACCESS, // vector-memory read & write @@ -194,18 +180,20 @@ void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) { // "s_waitcnt 0" before use. 
class WaitcntBrackets { public: - WaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {} + WaitcntBrackets(const GCNSubtarget *SubTarget, HardwareLimits Limits, + RegisterEncoding Encoding) + : ST(SubTarget), Limits(Limits), Encoding(Encoding) {} - static unsigned getWaitCountMax(InstCounterType T) { + unsigned getWaitCountMax(InstCounterType T) const { switch (T) { case VM_CNT: - return HardwareLimits.VmcntMax; + return Limits.VmcntMax; case LGKM_CNT: - return HardwareLimits.LgkmcntMax; + return Limits.LgkmcntMax; case EXP_CNT: - return HardwareLimits.ExpcntMax; + return Limits.ExpcntMax; case VS_CNT: - return HardwareLimits.VscntMax; + return Limits.VscntMax; default: break; } @@ -338,6 +326,8 @@ private: unsigned OpNo, unsigned Val); const GCNSubtarget *ST = nullptr; + HardwareLimits Limits = {}; + RegisterEncoding Encoding = {}; unsigned ScoreLBs[NUM_INST_CNTS] = {0}; unsigned ScoreUBs[NUM_INST_CNTS] = {0}; unsigned PendingEvents = 0; @@ -471,14 +461,14 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST)); if (TRI->isVectorRegister(*MRI, Op.getReg())) { - assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL); - Result.first = Reg - RegisterEncoding.VGPR0; + assert(Reg >= Encoding.VGPR0 && Reg <= Encoding.VGPRL); + Result.first = Reg - Encoding.VGPR0; if (TRI->isAGPR(*MRI, Op.getReg())) Result.first += AGPR_OFFSET; assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS); } else if (TRI->isSGPRReg(*MRI, Op.getReg())) { - assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS); - Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS; + assert(Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS); + Result.first = Reg - Encoding.SGPR0 + NUM_ALL_VGPRS; assert(Result.first >= NUM_ALL_VGPRS && Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS); } @@ -1589,20 +1579,22 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { for 
(auto T : inst_counter_types()) ForceEmitWaitcnt[T] = false; - HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV); - HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV); - HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV); - HardwareLimits.VscntMax = ST->hasVscnt() ? 63 : 0; + HardwareLimits Limits = {}; + Limits.VmcntMax = AMDGPU::getVmcntBitMask(IV); + Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV); + Limits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV); + Limits.VscntMax = ST->hasVscnt() ? 63 : 0; unsigned NumVGPRsMax = ST->getAddressableNumVGPRs(); unsigned NumSGPRsMax = ST->getAddressableNumSGPRs(); assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS); assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS); - RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0); - RegisterEncoding.VGPRL = RegisterEncoding.VGPR0 + NumVGPRsMax - 1; - RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0); - RegisterEncoding.SGPRL = RegisterEncoding.SGPR0 + NumSGPRsMax - 1; + RegisterEncoding Encoding = {}; + Encoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0); + Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1; + Encoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0); + Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1; TrackedWaitcntSet.clear(); BlockInfos.clear(); @@ -1652,9 +1644,9 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { *Brackets = *BI.Incoming; } else { if (!Brackets) - Brackets = std::make_unique<WaitcntBrackets>(ST); + Brackets = std::make_unique<WaitcntBrackets>(ST, Limits, Encoding); else - *Brackets = WaitcntBrackets(ST); + *Brackets = WaitcntBrackets(ST, Limits, Encoding); } Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets); @@ -1686,45 +1678,47 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { } } while (Repeat); - SmallVector<MachineBasicBlock *, 4> EndPgmBlocks; - - bool HaveScalarStores = false; + if (ST->hasScalarStores()) { + SmallVector<MachineBasicBlock *, 4> EndPgmBlocks; + bool HaveScalarStores = 
false; - for (MachineBasicBlock &MBB : MF) { - for (MachineInstr &MI : MBB) { - if (!HaveScalarStores && TII->isScalarStore(MI)) - HaveScalarStores = true; + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (!HaveScalarStores && TII->isScalarStore(MI)) + HaveScalarStores = true; - if (MI.getOpcode() == AMDGPU::S_ENDPGM || - MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) - EndPgmBlocks.push_back(&MBB); + if (MI.getOpcode() == AMDGPU::S_ENDPGM || + MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) + EndPgmBlocks.push_back(&MBB); + } } - } - if (HaveScalarStores) { - // If scalar writes are used, the cache must be flushed or else the next - // wave to reuse the same scratch memory can be clobbered. - // - // Insert s_dcache_wb at wave termination points if there were any scalar - // stores, and only if the cache hasn't already been flushed. This could be - // improved by looking across blocks for flushes in postdominating blocks - // from the stores but an explicitly requested flush is probably very rare. - for (MachineBasicBlock *MBB : EndPgmBlocks) { - bool SeenDCacheWB = false; + if (HaveScalarStores) { + // If scalar writes are used, the cache must be flushed or else the next + // wave to reuse the same scratch memory can be clobbered. + // + // Insert s_dcache_wb at wave termination points if there were any scalar + // stores, and only if the cache hasn't already been flushed. This could + // be improved by looking across blocks for flushes in postdominating + // blocks from the stores but an explicitly requested flush is probably + // very rare. 
+ for (MachineBasicBlock *MBB : EndPgmBlocks) { + bool SeenDCacheWB = false; - for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; - ++I) { - if (I->getOpcode() == AMDGPU::S_DCACHE_WB) - SeenDCacheWB = true; - else if (TII->isScalarStore(*I)) - SeenDCacheWB = false; + for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); + I != E; ++I) { + if (I->getOpcode() == AMDGPU::S_DCACHE_WB) + SeenDCacheWB = true; + else if (TII->isScalarStore(*I)) + SeenDCacheWB = false; - // FIXME: It would be better to insert this before a waitcnt if any. - if ((I->getOpcode() == AMDGPU::S_ENDPGM || - I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) && - !SeenDCacheWB) { - Modified = true; - BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB)); + // FIXME: It would be better to insert this before a waitcnt if any. + if ((I->getOpcode() == AMDGPU::S_ENDPGM || + I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) && + !SeenDCacheWB) { + Modified = true; + BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB)); + } } } } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 92f5322b8ad2..1755b93538ce 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -899,8 +899,12 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, unsigned EltSize = 4; unsigned Opcode = AMDGPU::V_MOV_B32_e32; if (RI.isAGPRClass(RC)) { - Opcode = (RI.hasVGPRs(SrcRC)) ? 
- AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::INSTRUCTION_LIST_END; + if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC)) + Opcode = AMDGPU::V_ACCVGPR_MOV_B32; + else if (RI.hasVGPRs(SrcRC)) + Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64; + else + Opcode = AMDGPU::INSTRUCTION_LIST_END; } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) { Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64; } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) && @@ -1417,6 +1421,33 @@ static unsigned getAGPRSpillSaveOpcode(unsigned Size) { } } +static unsigned getAVSpillSaveOpcode(unsigned Size) { + switch (Size) { + case 4: + return AMDGPU::SI_SPILL_AV32_SAVE; + case 8: + return AMDGPU::SI_SPILL_AV64_SAVE; + case 12: + return AMDGPU::SI_SPILL_AV96_SAVE; + case 16: + return AMDGPU::SI_SPILL_AV128_SAVE; + case 20: + return AMDGPU::SI_SPILL_AV160_SAVE; + case 24: + return AMDGPU::SI_SPILL_AV192_SAVE; + case 28: + return AMDGPU::SI_SPILL_AV224_SAVE; + case 32: + return AMDGPU::SI_SPILL_AV256_SAVE; + case 64: + return AMDGPU::SI_SPILL_AV512_SAVE; + case 128: + return AMDGPU::SI_SPILL_AV1024_SAVE; + default: + llvm_unreachable("unknown register size"); + } +} + void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, @@ -1463,21 +1494,11 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, return; } - unsigned Opcode = RI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(SpillSize) - : getVGPRSpillSaveOpcode(SpillSize); + unsigned Opcode = RI.isVectorSuperClass(RC) ? getAVSpillSaveOpcode(SpillSize) + : RI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(SpillSize) + : getVGPRSpillSaveOpcode(SpillSize); MFI->setHasSpilledVGPRs(); - if (RI.isVectorSuperClass(RC)) { - // Convert an AV spill into a VGPR spill. Introduce a copy from AV to an - // equivalent VGPR register beforehand. Regalloc might want to introduce - // AV spills only to be relevant until rewriter at which they become - // either spills of VGPRs or AGPRs. 
- Register TmpVReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(RC)); - BuildMI(MBB, MI, DL, get(TargetOpcode::COPY), TmpVReg) - .addReg(SrcReg, RegState::Kill); - SrcReg = TmpVReg; - } - BuildMI(MBB, MI, DL, get(Opcode)) .addReg(SrcReg, getKillRegState(isKill)) // data .addFrameIndex(FrameIndex) // addr @@ -1567,6 +1588,33 @@ static unsigned getAGPRSpillRestoreOpcode(unsigned Size) { } } +static unsigned getAVSpillRestoreOpcode(unsigned Size) { + switch (Size) { + case 4: + return AMDGPU::SI_SPILL_AV32_RESTORE; + case 8: + return AMDGPU::SI_SPILL_AV64_RESTORE; + case 12: + return AMDGPU::SI_SPILL_AV96_RESTORE; + case 16: + return AMDGPU::SI_SPILL_AV128_RESTORE; + case 20: + return AMDGPU::SI_SPILL_AV160_RESTORE; + case 24: + return AMDGPU::SI_SPILL_AV192_RESTORE; + case 28: + return AMDGPU::SI_SPILL_AV224_RESTORE; + case 32: + return AMDGPU::SI_SPILL_AV256_RESTORE; + case 64: + return AMDGPU::SI_SPILL_AV512_RESTORE; + case 128: + return AMDGPU::SI_SPILL_AV1024_RESTORE; + default: + llvm_unreachable("unknown register size"); + } +} + void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, @@ -1609,26 +1657,15 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, return; } - unsigned Opcode = RI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(SpillSize) - : getVGPRSpillRestoreOpcode(SpillSize); - - bool IsVectorSuperClass = RI.isVectorSuperClass(RC); - Register TmpReg = DestReg; - if (IsVectorSuperClass) { - // For AV classes, insert the spill restore to a VGPR followed by a copy - // into an equivalent AV register. - MachineRegisterInfo &MRI = MF->getRegInfo(); - DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(RC)); - } + unsigned Opcode = RI.isVectorSuperClass(RC) + ? getAVSpillRestoreOpcode(SpillSize) + : RI.isAGPRClass(RC) ? 
getAGPRSpillRestoreOpcode(SpillSize) + : getVGPRSpillRestoreOpcode(SpillSize); BuildMI(MBB, MI, DL, get(Opcode), DestReg) - .addFrameIndex(FrameIndex) // vaddr - .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset - .addImm(0) // offset - .addMemOperand(MMO); - - if (IsVectorSuperClass) - BuildMI(MBB, MI, DL, get(TargetOpcode::COPY), TmpReg) - .addReg(DestReg, RegState::Kill); + .addFrameIndex(FrameIndex) // vaddr + .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset + .addImm(0) // offset + .addMemOperand(MMO); } void SIInstrInfo::insertNoop(MachineBasicBlock &MBB, @@ -2358,8 +2395,6 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx)); auto *ShAmt = MCConstantExpr::create(32, MCCtx); OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx)); - - return; } unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) { @@ -3106,23 +3141,26 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, } static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, - int64_t &Imm) { + int64_t &Imm, MachineInstr **DefMI = nullptr) { if (Reg.isPhysical()) return false; auto *Def = MRI.getUniqueVRegDef(Reg); if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) { Imm = Def->getOperand(1).getImm(); + if (DefMI) + *DefMI = Def; return true; } return false; } -static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm) { +static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm, + MachineInstr **DefMI = nullptr) { if (!MO->isReg()) return false; const MachineFunction *MF = MO->getParent()->getParent()->getParent(); const MachineRegisterInfo &MRI = MF->getRegInfo(); - return getFoldableImm(MO->getReg(), MRI, Imm); + return getFoldableImm(MO->getReg(), MRI, Imm, DefMI); } static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, @@ -3195,8 +3233,20 @@ MachineInstr 
*SIInstrInfo::convertToThreeAddress(MachineInstr &MI, // If we have an SGPR input, we will violate the constant bus restriction. (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() || !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) { + MachineInstr *DefMI; + const auto killDef = [&DefMI, &MBB, this]() -> void { + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + // The only user is the instruction which will be killed. + if (!MRI.hasOneNonDBGUse(DefMI->getOperand(0).getReg())) + return; + // We cannot just remove the DefMI here, calling pass will crash. + DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF)); + for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I) + DefMI->RemoveOperand(I); + }; + int64_t Imm; - if (getFoldableImm(Src2, Imm)) { + if (getFoldableImm(Src2, Imm, &DefMI)) { unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMAAK_F16 : AMDGPU::V_FMAAK_F32) : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32); @@ -3209,13 +3259,14 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, updateLiveVariables(LV, MI, *MIB); if (LIS) LIS->ReplaceMachineInstrInMaps(MI, *MIB); + killDef(); return MIB; } } unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMAMK_F16 : AMDGPU::V_FMAMK_F32) : (IsF16 ? 
AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32); - if (getFoldableImm(Src1, Imm)) { + if (getFoldableImm(Src1, Imm, &DefMI)) { if (pseudoToMCOpcode(NewOpc) != -1) { MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc)) .add(*Dst) @@ -3225,10 +3276,11 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, updateLiveVariables(LV, MI, *MIB); if (LIS) LIS->ReplaceMachineInstrInMaps(MI, *MIB); + killDef(); return MIB; } } - if (getFoldableImm(Src0, Imm)) { + if (getFoldableImm(Src0, Imm, &DefMI)) { if (pseudoToMCOpcode(NewOpc) != -1 && isOperandLegal( MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0), @@ -3241,12 +3293,13 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, updateLiveVariables(LV, MI, *MIB); if (LIS) LIS->ReplaceMachineInstrInMaps(MI, *MIB); + killDef(); return MIB; } } } - unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16_e64 + unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16_gfx9_e64 : IsF64 ? AMDGPU::V_FMA_F64_e64 : AMDGPU::V_FMA_F32_e64) : (IsF16 ? AMDGPU::V_MAD_F16_e64 : AMDGPU::V_MAD_F32_e64); @@ -3605,12 +3658,6 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const { const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); // Can't shrink instruction with three operands. - // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add - // a special case for it. It can only be shrunk if the third operand - // is vcc, and src0_modifiers and src1_modifiers are not set. - // We should handle this the same way we handle vopc, by addding - // a register allocation hint pre-regalloc and then do the shrinking - // post-regalloc. 
if (Src2) { switch (MI.getOpcode()) { default: return false; @@ -4563,8 +4610,9 @@ static unsigned adjustAllocatableRegClass(const GCNSubtarget &ST, unsigned RCID, bool IsAllocatable) { if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) && - (TID.mayLoad() || TID.mayStore() || - (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) { + (((TID.mayLoad() || TID.mayStore()) && + !(TID.TSFlags & SIInstrFlags::VGPRSpill)) || + (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) { switch (RCID) { case AMDGPU::AV_32RegClassID: return AMDGPU::VGPR_32RegClassID; case AMDGPU::AV_64RegClassID: return AMDGPU::VReg_64RegClassID; @@ -5001,8 +5049,7 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, --ConstantBusLimit; } - for (unsigned i = 0; i < 3; ++i) { - int Idx = VOP3Idx[i]; + for (int Idx : VOP3Idx) { if (Idx == -1) break; MachineOperand &MO = MI.getOperand(Idx); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 47ee83eb9351..dda92d3d25ff 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1350,11 +1350,11 @@ def PackedI16InputMods : PackedIntInputMods<PackedI16InputModsMatchClass>; // Complex patterns //===----------------------------------------------------------------------===// -def DS1Addr1Offset : ComplexPattern<i32, 2, "SelectDS1Addr1Offset">; -def DS64Bit4ByteAligned : ComplexPattern<i32, 3, "SelectDS64Bit4ByteAligned">; -def DS128Bit8ByteAligned : ComplexPattern<i64, 3, "SelectDS128Bit8ByteAligned">; +def DS1Addr1Offset : ComplexPattern<iPTR, 2, "SelectDS1Addr1Offset">; +def DS64Bit4ByteAligned : ComplexPattern<iPTR, 3, "SelectDS64Bit4ByteAligned">; +def DS128Bit8ByteAligned : ComplexPattern<iPTR, 3, "SelectDS128Bit8ByteAligned">; -def MOVRELOffset : ComplexPattern<i32, 2, "SelectMOVRELOffset">; +def MOVRELOffset : ComplexPattern<iPTR, 2, "SelectMOVRELOffset">; def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">; 
def VOP3Mods : ComplexPattern<untyped, 2, "SelectVOP3Mods">; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index d55d8da8699a..636337ede000 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -761,6 +761,17 @@ defm SI_SPILL_A256 : SI_SPILL_VGPR <AReg_256, 1>; defm SI_SPILL_A512 : SI_SPILL_VGPR <AReg_512, 1>; defm SI_SPILL_A1024 : SI_SPILL_VGPR <AReg_1024, 1>; +defm SI_SPILL_AV32 : SI_SPILL_VGPR <AV_32, 1>; +defm SI_SPILL_AV64 : SI_SPILL_VGPR <AV_64, 1>; +defm SI_SPILL_AV96 : SI_SPILL_VGPR <AV_96, 1>; +defm SI_SPILL_AV128 : SI_SPILL_VGPR <AV_128, 1>; +defm SI_SPILL_AV160 : SI_SPILL_VGPR <AV_160, 1>; +defm SI_SPILL_AV192 : SI_SPILL_VGPR <AV_192, 1>; +defm SI_SPILL_AV224 : SI_SPILL_VGPR <AV_224, 1>; +defm SI_SPILL_AV256 : SI_SPILL_VGPR <AV_256, 1>; +defm SI_SPILL_AV512 : SI_SPILL_VGPR <AV_512, 1>; +defm SI_SPILL_AV1024 : SI_SPILL_VGPR <AV_1024, 1>; + def SI_PC_ADD_REL_OFFSET : SPseudoInstSI < (outs SReg_64:$dst), (ins si_ga:$ptr_lo, si_ga:$ptr_hi), @@ -2106,6 +2117,19 @@ def : GCNPat < } // end isWave32 def : GCNPat < + (i32 (DivergentBinFrag<xor> i32:$src0, (i32 -1))), + (V_NOT_B32_e32 $src0) +>; + +def : GCNPat < + (i64 (DivergentBinFrag<xor> i64:$src0, (i64 -1))), + (REG_SEQUENCE VReg_64, + (V_NOT_B32_e32 (i32 (EXTRACT_SUBREG i64:$src0, sub0))), sub0, + (V_NOT_B32_e32 (i32 (EXTRACT_SUBREG i64:$src0, sub1))), sub1 + ) +>; + +def : GCNPat < (f16 (sint_to_fp i1:$src)), (V_CVT_F16_F32_e32 ( V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), @@ -2188,18 +2212,18 @@ def : GCNPat < >; def : GCNPat < - (i1 (trunc i32:$a)), - (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1)) + (i1 (DivergentUnaryFrag<trunc> i32:$a)), + (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1)) >; def : GCNPat < - (i1 (trunc i16:$a)), - (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1)) + (i1 (DivergentUnaryFrag<trunc> i16:$a)), + (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1)) 
>; def : GCNPat < - (i1 (trunc i64:$a)), - (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), + (i1 (DivergentUnaryFrag<trunc> i64:$a)), + (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1)) >; @@ -2405,21 +2429,37 @@ def : GCNPat < // COPY is workaround tablegen bug from multiple outputs // from S_LSHL_B32's multiple outputs from implicit scc def. def : GCNPat < - (v2i16 (build_vector (i16 0), (i16 SReg_32:$src1))), + (v2i16 (UniformBinFrag<build_vector> (i16 0), (i16 SReg_32:$src1))), (S_LSHL_B32 SReg_32:$src1, (i16 16)) >; def : GCNPat < - (v2i16 (build_vector (i16 SReg_32:$src1), (i16 0))), + (v2i16 (DivergentBinFrag<build_vector> (i16 0), (i16 SReg_32:$src1))), + (v2i16 (V_LSHLREV_B32_e64 (i16 16), SReg_32:$src1)) +>; + + +def : GCNPat < + (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src1), (i16 0))), (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1) >; def : GCNPat < - (v2f16 (build_vector (f16 SReg_32:$src1), (f16 FP_ZERO))), + (v2i16 (DivergentBinFrag<build_vector> (i16 SReg_32:$src1), (i16 0))), + (v2i16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), SReg_32:$src1)) +>; + +def : GCNPat < + (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src1), (f16 FP_ZERO))), (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1) >; def : GCNPat < + (v2f16 (DivergentBinFrag<build_vector> (f16 SReg_32:$src1), (f16 FP_ZERO))), + (v2f16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), SReg_32:$src1)) +>; + +def : GCNPat < (v2i16 (build_vector (i16 SReg_32:$src0), (i16 undef))), (COPY_TO_REGCLASS SReg_32:$src0, SReg_32) >; @@ -2435,42 +2475,74 @@ def : GCNPat < >; def : GCNPat < - (v2i16 (build_vector (i16 undef), (i16 SReg_32:$src1))), + (v2i16 (UniformBinFrag<build_vector> (i16 undef), (i16 SReg_32:$src1))), (S_LSHL_B32 SReg_32:$src1, (i32 16)) >; def : GCNPat < - (v2f16 (build_vector (f16 undef), (f16 SReg_32:$src1))), + (v2i16 (DivergentBinFrag<build_vector> (i16 undef), (i16 SReg_32:$src1))), + (v2i16 (V_LSHLREV_B32_e64 (i32 16), 
SReg_32:$src1)) +>; + + +def : GCNPat < + (v2f16 (UniformBinFrag<build_vector> (f16 undef), (f16 SReg_32:$src1))), (S_LSHL_B32 SReg_32:$src1, (i32 16)) >; +def : GCNPat < + (v2f16 (DivergentBinFrag<build_vector> (f16 undef), (f16 SReg_32:$src1))), + (v2f16 (V_LSHLREV_B32_e64 (i32 16), SReg_32:$src1)) +>; + let SubtargetPredicate = HasVOP3PInsts in { def : GCNPat < - (v2i16 (build_vector (i16 SReg_32:$src0), (i16 SReg_32:$src1))), + (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src0), (i16 SReg_32:$src1))), (S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1) >; +def : GCNPat < + (v2i16 (DivergentBinFrag<build_vector> (i16 SReg_32:$src0), (i16 SReg_32:$src1))), + (v2i16 (V_LSHL_OR_B32_e64 $src1, (i32 16), (i32 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), $src0)))) +>; + // With multiple uses of the shift, this will duplicate the shift and // increase register pressure. def : GCNPat < - (v2i16 (build_vector (i16 SReg_32:$src0), (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))), + (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src0), (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))), (v2i16 (S_PACK_LH_B32_B16 SReg_32:$src0, SReg_32:$src1)) >; +def : GCNPat < + (v2i16 (DivergentBinFrag<build_vector> (i16 SReg_32:$src0), (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))), + (v2i16 (V_BFI_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), SReg_32:$src0, SReg_32:$src1)) +>; + def : GCNPat < - (v2i16 (build_vector (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))), + (v2i16 (UniformBinFrag<build_vector> (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))), (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))), (S_PACK_HH_B32_B16 SReg_32:$src0, SReg_32:$src1) >; -// TODO: Should source modifiers be matched to v_pack_b32_f16? 
def : GCNPat < - (v2f16 (build_vector (f16 SReg_32:$src0), (f16 SReg_32:$src1))), + (v2i16 (DivergentBinFrag<build_vector> (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))), + (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))), + (v2i16 (V_AND_OR_B32_e64 SReg_32:$src1, (i32 (V_MOV_B32_e32 (i32 0xffff0000))), (i32 (V_LSHRREV_B32_e64 (i32 16), SReg_32:$src0)))) +>; + +def : GCNPat < + (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src0), (f16 SReg_32:$src1))), (S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1) >; def : GCNPat < + (v2f16 (DivergentBinFrag<build_vector> (f16 SReg_32:$src0), (f16 SReg_32:$src1))), + (v2f16 (V_LSHL_OR_B32_e64 SReg_32:$src1, (i32 16), (i32 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), SReg_32:$src0)))) +>; + + +def : GCNPat < (v2f16 (is_canonicalized<build_vector> (f16 (VOP3Mods (f16 VGPR_32:$src0), i32:$src0_mods)), (f16 (VOP3Mods (f16 VGPR_32:$src1), i32:$src1_mods)))), (V_PACK_B32_F16_e64 $src0_mods, VGPR_32:$src0, $src1_mods, VGPR_32:$src1) @@ -2866,6 +2938,18 @@ def G_AMDGPU_UMED3 : AMDGPUGenericInstruction { let hasSideEffects = 0; } +def G_AMDGPU_FMED3 : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2); + let hasSideEffects = 0; +} + +def G_AMDGPU_CLAMP : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src); + let hasSideEffects = 0; +} + // Atomic cmpxchg. $cmpval ad $newval are packed in a single vector // operand Expects a MachineMemOperand in addition to explicit // operands. diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index c4007f56f350..3ce368ef4db9 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -62,11 +62,6 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) // calls. 
const bool HasCalls = F.hasFnAttribute("amdgpu-calls"); - // Enable all kernel inputs if we have the fixed ABI. Don't bother if we don't - // have any calls. - const bool UseFixedABI = AMDGPUTargetMachine::EnableFixedFunctionABI && - CC != CallingConv::AMDGPU_Gfx && - (!isEntryFunction() || HasCalls); const bool IsKernel = CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL; @@ -80,7 +75,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) } if (!isEntryFunction()) { - if (UseFixedABI) + if (CC != CallingConv::AMDGPU_Gfx) ArgInfo = AMDGPUArgumentUsageInfo::FixedABIFunctionInfo; // TODO: Pick a high register, and shift down, similar to a kernel. @@ -110,20 +105,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) else if (ST.isMesaGfxShader(F)) ImplicitBufferPtr = true; - if (UseFixedABI) { - DispatchPtr = true; - QueuePtr = true; - ImplicitArgPtr = true; - WorkGroupIDX = true; - WorkGroupIDY = true; - WorkGroupIDZ = true; - WorkItemIDX = true; - WorkItemIDY = true; - WorkItemIDZ = true; - - // FIXME: We don't need this? 
- DispatchID = true; - } else if (!AMDGPU::isGraphics(CC)) { + if (!AMDGPU::isGraphics(CC)) { if (IsKernel || !F.hasFnAttribute("amdgpu-no-workgroup-id-x")) WorkGroupIDX = true; @@ -462,7 +444,7 @@ void SIMachineFunctionInfo::removeDeadFrameIndices(MachineFrameInfo &MFI) { MFI.setStackID(i, TargetStackID::Default); for (auto &R : VGPRToAGPRSpills) { - if (R.second.FullyAllocated) + if (R.second.IsDead) MFI.RemoveStackObject(R.first); } } diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index c305bc20e40d..8accbf611c5f 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -465,6 +465,7 @@ public: struct VGPRSpillToAGPR { SmallVector<MCPhysReg, 32> Lanes; bool FullyAllocated = false; + bool IsDead = false; }; // Map WWM VGPR to a stack slot that is used to save/restore it in the @@ -546,6 +547,12 @@ public: : I->second.Lanes[Lane]; } + void setVGPRToAGPRSpillDead(int FrameIndex) { + auto I = VGPRToAGPRSpills.find(FrameIndex); + if (I != VGPRToAGPRSpills.end()) + I->second.IsDead = true; + } + bool haveFreeLanesForSGPRSpill(const MachineFunction &MF, unsigned NumLane) const; bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI); diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp index 5590d84cc3ab..81db66a98ddf 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -869,29 +869,27 @@ void SIScheduleBlockCreator::colorComputeReservedDependencies() { } void SIScheduleBlockCreator::colorAccordingToReservedDependencies() { - unsigned DAGSize = DAG->SUnits.size(); std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations; // Every combination of colors given by the top down // and bottom up Reserved node dependency - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &DAG->SUnits[i]; + for (const SUnit &SU : 
DAG->SUnits) { std::pair<unsigned, unsigned> SUColors; // High latency instructions: already given. - if (CurrentColoring[SU->NodeNum]) + if (CurrentColoring[SU.NodeNum]) continue; - SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum]; - SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum]; + SUColors.first = CurrentTopDownReservedDependencyColoring[SU.NodeNum]; + SUColors.second = CurrentBottomUpReservedDependencyColoring[SU.NodeNum]; std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos = ColorCombinations.find(SUColors); if (Pos != ColorCombinations.end()) { - CurrentColoring[SU->NodeNum] = Pos->second; + CurrentColoring[SU.NodeNum] = Pos->second; } else { - CurrentColoring[SU->NodeNum] = NextNonReservedID; + CurrentColoring[SU.NodeNum] = NextNonReservedID; ColorCombinations[SUColors] = NextNonReservedID++; } } @@ -1232,15 +1230,13 @@ void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVaria } // Free root and leafs of all blocks to enable scheduling inside them. 
- for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { - SIScheduleBlock *Block = CurrentBlocks[i]; + for (SIScheduleBlock *Block : CurrentBlocks) Block->finalizeUnits(); - } - LLVM_DEBUG(dbgs() << "Blocks created:\n\n"; - for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { - SIScheduleBlock *Block = CurrentBlocks[i]; - Block->printDebug(true); - }); + LLVM_DEBUG({ + dbgs() << "Blocks created:\n\n"; + for (SIScheduleBlock *Block : CurrentBlocks) + Block->printDebug(true); + }); } // Two functions taken from Codegen/MachineScheduler.cpp @@ -1379,9 +1375,9 @@ void SIScheduleBlockCreator::scheduleInsideBlocks() { } } - LLVM_DEBUG(for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { - SIScheduleBlock *Block = CurrentBlocks[i]; - Block->printDebug(true); + LLVM_DEBUG({ + for (SIScheduleBlock *Block : CurrentBlocks) + Block->printDebug(true); }); } @@ -1437,8 +1433,7 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, // found for several parents, we increment the usage of the one with the // highest topological index. LiveOutRegsNumUsages.resize(Blocks.size()); - for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { - SIScheduleBlock *Block = Blocks[i]; + for (SIScheduleBlock *Block : Blocks) { for (unsigned Reg : Block->getInRegs()) { bool Found = false; int topoInd = -1; @@ -1502,8 +1497,7 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, // Fill LiveRegsConsumers for regs that were already // defined before scheduling. 
- for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { - SIScheduleBlock *Block = Blocks[i]; + for (SIScheduleBlock *Block : Blocks) { for (unsigned Reg : Block->getInRegs()) { bool Found = false; for (SIScheduleBlock* Pred: Block->getPreds()) { @@ -1700,10 +1694,7 @@ void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) { decreaseLiveRegs(Block, Block->getInRegs()); addLiveRegs(Block->getOutRegs()); releaseBlockSuccs(Block); - for (std::map<unsigned, unsigned>::iterator RegI = - LiveOutRegsNumUsages[Block->getID()].begin(), - E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) { - std::pair<unsigned, unsigned> RegP = *RegI; + for (const auto &RegP : LiveOutRegsNumUsages[Block->getID()]) { // We produce this register, thus it must not be previously alive. assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() || LiveRegsConsumers[RegP.first] == 0); @@ -1759,8 +1750,7 @@ SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant, ScheduledBlocks = Scheduler.getBlocks(); - for (unsigned b = 0; b < ScheduledBlocks.size(); ++b) { - SIScheduleBlock *Block = ScheduledBlocks[b]; + for (SIScheduleBlock *Block : ScheduledBlocks) { std::vector<SUnit*> SUs = Block->getScheduledUnits(); for (SUnit* SU : SUs) @@ -2000,9 +1990,8 @@ void SIScheduleDAGMI::schedule() assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker"); TopRPTracker.setPos(CurrentTop); - for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(), - E = ScheduledSUnits.end(); I != E; ++I) { - SUnit *SU = &SUnits[*I]; + for (unsigned I : ScheduledSUnits) { + SUnit *SU = &SUnits[I]; scheduleMI(SU, true); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index a1d9a23a5084..21aed4ececb5 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -210,6 +210,7 @@ struct SGPRSpillBuilder { auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), 
ExecReg).addReg(ExecReg); if (!TmpVGPRLive) I.addReg(TmpVGPR, RegState::ImplicitDefine); + I->getOperand(2).setIsDead(true); // Mark SCC as dead. TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false); } } @@ -242,9 +243,10 @@ struct SGPRSpillBuilder { TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true, /*IsKill*/ false); auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); - if (!TmpVGPRLive) { + if (!TmpVGPRLive) I.addReg(TmpVGPR, RegState::ImplicitKill); - } + I->getOperand(2).setIsDead(true); // Mark SCC as dead. + // Restore active lanes if (TmpVGPRLive) TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true); @@ -267,9 +269,11 @@ struct SGPRSpillBuilder { TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad, /*IsKill*/ false); // Spill inactive lanes - BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); + auto Not0 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); + Not0->getOperand(2).setIsDead(); // Mark SCC as dead. TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad); - BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); + auto Not1 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); + Not1->getOperand(2).setIsDead(); // Mark SCC as dead. 
} } @@ -908,6 +912,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_V1024_RESTORE: case AMDGPU::SI_SPILL_A1024_SAVE: case AMDGPU::SI_SPILL_A1024_RESTORE: + case AMDGPU::SI_SPILL_AV1024_SAVE: + case AMDGPU::SI_SPILL_AV1024_RESTORE: return 32; case AMDGPU::SI_SPILL_S512_SAVE: case AMDGPU::SI_SPILL_S512_RESTORE: @@ -915,6 +921,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_V512_RESTORE: case AMDGPU::SI_SPILL_A512_SAVE: case AMDGPU::SI_SPILL_A512_RESTORE: + case AMDGPU::SI_SPILL_AV512_SAVE: + case AMDGPU::SI_SPILL_AV512_RESTORE: return 16; case AMDGPU::SI_SPILL_S256_SAVE: case AMDGPU::SI_SPILL_S256_RESTORE: @@ -922,6 +930,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_V256_RESTORE: case AMDGPU::SI_SPILL_A256_SAVE: case AMDGPU::SI_SPILL_A256_RESTORE: + case AMDGPU::SI_SPILL_AV256_SAVE: + case AMDGPU::SI_SPILL_AV256_RESTORE: return 8; case AMDGPU::SI_SPILL_S224_SAVE: case AMDGPU::SI_SPILL_S224_RESTORE: @@ -929,6 +939,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_V224_RESTORE: case AMDGPU::SI_SPILL_A224_SAVE: case AMDGPU::SI_SPILL_A224_RESTORE: + case AMDGPU::SI_SPILL_AV224_SAVE: + case AMDGPU::SI_SPILL_AV224_RESTORE: return 7; case AMDGPU::SI_SPILL_S192_SAVE: case AMDGPU::SI_SPILL_S192_RESTORE: @@ -936,6 +948,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_V192_RESTORE: case AMDGPU::SI_SPILL_A192_SAVE: case AMDGPU::SI_SPILL_A192_RESTORE: + case AMDGPU::SI_SPILL_AV192_SAVE: + case AMDGPU::SI_SPILL_AV192_RESTORE: return 6; case AMDGPU::SI_SPILL_S160_SAVE: case AMDGPU::SI_SPILL_S160_RESTORE: @@ -943,6 +957,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_V160_RESTORE: case AMDGPU::SI_SPILL_A160_SAVE: case AMDGPU::SI_SPILL_A160_RESTORE: + case AMDGPU::SI_SPILL_AV160_SAVE: + case AMDGPU::SI_SPILL_AV160_RESTORE: return 5; case AMDGPU::SI_SPILL_S128_SAVE: case 
AMDGPU::SI_SPILL_S128_RESTORE: @@ -950,6 +966,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_V128_RESTORE: case AMDGPU::SI_SPILL_A128_SAVE: case AMDGPU::SI_SPILL_A128_RESTORE: + case AMDGPU::SI_SPILL_AV128_SAVE: + case AMDGPU::SI_SPILL_AV128_RESTORE: return 4; case AMDGPU::SI_SPILL_S96_SAVE: case AMDGPU::SI_SPILL_S96_RESTORE: @@ -957,6 +975,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_V96_RESTORE: case AMDGPU::SI_SPILL_A96_SAVE: case AMDGPU::SI_SPILL_A96_RESTORE: + case AMDGPU::SI_SPILL_AV96_SAVE: + case AMDGPU::SI_SPILL_AV96_RESTORE: return 3; case AMDGPU::SI_SPILL_S64_SAVE: case AMDGPU::SI_SPILL_S64_RESTORE: @@ -964,6 +984,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_V64_RESTORE: case AMDGPU::SI_SPILL_A64_SAVE: case AMDGPU::SI_SPILL_A64_RESTORE: + case AMDGPU::SI_SPILL_AV64_SAVE: + case AMDGPU::SI_SPILL_AV64_RESTORE: return 2; case AMDGPU::SI_SPILL_S32_SAVE: case AMDGPU::SI_SPILL_S32_RESTORE: @@ -971,6 +993,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_V32_RESTORE: case AMDGPU::SI_SPILL_A32_SAVE: case AMDGPU::SI_SPILL_A32_RESTORE: + case AMDGPU::SI_SPILL_AV32_SAVE: + case AMDGPU::SI_SPILL_AV32_RESTORE: return 1; default: llvm_unreachable("Invalid spill opcode"); } @@ -1240,9 +1264,10 @@ void SIRegisterInfo::buildSpillLoadStore( if (ScratchOffsetReg == AMDGPU::NoRegister) { BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset); } else { - BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset) + auto Add = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset) .addReg(ScratchOffsetReg) .addImm(Offset); + Add->getOperand(3).setIsDead(); // Mark SCC as dead. 
} Offset = 0; @@ -1810,7 +1835,17 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, case AMDGPU::SI_SPILL_A128_SAVE: case AMDGPU::SI_SPILL_A96_SAVE: case AMDGPU::SI_SPILL_A64_SAVE: - case AMDGPU::SI_SPILL_A32_SAVE: { + case AMDGPU::SI_SPILL_A32_SAVE: + case AMDGPU::SI_SPILL_AV1024_SAVE: + case AMDGPU::SI_SPILL_AV512_SAVE: + case AMDGPU::SI_SPILL_AV256_SAVE: + case AMDGPU::SI_SPILL_AV224_SAVE: + case AMDGPU::SI_SPILL_AV192_SAVE: + case AMDGPU::SI_SPILL_AV160_SAVE: + case AMDGPU::SI_SPILL_AV128_SAVE: + case AMDGPU::SI_SPILL_AV96_SAVE: + case AMDGPU::SI_SPILL_AV64_SAVE: + case AMDGPU::SI_SPILL_AV32_SAVE: { const MachineOperand *VData = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == @@ -1846,7 +1881,17 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, case AMDGPU::SI_SPILL_A224_RESTORE: case AMDGPU::SI_SPILL_A256_RESTORE: case AMDGPU::SI_SPILL_A512_RESTORE: - case AMDGPU::SI_SPILL_A1024_RESTORE: { + case AMDGPU::SI_SPILL_A1024_RESTORE: + case AMDGPU::SI_SPILL_AV32_RESTORE: + case AMDGPU::SI_SPILL_AV64_RESTORE: + case AMDGPU::SI_SPILL_AV96_RESTORE: + case AMDGPU::SI_SPILL_AV128_RESTORE: + case AMDGPU::SI_SPILL_AV160_RESTORE: + case AMDGPU::SI_SPILL_AV192_RESTORE: + case AMDGPU::SI_SPILL_AV224_RESTORE: + case AMDGPU::SI_SPILL_AV256_RESTORE: + case AMDGPU::SI_SPILL_AV512_RESTORE: + case AMDGPU::SI_SPILL_AV1024_RESTORE: { const MachineOperand *VData = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 3a372d4519fb..c8f1daf26de9 100644 --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -731,11 +731,6 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { continue; } - // getVOPe32 could be 
-1 here if we started with an instruction that had - // a 32-bit encoding and then commuted it to an instruction that did not. - if (!TII->hasVALU32BitEncoding(MI.getOpcode())) - continue; - int Op32 = AMDGPU::getVOPe32(MI.getOpcode()); if (TII->isVOPC(Op32)) { @@ -776,10 +771,6 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst); - // Check the carry-in operand for v_addc_u32_e64. - const MachineOperand *Src2 = TII->getNamedOperand(MI, - AMDGPU::OpName::src2); - if (SDst) { bool Next = false; @@ -791,6 +782,8 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { // All of the instructions with carry outs also have an SGPR input in // src2. + const MachineOperand *Src2 = TII->getNamedOperand(MI, + AMDGPU::OpName::src2); if (Src2 && Src2->getReg() != VCCReg) { if (Src2->getReg().isVirtual()) MRI.setRegAllocationHint(Src2->getReg(), 0, VCCReg); diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 46012e5d7d97..77ee3c0ff0e4 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -495,11 +495,10 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, // instruction as needing e.g. WQM before visiting it and realizing it needs // WQM disabled. 
ReversePostOrderTraversal<MachineFunction *> RPOT(&MF); - for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) { - MachineBasicBlock &MBB = **BI; - BlockInfo &BBI = Blocks[&MBB]; + for (MachineBasicBlock *MBB : RPOT) { + BlockInfo &BBI = Blocks[MBB]; - for (MachineInstr &MI : MBB) { + for (MachineInstr &MI : *MBB) { InstrInfo &III = Instructions[&MI]; unsigned Opcode = MI.getOpcode(); char Flags = 0; @@ -561,7 +560,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, BBI.Needs |= StateExact; if (!(BBI.InNeeds & StateExact)) { BBI.InNeeds |= StateExact; - Worklist.push_back(&MBB); + Worklist.push_back(MBB); } GlobalFlags |= StateExact; III.Disabled = StateWQM | StateStrict; diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td index 8502ed61b366..184c871db775 100644 --- a/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -181,15 +181,8 @@ class SM_Time_Pseudo<string opName, SDPatternOperator node = null_frag> : SM_Pse " $sdst", [(set i64:$sdst, (node))]> { let hasSideEffects = 1; - // FIXME: This should be definitively mayStore = 0. TableGen - // brokenly tries to infer these based on the intrinsic properties - // corresponding to the IR attributes. The target intrinsics are - // considered as writing to memory for IR dependency purposes, but - // those can be modeled with hasSideEffects here. These also end up - // inferring differently for llvm.readcyclecounter and the amdgcn - // intrinsics. 
- let mayStore = ?; - let mayLoad = 1; + let mayStore = 0; + let mayLoad = 0; let has_sbase = 0; let has_offset = 0; } @@ -765,11 +758,11 @@ def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ return isUniformL }]; } -def SMRDImm : ComplexPattern<i64, 2, "SelectSMRDImm">; -def SMRDImm32 : ComplexPattern<i64, 2, "SelectSMRDImm32">; -def SMRDSgpr : ComplexPattern<i64, 2, "SelectSMRDSgpr">; -def SMRDBufferImm : ComplexPattern<i32, 1, "SelectSMRDBufferImm">; -def SMRDBufferImm32 : ComplexPattern<i32, 1, "SelectSMRDBufferImm32">; +def SMRDImm : ComplexPattern<iPTR, 2, "SelectSMRDImm">; +def SMRDImm32 : ComplexPattern<iPTR, 2, "SelectSMRDImm32">; +def SMRDSgpr : ComplexPattern<iPTR, 2, "SelectSMRDSgpr">; +def SMRDBufferImm : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm">; +def SMRDBufferImm32 : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm32">; multiclass SMRD_Pattern <string Instr, ValueType vt> { diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 61ecc13620a1..1713586dcf5b 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -157,6 +157,42 @@ class SOP1_1 <string opName, RegisterClass rc = SReg_64, list<dag> pattern=[]> : let has_sdst = 0; } +class UniformUnaryFrag<SDPatternOperator Op> : PatFrag < + (ops node:$src0), + (Op $src0), + [{ return !N->isDivergent(); }]> { + // This check is unnecessary as it's captured by the result register + // bank constraint. + // + // FIXME: Should add a way for the emitter to recognize this is a + // trivially true predicate to eliminate the check. + let GISelPredicateCode = [{return true;}]; +} + +class UniformBinFrag<SDPatternOperator Op> : PatFrag < + (ops node:$src0, node:$src1), + (Op $src0, $src1), + [{ return !N->isDivergent(); }]> { + // This check is unnecessary as it's captured by the result register + // bank constraint. 
+ // + // FIXME: Should add a way for the emitter to recognize this is a + // trivially true predicate to eliminate the check. + let GISelPredicateCode = [{return true;}]; +} + +class DivergentBinFrag<SDPatternOperator Op> : PatFrag < + (ops node:$src0, node:$src1), + (Op $src0, $src1), + [{ return N->isDivergent(); }]> { + // This check is unnecessary as it's captured by the result register + // bank constraint. + // + // FIXME: Should add a way for the emitter to recognize this is a + // trivially true predicate to eliminate the check. + let GISelPredicateCode = [{return true;}]; +} + let isMoveImm = 1 in { let isReMaterializable = 1, isAsCheapAsAMove = 1 in { @@ -172,11 +208,11 @@ let isMoveImm = 1 in { let Defs = [SCC] in { def S_NOT_B32 : SOP1_32 <"s_not_b32", - [(set i32:$sdst, (not i32:$src0))] + [(set i32:$sdst, (UniformUnaryFrag<not> i32:$src0))] >; def S_NOT_B64 : SOP1_64 <"s_not_b64", - [(set i64:$sdst, (not i64:$src0))] + [(set i64:$sdst, (UniformUnaryFrag<not> i64:$src0))] >; def S_WQM_B32 : SOP1_32 <"s_wqm_b32">; def S_WQM_B64 : SOP1_64 <"s_wqm_b64">; @@ -221,22 +257,22 @@ let isReMaterializable = 1 in { def S_FF0_I32_B32 : SOP1_32 <"s_ff0_i32_b32">; def S_FF0_I32_B64 : SOP1_32_64 <"s_ff0_i32_b64">; def S_FF1_I32_B64 : SOP1_32_64 <"s_ff1_i32_b64", - [(set i32:$sdst, (AMDGPUffbl_b32 i64:$src0))] + [(set i32:$sdst, (UniformUnaryFrag<AMDGPUffbl_b32> i64:$src0))] >; def S_FF1_I32_B32 : SOP1_32 <"s_ff1_i32_b32", - [(set i32:$sdst, (AMDGPUffbl_b32 i32:$src0))] + [(set i32:$sdst, (UniformUnaryFrag<AMDGPUffbl_b32> i32:$src0))] >; def S_FLBIT_I32_B32 : SOP1_32 <"s_flbit_i32_b32", - [(set i32:$sdst, (AMDGPUffbh_u32 i32:$src0))] + [(set i32:$sdst, (UniformUnaryFrag<AMDGPUffbh_u32> i32:$src0))] >; def S_FLBIT_I32_B64 : SOP1_32_64 <"s_flbit_i32_b64", - [(set i32:$sdst, (AMDGPUffbh_u32 i64:$src0))] + [(set i32:$sdst, (UniformUnaryFrag<AMDGPUffbh_u32> i64:$src0))] >; def S_FLBIT_I32 : SOP1_32 <"s_flbit_i32", - [(set i32:$sdst, (AMDGPUffbh_i32 i32:$src0))] + [(set 
i32:$sdst, (UniformUnaryFrag<AMDGPUffbh_i32> i32:$src0))] >; def S_FLBIT_I32_I64 : SOP1_32_64 <"s_flbit_i32_i64">; def S_SEXT_I32_I8 : SOP1_32 <"s_sext_i32_i8", @@ -426,41 +462,6 @@ class SOP2_64_32_32 <string opName, list<dag> pattern=[]> : SOP2_Pseudo < "$sdst, $src0, $src1", pattern >; -class UniformUnaryFrag<SDPatternOperator Op> : PatFrag < - (ops node:$src0), - (Op $src0), - [{ return !N->isDivergent(); }]> { - // This check is unnecessary as it's captured by the result register - // bank constraint. - // - // FIXME: Should add a way for the emitter to recognize this is a - // trivially true predicate to eliminate the check. - let GISelPredicateCode = [{return true;}]; -} - -class UniformBinFrag<SDPatternOperator Op> : PatFrag < - (ops node:$src0, node:$src1), - (Op $src0, $src1), - [{ return !N->isDivergent(); }]> { - // This check is unnecessary as it's captured by the result register - // bank constraint. - // - // FIXME: Should add a way for the emitter to recognize this is a - // trivially true predicate to eliminate the check. - let GISelPredicateCode = [{return true;}]; -} - -class DivergentBinFrag<SDPatternOperator Op> : PatFrag < - (ops node:$src0, node:$src1), - (Op $src0, $src1), - [{ return N->isDivergent(); }]> { - // This check is unnecessary as it's captured by the result register - // bank constraint. - // - // FIXME: Should add a way for the emitter to recognize this is a - // trivially true predicate to eliminate the check. 
- let GISelPredicateCode = [{return true;}]; -} let Defs = [SCC] in { // Carry out goes to SCC let isCommutable = 1 in { @@ -485,19 +486,18 @@ def S_SUBB_U32 : SOP2_32 <"s_subb_u32", [(set i32:$sdst, (UniformBinFrag<sube> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]>; } // End Uses = [SCC] - let isCommutable = 1 in { def S_MIN_I32 : SOP2_32 <"s_min_i32", - [(set i32:$sdst, (smin i32:$src0, i32:$src1))] + [(set i32:$sdst, (UniformBinFrag<smin> i32:$src0, i32:$src1))] >; def S_MIN_U32 : SOP2_32 <"s_min_u32", - [(set i32:$sdst, (umin i32:$src0, i32:$src1))] + [(set i32:$sdst, (UniformBinFrag<umin> i32:$src0, i32:$src1))] >; def S_MAX_I32 : SOP2_32 <"s_max_i32", - [(set i32:$sdst, (smax i32:$src0, i32:$src1))] + [(set i32:$sdst, (UniformBinFrag<smax> i32:$src0, i32:$src1))] >; def S_MAX_U32 : SOP2_32 <"s_max_u32", - [(set i32:$sdst, (umax i32:$src0, i32:$src1))] + [(set i32:$sdst, (UniformBinFrag<umax> i32:$src0, i32:$src1))] >; } // End isCommutable = 1 } // End Defs = [SCC] @@ -870,7 +870,7 @@ def S_GETREG_B32 : SOPK_Pseudo < } } // End mayLoad = 1 -let mayLoad = 0, mayStore = 0, Defs = [MODE], Uses = [MODE] in { +let Defs = [MODE], Uses = [MODE] in { // FIXME: Need to truncate immediate to 16-bits. 
class S_SETREG_B32_Pseudo <list<dag> pattern=[]> : SOPK_Pseudo < @@ -914,7 +914,7 @@ def S_SETREG_IMM32_B32_mode : S_SETREG_IMM32_B32_Pseudo { let hasSideEffects = 0; } -} // End mayLoad = 0, mayStore = 0, Defs = [MODE], Uses = [MODE] +} // End Defs = [MODE], Uses = [MODE] class SOPK_WAITCNT<string opName, list<dag> pat=[]> : SOPK_Pseudo< @@ -1264,7 +1264,7 @@ def S_WAKEUP : SOPP_Pseudo <"s_wakeup", (ins) > { let mayStore = 1; } -let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in +let hasSideEffects = 1 in def S_WAITCNT : SOPP_Pseudo <"s_waitcnt" , (ins WAIT_FLAG:$simm16), "$simm16", [(int_amdgcn_s_waitcnt timm:$simm16)]>; def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16", @@ -1278,8 +1278,6 @@ def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">; def S_SLEEP : SOPP_Pseudo <"s_sleep", (ins i32imm:$simm16), "$simm16", [(int_amdgcn_s_sleep timm:$simm16)]> { let hasSideEffects = 1; - let mayLoad = 0; - let mayStore = 0; } def S_SETPRIO : SOPP_Pseudo <"s_setprio" , (ins i16imm:$simm16), "$simm16">; @@ -1305,14 +1303,10 @@ def S_ICACHE_INV : SOPP_Pseudo <"s_icache_inv", (ins)> { def S_INCPERFLEVEL : SOPP_Pseudo <"s_incperflevel", (ins i32imm:$simm16), "$simm16", [(int_amdgcn_s_incperflevel timm:$simm16)]> { let hasSideEffects = 1; - let mayLoad = 0; - let mayStore = 0; } def S_DECPERFLEVEL : SOPP_Pseudo <"s_decperflevel", (ins i32imm:$simm16), "$simm16", [(int_amdgcn_s_decperflevel timm:$simm16)]> { let hasSideEffects = 1; - let mayLoad = 0; - let mayStore = 0; } def S_TTRACEDATA : SOPP_Pseudo <"s_ttracedata", (ins)> { let simm16 = 0; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp index 2e4d83fbbc39..a83ff6667956 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp @@ -15,7 +15,6 @@ #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SetVector.h" -#include 
"llvm/Analysis/CallGraph.h" #include "llvm/IR/Constants.h" #include "llvm/IR/ReplaceConstant.h" @@ -25,175 +24,6 @@ namespace llvm { namespace AMDGPU { -// An helper class for collecting all reachable callees for each kernel defined -// within the module. -class CollectReachableCallees { - Module &M; - CallGraph CG; - SmallPtrSet<CallGraphNode *, 8> AddressTakenFunctions; - - // Collect all address taken functions within the module. - void collectAddressTakenFunctions() { - auto *ECNode = CG.getExternalCallingNode(); - - for (auto GI = ECNode->begin(), GE = ECNode->end(); GI != GE; ++GI) { - auto *CGN = GI->second; - auto *F = CGN->getFunction(); - if (!F || F->isDeclaration() || AMDGPU::isKernelCC(F)) - continue; - AddressTakenFunctions.insert(CGN); - } - } - - // For given kernel, collect all its reachable non-kernel functions. - SmallPtrSet<Function *, 8> collectReachableCallees(Function *K) { - SmallPtrSet<Function *, 8> ReachableCallees; - - // Call graph node which represents this kernel. - auto *KCGN = CG[K]; - - // Go through all call graph nodes reachable from the node representing this - // kernel, visit all their call sites, if the call site is direct, add - // corresponding callee to reachable callee set, if it is indirect, resolve - // the indirect call site to potential reachable callees, add them to - // reachable callee set, and repeat the process for the newly added - // potential callee nodes. - // - // FIXME: Need to handle bit-casted function pointers. - // - SmallVector<CallGraphNode *, 8> CGNStack(df_begin(KCGN), df_end(KCGN)); - SmallPtrSet<CallGraphNode *, 8> VisitedCGNodes; - while (!CGNStack.empty()) { - auto *CGN = CGNStack.pop_back_val(); - - if (!VisitedCGNodes.insert(CGN).second) - continue; - - // Ignore call graph node which does not have associated function or - // associated function is not a definition. 
- if (!CGN->getFunction() || CGN->getFunction()->isDeclaration()) - continue; - - for (auto GI = CGN->begin(), GE = CGN->end(); GI != GE; ++GI) { - auto *RCB = cast<CallBase>(GI->first.getValue()); - auto *RCGN = GI->second; - - if (auto *DCallee = RCGN->getFunction()) { - ReachableCallees.insert(DCallee); - } else if (RCB->isIndirectCall()) { - auto *RCBFTy = RCB->getFunctionType(); - for (auto *ACGN : AddressTakenFunctions) { - auto *ACallee = ACGN->getFunction(); - if (ACallee->getFunctionType() == RCBFTy) { - ReachableCallees.insert(ACallee); - CGNStack.append(df_begin(ACGN), df_end(ACGN)); - } - } - } - } - } - - return ReachableCallees; - } - -public: - explicit CollectReachableCallees(Module &M) : M(M), CG(CallGraph(M)) { - // Collect address taken functions. - collectAddressTakenFunctions(); - } - - void collectReachableCallees( - DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) { - // Collect reachable callee set for each kernel defined in the module. - for (Function &F : M.functions()) { - if (!AMDGPU::isKernelCC(&F)) - continue; - Function *K = &F; - KernelToCallees[K] = collectReachableCallees(K); - } - } -}; - -void collectReachableCallees( - Module &M, - DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) { - CollectReachableCallees CRC{M}; - CRC.collectReachableCallees(KernelToCallees); -} - -SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV) { - SmallPtrSet<Function *, 8> LDSAccessors; - SmallVector<User *, 8> UserStack(GV->users()); - SmallPtrSet<User *, 8> VisitedUsers; - - while (!UserStack.empty()) { - auto *U = UserStack.pop_back_val(); - - // `U` is already visited? continue to next one. - if (!VisitedUsers.insert(U).second) - continue; - - // `U` is a global variable which is initialized with LDS. Ignore LDS. - if (isa<GlobalValue>(U)) - return SmallPtrSet<Function *, 8>(); - - // Recursively explore constant users. 
- if (isa<Constant>(U)) { - append_range(UserStack, U->users()); - continue; - } - - // `U` should be an instruction, if it belongs to a non-kernel function F, - // then collect F. - Function *F = cast<Instruction>(U)->getFunction(); - if (!AMDGPU::isKernelCC(F)) - LDSAccessors.insert(F); - } - - return LDSAccessors; -} - -DenseMap<Function *, SmallPtrSet<Instruction *, 8>> -getFunctionToInstsMap(User *U, bool CollectKernelInsts) { - DenseMap<Function *, SmallPtrSet<Instruction *, 8>> FunctionToInsts; - SmallVector<User *, 8> UserStack; - SmallPtrSet<User *, 8> VisitedUsers; - - UserStack.push_back(U); - - while (!UserStack.empty()) { - auto *UU = UserStack.pop_back_val(); - - if (!VisitedUsers.insert(UU).second) - continue; - - if (isa<GlobalValue>(UU)) - continue; - - if (isa<Constant>(UU)) { - append_range(UserStack, UU->users()); - continue; - } - - auto *I = cast<Instruction>(UU); - Function *F = I->getFunction(); - if (CollectKernelInsts) { - if (!AMDGPU::isKernelCC(F)) { - continue; - } - } else { - if (AMDGPU::isKernelCC(F)) { - continue; - } - } - - FunctionToInsts.insert(std::make_pair(F, SmallPtrSet<Instruction *, 8>())); - FunctionToInsts[F].insert(I); - } - - return FunctionToInsts; -} - bool isKernelCC(const Function *Func) { return AMDGPU::isModuleEntryFunctionCC(Func->getCallingConv()); } @@ -232,26 +62,8 @@ void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F) { } } -bool hasUserInstruction(const GlobalValue *GV) { - SmallPtrSet<const User *, 8> Visited; - SmallVector<const User *, 16> Stack(GV->users()); - - while (!Stack.empty()) { - const User *U = Stack.pop_back_val(); - - if (!Visited.insert(U).second) - continue; - - if (isa<Instruction>(U)) - return true; - - append_range(Stack, U->users()); - } - - return false; -} - -bool shouldLowerLDSToStruct(const GlobalVariable &GV, const Function *F) { +static bool shouldLowerLDSToStruct(const GlobalVariable &GV, + const Function *F) { // We are not interested in kernel LDS lowering 
for module LDS itself. if (F && GV.getName() == "llvm.amdgcn.module.lds") return false; @@ -259,7 +71,6 @@ bool shouldLowerLDSToStruct(const GlobalVariable &GV, const Function *F) { bool Ret = false; SmallPtrSet<const User *, 8> Visited; SmallVector<const User *, 16> Stack(GV.users()); - SmallPtrSet<const GlobalValue *, 8> GlobalUsers; assert(!F || isKernelCC(F)); @@ -267,15 +78,10 @@ bool shouldLowerLDSToStruct(const GlobalVariable &GV, const Function *F) { const User *V = Stack.pop_back_val(); Visited.insert(V); - if (auto *G = dyn_cast<GlobalValue>(V)) { - StringRef GName = G->getName(); - if (F && GName != "llvm.used" && GName != "llvm.compiler.used") { - // For kernel LDS lowering, if G is not a compiler.used list, then we - // cannot lower the lds GV since we cannot replace the use of GV within - // G. - return false; - } - GlobalUsers.insert(G); + if (isa<GlobalValue>(V)) { + // This use of the LDS variable is the initializer of a global variable. + // This is ill formed. The address of an LDS variable is kernel dependent + // and unknown until runtime. It can't be written to a global variable. continue; } @@ -297,15 +103,6 @@ bool shouldLowerLDSToStruct(const GlobalVariable &GV, const Function *F) { append_range(Stack, V->users()); } - if (!F && !Ret) { - // For module LDS lowering, we have not yet decided if we should lower GV or - // not. Explore all global users of GV, and check if atleast one of these - // global users appear as an use within an instruction (possibly nested use - // via constant expression), if so, then conservately lower LDS. - for (auto *G : GlobalUsers) - Ret |= hasUserInstruction(G); - } - return Ret; } @@ -324,7 +121,7 @@ std::vector<GlobalVariable *> findVariablesToLower(Module &M, continue; } if (!isa<UndefValue>(GV.getInitializer())) { - // Initializers are unimplemented for local address space. + // Initializers are unimplemented for LDS address space. // Leave such variables in place for consistent error reporting. 
continue; } @@ -342,20 +139,6 @@ std::vector<GlobalVariable *> findVariablesToLower(Module &M, return LocalVars; } -SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M) { - SmallPtrSet<GlobalValue *, 32> UsedList; - - SmallVector<GlobalValue *, 32> TmpVec; - collectUsedGlobalVariables(M, TmpVec, true); - UsedList.insert(TmpVec.begin(), TmpVec.end()); - - TmpVec.clear(); - collectUsedGlobalVariables(M, TmpVec, false); - UsedList.insert(TmpVec.begin(), TmpVec.end()); - - return UsedList; -} - } // end namespace AMDGPU } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h index d1c9229bc336..83ef68cc3f60 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h @@ -22,44 +22,13 @@ class ConstantExpr; namespace AMDGPU { -/// Collect reachable callees for each kernel defined in the module \p M and -/// return collected callees at \p KernelToCallees. -void collectReachableCallees( - Module &M, - DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees); - -/// For the given LDS global \p GV, visit all its users and collect all -/// non-kernel functions within which \p GV is used and return collected list of -/// such non-kernel functions. -SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV); - -/// Collect all the instructions where user \p U belongs to. \p U could be -/// instruction itself or it could be a constant expression which is used within -/// an instruction. If \p CollectKernelInsts is true, collect instructions only -/// from kernels, otherwise collect instructions only from non-kernel functions. 
-DenseMap<Function *, SmallPtrSet<Instruction *, 8>> -getFunctionToInstsMap(User *U, bool CollectKernelInsts); - bool isKernelCC(const Function *Func); Align getAlign(DataLayout const &DL, const GlobalVariable *GV); -/// \returns true if a given global variable \p GV (or its global users) appear -/// as an use within some instruction (either from kernel or from non-kernel). -bool hasUserInstruction(const GlobalValue *GV); - -/// \returns true if an LDS global requires lowering to a module LDS structure -/// if \p F is not given. If \p F is given it must be a kernel and function -/// \returns true if an LDS global is directly used from that kernel and it -/// is safe to replace its uses with a kernel LDS structure member. -bool shouldLowerLDSToStruct(const GlobalVariable &GV, - const Function *F = nullptr); - std::vector<GlobalVariable *> findVariablesToLower(Module &M, const Function *F = nullptr); -SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M); - /// Replace all uses of constant \p C with instructions in \p F. void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F); } // end namespace AMDGPU diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index a3eccf13cd71..a8368892c565 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -794,6 +794,18 @@ class VOPPatGen<SDPatternOperator Op, VOPProfile P> { list<dag> ret = [!con(Outs, (set Ins))]; } +class DivergentUnaryFrag<SDPatternOperator Op> : PatFrag < + (ops node:$src0), + (Op $src0), + [{ return N->isDivergent(); }]> { + // This check is unnecessary as it's captured by the result register + // bank constraint. + // + // FIXME: Should add a way for the emitter to recognize this is a + // trivially true predicate to eliminate the check. 
+ let GISelPredicateCode = [{return true;}]; +} + class VOPPatOrNull<SDPatternOperator Op, VOPProfile P> { list<dag> ret = !if(!ne(P.NeedPatGen,PatGenMode.NoPattern), VOPPatGen<Op, P>.ret, []); } diff --git a/llvm/lib/Target/ARM/A15SDOptimizer.cpp b/llvm/lib/Target/ARM/A15SDOptimizer.cpp index f4d0f4a6d6b0..d0efecad63bc 100644 --- a/llvm/lib/Target/ARM/A15SDOptimizer.cpp +++ b/llvm/lib/Target/ARM/A15SDOptimizer.cpp @@ -592,16 +592,15 @@ bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) { SmallVector<unsigned, 8> Defs = getReadDPRs(MI); bool Modified = false; - for (SmallVectorImpl<unsigned>::iterator I = Defs.begin(), E = Defs.end(); - I != E; ++I) { + for (unsigned I : Defs) { // Follow the def-use chain for this DPR through COPYs, and also through // PHIs (which are essentially multi-way COPYs). It is because of PHIs that // we can end up with multiple defs of this DPR. SmallVector<MachineInstr *, 8> DefSrcs; - if (!Register::isVirtualRegister(*I)) + if (!Register::isVirtualRegister(I)) continue; - MachineInstr *Def = MRI->getVRegDef(*I); + MachineInstr *Def = MRI->getVRegDef(I); if (!Def) continue; @@ -628,18 +627,17 @@ bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) { if (NewReg != 0) { Modified = true; - for (SmallVectorImpl<MachineOperand *>::const_iterator I = Uses.begin(), - E = Uses.end(); I != E; ++I) { + for (MachineOperand *Use : Uses) { // Make sure to constrain the register class of the new register to // match what we're replacing. Otherwise we can optimize a DPR_VFP2 // reference into a plain DPR, and that will end poorly. NewReg is // always virtual here, so there will always be a matching subclass // to find. 
- MRI->constrainRegClass(NewReg, MRI->getRegClass((*I)->getReg())); + MRI->constrainRegClass(NewReg, MRI->getRegClass(Use->getReg())); - LLVM_DEBUG(dbgs() << "Replacing operand " << **I << " with " + LLVM_DEBUG(dbgs() << "Replacing operand " << *Use << " with " << printReg(NewReg) << "\n"); - (*I)->substVirtReg(NewReg, 0, *TRI); + Use->substVirtReg(NewReg, 0, *TRI); } } Replacements[MI] = NewReg; diff --git a/llvm/lib/Target/ARM/ARM.td b/llvm/lib/Target/ARM/ARM.td index e03dd597eb65..8173fe4036a8 100644 --- a/llvm/lib/Target/ARM/ARM.td +++ b/llvm/lib/Target/ARM/ARM.td @@ -446,6 +446,11 @@ def FeaturePACBTI : SubtargetFeature<"pacbti", "HasPACBTI", "true", "Enable Pointer Authentication and Branch " "Target Identification">; +def FeatureNoBTIAtReturnTwice : SubtargetFeature<"no-bti-at-return-twice", + "NoBTIAtReturnTwice", "true", + "Don't place a BTI instruction " + "after a return-twice">; + //===----------------------------------------------------------------------===// // ARM architecture class // diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp index 6a88ac485e69..fa09b2567aa9 100644 --- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp @@ -1153,8 +1153,12 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { unsigned StartOp = 2 + 2; // Use all the operands. unsigned NumOffset = 0; - // Amount of SP adjustment folded into a push. - unsigned Pad = 0; + // Amount of SP adjustment folded into a push, before the + // registers are stored (pad at higher addresses). + unsigned PadBefore = 0; + // Amount of SP adjustment folded into a push, after the + // registers are stored (pad at lower addresses). 
+ unsigned PadAfter = 0; switch (Opc) { default: @@ -1185,7 +1189,7 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { "Pad registers must come before restored ones"); unsigned Width = TargetRegInfo->getRegSizeInBits(MO.getReg(), MachineRegInfo) / 8; - Pad += Width; + PadAfter += Width; continue; } // Check for registers that are remapped (for a Thumb1 prologue that @@ -1201,14 +1205,32 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { case ARM::t2STR_PRE: assert(MI->getOperand(2).getReg() == ARM::SP && "Only stack pointer as a source reg is supported"); + if (unsigned RemappedReg = AFI->EHPrologueRemappedRegs.lookup(SrcReg)) + SrcReg = RemappedReg; + + RegList.push_back(SrcReg); + break; + case ARM::t2STRD_PRE: + assert(MI->getOperand(3).getReg() == ARM::SP && + "Only stack pointer as a source reg is supported"); + SrcReg = MI->getOperand(1).getReg(); + if (unsigned RemappedReg = AFI->EHPrologueRemappedRegs.lookup(SrcReg)) + SrcReg = RemappedReg; + RegList.push_back(SrcReg); + SrcReg = MI->getOperand(2).getReg(); + if (unsigned RemappedReg = AFI->EHPrologueRemappedRegs.lookup(SrcReg)) + SrcReg = RemappedReg; RegList.push_back(SrcReg); + PadBefore = -MI->getOperand(4).getImm() - 8; break; } if (MAI->getExceptionHandlingType() == ExceptionHandling::ARM) { + if (PadBefore) + ATS.emitPad(PadBefore); ATS.emitRegSave(RegList, Opc == ARM::VSTMDDB_UPD); // Account for the SP adjustment, folded into the push. - if (Pad) - ATS.emitPad(Pad); + if (PadAfter) + ATS.emitPad(PadAfter); } } else { // Changes of stack / frame pointer. 
@@ -1300,6 +1322,10 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { Offset = MI->getOperand(2).getImm(); AFI->EHPrologueOffsetInRegs[DstReg] |= (Offset << 16); break; + case ARM::t2PAC: + case ARM::t2PACBTI: + AFI->EHPrologueRemappedRegs[ARM::R12] = ARM::RA_AUTH_CODE; + break; default: MI->print(errs()); llvm_unreachable("Unsupported opcode for unwinding information"); diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index 2a12947d24a8..884f38ff6c58 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -2629,8 +2629,8 @@ bool llvm::tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget, // Add the complete list back in. MachineInstrBuilder MIB(MF, &*MI); - for (int i = RegList.size() - 1; i >= 0; --i) - MIB.add(RegList[i]); + for (const MachineOperand &MO : llvm::reverse(RegList)) + MIB.add(MO); return true; } @@ -5678,7 +5678,7 @@ bool llvm::HasLowerConstantMaterializationCost(unsigned Val1, unsigned Val2, /// | | Thumb2 | ARM | /// +-------------------------+--------+-----+ /// | Call overhead in Bytes | 4 | 4 | -/// | Frame overhead in Bytes | 4 | 4 | +/// | Frame overhead in Bytes | 2 | 4 | /// | Stack fixup required | No | No | /// +-------------------------+--------+-----+ /// @@ -5755,7 +5755,7 @@ struct OutlinerCosts { CallThunk(target.isThumb() ? 4 : 4), FrameThunk(target.isThumb() ? 0 : 0), CallNoLRSave(target.isThumb() ? 4 : 4), - FrameNoLRSave(target.isThumb() ? 4 : 4), + FrameNoLRSave(target.isThumb() ? 2 : 4), CallRegSave(target.isThumb() ? 8 : 12), FrameRegSave(target.isThumb() ? 2 : 4), CallDefault(target.isThumb() ? 
8 : 12), @@ -5868,11 +5868,17 @@ outliner::OutlinedFunction ARMBaseInstrInfo::getOutliningCandidateInfo( return outliner::OutlinedFunction(); } + // We expect the majority of the outlining candidates to be in consensus with + // regard to return address sign and authentication, and branch target + // enforcement, in other words, partitioning according to all the four + // possible combinations of PAC-RET and BTI is going to yield one big subset + // and three small (likely empty) subsets. That allows us to cull incompatible + // candidates separately for PAC-RET and BTI. + // Partition the candidates in two sets: one with BTI enabled and one with BTI - // disabled. Remove the candidates from the smaller set. We expect the - // majority of the candidates to be in consensus with regard to branch target - // enforcement with just a few oddballs, but if they are the same number - // prefer the non-BTI ones for outlining, since they have less overhead. + // disabled. Remove the candidates from the smaller set. If they are the same + // number prefer the non-BTI ones for outlining, since they have less + // overhead. auto NoBTI = llvm::partition(RepeatedSequenceLocs, [](const outliner::Candidate &C) { const ARMFunctionInfo &AFI = *C.getMF()->getInfo<ARMFunctionInfo>(); @@ -5883,6 +5889,24 @@ outliner::OutlinedFunction ARMBaseInstrInfo::getOutliningCandidateInfo( RepeatedSequenceLocs.erase(NoBTI, RepeatedSequenceLocs.end()); else RepeatedSequenceLocs.erase(RepeatedSequenceLocs.begin(), NoBTI); + + if (RepeatedSequenceLocs.size() < 2) + return outliner::OutlinedFunction(); + + // Likewise, partition the candidates according to PAC-RET enablement. + auto NoPAC = + llvm::partition(RepeatedSequenceLocs, [](const outliner::Candidate &C) { + const ARMFunctionInfo &AFI = *C.getMF()->getInfo<ARMFunctionInfo>(); + // If the function happens to not spill the LR, do not disqualify it + // from the outlining. 
+ return AFI.shouldSignReturnAddress(true); + }); + if (std::distance(RepeatedSequenceLocs.begin(), NoPAC) > + std::distance(NoPAC, RepeatedSequenceLocs.end())) + RepeatedSequenceLocs.erase(NoPAC, RepeatedSequenceLocs.end()); + else + RepeatedSequenceLocs.erase(RepeatedSequenceLocs.begin(), NoPAC); + if (RepeatedSequenceLocs.size() < 2) return outliner::OutlinedFunction(); @@ -5899,6 +5923,7 @@ outliner::OutlinedFunction ARMBaseInstrInfo::getOutliningCandidateInfo( }; OutlinerCosts Costs(Subtarget); + const auto &SomeMFI = *RepeatedSequenceLocs.front().getMF()->getInfo<ARMFunctionInfo>(); // Adjust costs to account for the BTI instructions. @@ -5909,6 +5934,13 @@ outliner::OutlinedFunction ARMBaseInstrInfo::getOutliningCandidateInfo( Costs.FrameTailCall += 4; Costs.FrameThunk += 4; } + + // Adjust costs to account for sign and authentication instructions. + if (SomeMFI.shouldSignReturnAddress(true)) { + Costs.CallDefault += 8; // +PAC instr, +AUT instr + Costs.SaveRestoreLROnStack += 8; // +PAC instr, +AUT instr + } + unsigned FrameID = MachineOutlinerDefault; unsigned NumBytesToCreateFrame = Costs.FrameDefault; @@ -6325,6 +6357,11 @@ ARMBaseInstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT, // * LR is available in the range (No save/restore around call) // * The range doesn't include calls (No save/restore in outlined frame) // are true. + // These conditions also ensure correctness of the return address + // authentication - we insert sign and authentication instructions only if + // we save/restore LR on stack, but then this condition ensures that the + // outlined range does not modify the SP, therefore the SP value used for + // signing is the same as the one used for authentication. // FIXME: This is very restrictive; the flags check the whole block, // not just the bit we will try to outline. 
bool MightNeedStackFixUp = @@ -6369,23 +6406,39 @@ void ARMBaseInstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const { } void ARMBaseInstrInfo::saveLROnStack(MachineBasicBlock &MBB, - MachineBasicBlock::iterator It) const { - unsigned Opc = Subtarget.isThumb() ? ARM::t2STR_PRE : ARM::STR_PRE_IMM; - int Align = -Subtarget.getStackAlignment().value(); - BuildMI(MBB, It, DebugLoc(), get(Opc), ARM::SP) - .addReg(ARM::LR, RegState::Kill) - .addReg(ARM::SP) - .addImm(Align) - .add(predOps(ARMCC::AL)); -} + MachineBasicBlock::iterator It, bool CFI, + bool Auth) const { + int Align = std::max(Subtarget.getStackAlignment().value(), uint64_t(8)); + assert(Align >= 8 && Align <= 256); + if (Auth) { + assert(Subtarget.isThumb2()); + // Compute PAC in R12. Outlining ensures R12 is dead across the outlined + // sequence. + BuildMI(MBB, It, DebugLoc(), get(ARM::t2PAC)) + .setMIFlags(MachineInstr::FrameSetup); + BuildMI(MBB, It, DebugLoc(), get(ARM::t2STRD_PRE), ARM::SP) + .addReg(ARM::R12, RegState::Kill) + .addReg(ARM::LR, RegState::Kill) + .addReg(ARM::SP) + .addImm(-Align) + .add(predOps(ARMCC::AL)) + .setMIFlags(MachineInstr::FrameSetup); + } else { + unsigned Opc = Subtarget.isThumb() ? ARM::t2STR_PRE : ARM::STR_PRE_IMM; + BuildMI(MBB, It, DebugLoc(), get(Opc), ARM::SP) + .addReg(ARM::LR, RegState::Kill) + .addReg(ARM::SP) + .addImm(-Align) + .add(predOps(ARMCC::AL)) + .setMIFlags(MachineInstr::FrameSetup); + } + + if (!CFI) + return; -void ARMBaseInstrInfo::emitCFIForLRSaveOnStack( - MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const { MachineFunction &MF = *MBB.getParent(); - const MCRegisterInfo *MRI = Subtarget.getRegisterInfo(); - unsigned DwarfLR = MRI->getDwarfRegNum(ARM::LR, true); - int Align = Subtarget.getStackAlignment().value(); - // Add a CFI saying the stack was moved down. + + // Add a CFI, saying CFA is offset by Align bytes from SP. 
int64_t StackPosEntry = MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, Align)); BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION)) @@ -6394,11 +6447,23 @@ void ARMBaseInstrInfo::emitCFIForLRSaveOnStack( // Add a CFI saying that the LR that we want to find is now higher than // before. - int64_t LRPosEntry = - MF.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfLR, -Align)); + int LROffset = Auth ? Align - 4 : Align; + const MCRegisterInfo *MRI = Subtarget.getRegisterInfo(); + unsigned DwarfLR = MRI->getDwarfRegNum(ARM::LR, true); + int64_t LRPosEntry = MF.addFrameInst( + MCCFIInstruction::createOffset(nullptr, DwarfLR, -LROffset)); BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION)) .addCFIIndex(LRPosEntry) .setMIFlags(MachineInstr::FrameSetup); + if (Auth) { + // Add a CFI for the location of the return adddress PAC. + unsigned DwarfRAC = MRI->getDwarfRegNum(ARM::RA_AUTH_CODE, true); + int64_t RACPosEntry = MF.addFrameInst( + MCCFIInstruction::createOffset(nullptr, DwarfRAC, -Align)); + BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION)) + .addCFIIndex(RACPosEntry) + .setMIFlags(MachineInstr::FrameSetup); + } } void ARMBaseInstrInfo::emitCFIForLRSaveToReg(MachineBasicBlock &MBB, @@ -6416,35 +6481,64 @@ void ARMBaseInstrInfo::emitCFIForLRSaveToReg(MachineBasicBlock &MBB, .setMIFlags(MachineInstr::FrameSetup); } -void ARMBaseInstrInfo::restoreLRFromStack( - MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const { - unsigned Opc = Subtarget.isThumb() ? 
ARM::t2LDR_POST : ARM::LDR_POST_IMM; - MachineInstrBuilder MIB = BuildMI(MBB, It, DebugLoc(), get(Opc), ARM::LR) - .addReg(ARM::SP, RegState::Define) - .addReg(ARM::SP); - if (!Subtarget.isThumb()) - MIB.addReg(0); - MIB.addImm(Subtarget.getStackAlignment().value()).add(predOps(ARMCC::AL)); -} +void ARMBaseInstrInfo::restoreLRFromStack(MachineBasicBlock &MBB, + MachineBasicBlock::iterator It, + bool CFI, bool Auth) const { + int Align = Subtarget.getStackAlignment().value(); + if (Auth) { + assert(Subtarget.isThumb2()); + // Restore return address PAC and LR. + BuildMI(MBB, It, DebugLoc(), get(ARM::t2LDRD_POST)) + .addReg(ARM::R12, RegState::Define) + .addReg(ARM::LR, RegState::Define) + .addReg(ARM::SP, RegState::Define) + .addReg(ARM::SP) + .addImm(Align) + .add(predOps(ARMCC::AL)) + .setMIFlags(MachineInstr::FrameDestroy); + // LR authentication is after the CFI instructions, below. + } else { + unsigned Opc = Subtarget.isThumb() ? ARM::t2LDR_POST : ARM::LDR_POST_IMM; + MachineInstrBuilder MIB = BuildMI(MBB, It, DebugLoc(), get(Opc), ARM::LR) + .addReg(ARM::SP, RegState::Define) + .addReg(ARM::SP); + if (!Subtarget.isThumb()) + MIB.addReg(0); + MIB.addImm(Subtarget.getStackAlignment().value()) + .add(predOps(ARMCC::AL)) + .setMIFlags(MachineInstr::FrameDestroy); + } -void ARMBaseInstrInfo::emitCFIForLRRestoreFromStack( - MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const { - // Now stack has moved back up... - MachineFunction &MF = *MBB.getParent(); - const MCRegisterInfo *MRI = Subtarget.getRegisterInfo(); - unsigned DwarfLR = MRI->getDwarfRegNum(ARM::LR, true); - int64_t StackPosEntry = - MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 0)); - BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION)) - .addCFIIndex(StackPosEntry) - .setMIFlags(MachineInstr::FrameDestroy); + if (CFI) { + // Now stack has moved back up... 
+ MachineFunction &MF = *MBB.getParent(); + const MCRegisterInfo *MRI = Subtarget.getRegisterInfo(); + unsigned DwarfLR = MRI->getDwarfRegNum(ARM::LR, true); + int64_t StackPosEntry = + MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 0)); + BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION)) + .addCFIIndex(StackPosEntry) + .setMIFlags(MachineInstr::FrameDestroy); - // ... and we have restored LR. - int64_t LRPosEntry = - MF.addFrameInst(MCCFIInstruction::createRestore(nullptr, DwarfLR)); - BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION)) - .addCFIIndex(LRPosEntry) - .setMIFlags(MachineInstr::FrameDestroy); + // ... and we have restored LR. + int64_t LRPosEntry = + MF.addFrameInst(MCCFIInstruction::createRestore(nullptr, DwarfLR)); + BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION)) + .addCFIIndex(LRPosEntry) + .setMIFlags(MachineInstr::FrameDestroy); + + if (Auth) { + unsigned DwarfRAC = MRI->getDwarfRegNum(ARM::RA_AUTH_CODE, true); + int64_t Entry = + MF.addFrameInst(MCCFIInstruction::createUndefined(nullptr, DwarfRAC)); + BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION)) + .addCFIIndex(Entry) + .setMIFlags(MachineInstr::FrameDestroy); + } + } + + if (Auth) + BuildMI(MBB, It, DebugLoc(), get(ARM::t2AUT)); } void ARMBaseInstrInfo::emitCFIForLRRestoreFromReg( @@ -6500,8 +6594,11 @@ void ARMBaseInstrInfo::buildOutlinedFrame( MBB.addLiveIn(ARM::LR); // Insert a save before the outlined region - saveLROnStack(MBB, It); - emitCFIForLRSaveOnStack(MBB, It); + bool Auth = OF.Candidates.front() + .getMF() + ->getInfo<ARMFunctionInfo>() + ->shouldSignReturnAddress(true); + saveLROnStack(MBB, It, true, Auth); // Fix up the instructions in the range, since we're going to modify the // stack. @@ -6510,8 +6607,7 @@ void ARMBaseInstrInfo::buildOutlinedFrame( fixupPostOutline(MBB); // Insert a restore before the terminator for the function. Restore LR. 
- restoreLRFromStack(MBB, Et); - emitCFIForLRRestoreFromStack(MBB, Et); + restoreLRFromStack(MBB, Et, true, Auth); } // If this is a tail call outlined function, then there's already a return. @@ -6590,13 +6686,10 @@ MachineBasicBlock::iterator ARMBaseInstrInfo::insertOutlinedCall( // We have the default case. Save and restore from SP. if (!MBB.isLiveIn(ARM::LR)) MBB.addLiveIn(ARM::LR); - saveLROnStack(MBB, It); - if (!AFI.isLRSpilled()) - emitCFIForLRSaveOnStack(MBB, It); + bool Auth = !AFI.isLRSpilled() && AFI.shouldSignReturnAddress(true); + saveLROnStack(MBB, It, !AFI.isLRSpilled(), Auth); CallPt = MBB.insert(It, CallMIB); - restoreLRFromStack(MBB, It); - if (!AFI.isLRSpilled()) - emitCFIForLRRestoreFromStack(MBB, It); + restoreLRFromStack(MBB, It, !AFI.isLRSpilled(), Auth); It--; return CallPt; } diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h index 5fa912ae35d7..defce07dd862 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -377,20 +377,20 @@ private: /// constructing an outlined call if one exists. Returns 0 otherwise. unsigned findRegisterToSaveLRTo(const outliner::Candidate &C) const; - // Adds an instruction which saves the link register on top of the stack into - /// the MachineBasicBlock \p MBB at position \p It. - void saveLROnStack(MachineBasicBlock &MBB, - MachineBasicBlock::iterator It) const; + /// Adds an instruction which saves the link register on top of the stack into + /// the MachineBasicBlock \p MBB at position \p It. If \p Auth is true, + /// compute and store an authentication code alongiside the link register. + /// If \p CFI is true, emit CFI instructions. + void saveLROnStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator It, + bool CFI, bool Auth) const; /// Adds an instruction which restores the link register from the top the - /// stack into the MachineBasicBlock \p MBB at position \p It. 
+ /// stack into the MachineBasicBlock \p MBB at position \p It. If \p Auth is + /// true, restore an authentication code and authenticate LR. + /// If \p CFI is true, emit CFI instructions. void restoreLRFromStack(MachineBasicBlock &MBB, - MachineBasicBlock::iterator It) const; - - /// Emit CFI instructions into the MachineBasicBlock \p MBB at position \p It, - /// for the case when the LR is saved on the stack. - void emitCFIForLRSaveOnStack(MachineBasicBlock &MBB, - MachineBasicBlock::iterator It) const; + MachineBasicBlock::iterator It, bool CFI, + bool Auth) const; /// Emit CFI instructions into the MachineBasicBlock \p MBB at position \p It, /// for the case when the LR is saved in the register \p Reg. @@ -399,11 +399,6 @@ private: Register Reg) const; /// Emit CFI instructions into the MachineBasicBlock \p MBB at position \p It, - /// after the LR is was restored from the stack. - void emitCFIForLRRestoreFromStack(MachineBasicBlock &MBB, - MachineBasicBlock::iterator It) const; - - /// Emit CFI instructions into the MachineBasicBlock \p MBB at position \p It, /// after the LR is was restored from a register. 
void emitCFIForLRRestoreFromReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const; diff --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp index b53efe58e8de..c543d02ff75a 100644 --- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -530,6 +530,8 @@ getFrameIndexInstrOffset(const MachineInstr *MI, int Idx) const { unsigned ImmIdx = 0; switch (AddrMode) { case ARMII::AddrModeT2_i8: + case ARMII::AddrModeT2_i8neg: + case ARMII::AddrModeT2_i8pos: case ARMII::AddrModeT2_i12: case ARMII::AddrMode_i12: InstrOffs = MI->getOperand(Idx+1).getImm(); @@ -728,6 +730,8 @@ bool ARMBaseRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, bool isSigned = true; switch (AddrMode) { case ARMII::AddrModeT2_i8: + case ARMII::AddrModeT2_i8pos: + case ARMII::AddrModeT2_i8neg: case ARMII::AddrModeT2_i12: // i8 supports only negative, and i12 supports only positive, so // based on Offset sign, consider the appropriate instruction diff --git a/llvm/lib/Target/ARM/ARMBranchTargets.cpp b/llvm/lib/Target/ARM/ARMBranchTargets.cpp index 1091c1f970fa..8ba3e627c039 100644 --- a/llvm/lib/Target/ARM/ARMBranchTargets.cpp +++ b/llvm/lib/Target/ARM/ARMBranchTargets.cpp @@ -108,6 +108,7 @@ void ARMBranchTargets::addBTI(const ARMInstrInfo &TII, MachineBasicBlock &MBB, bool IsFirstBB) { // Which instruction to insert: BTI or PACBTI unsigned OpCode = ARM::t2BTI; + unsigned MIFlags = 0; // Skip meta instructions, including EH labels auto MBBI = llvm::find_if_not(MBB.instrs(), [](const MachineInstr &MI) { @@ -121,6 +122,7 @@ void ARMBranchTargets::addBTI(const ARMInstrInfo &TII, MachineBasicBlock &MBB, LLVM_DEBUG(dbgs() << "Removing a 'PAC' instr from BB '" << MBB.getName() << "' to replace with PACBTI\n"); OpCode = ARM::t2PACBTI; + MIFlags = MachineInstr::FrameSetup; auto NextMBBI = std::next(MBBI); MBBI->eraseFromParent(); MBBI = NextMBBI; @@ -131,5 +133,6 @@ void ARMBranchTargets::addBTI(const 
ARMInstrInfo &TII, MachineBasicBlock &MBB, << (OpCode == ARM::t2BTI ? "BTI" : "PACBTI") << "' instr into BB '" << MBB.getName() << "'\n"); // Finally, insert a new instruction (either PAC or PACBTI) - BuildMI(MBB, MBBI, MBB.findDebugLoc(MBBI), TII.get(OpCode)); + BuildMI(MBB, MBBI, MBB.findDebugLoc(MBBI), TII.get(OpCode)) + .setMIFlags(MIFlags); } diff --git a/llvm/lib/Target/ARM/ARMCallingConv.cpp b/llvm/lib/Target/ARM/ARMCallingConv.cpp index d8d9ca3b912f..32f3a4a632f5 100644 --- a/llvm/lib/Target/ARM/ARMCallingConv.cpp +++ b/llvm/lib/Target/ARM/ARMCallingConv.cpp @@ -230,10 +230,9 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned ValNo, MVT ValVT, unsigned RegResult = State.AllocateRegBlock(RegList, PendingMembers.size()); if (RegResult) { - for (SmallVectorImpl<CCValAssign>::iterator It = PendingMembers.begin(); - It != PendingMembers.end(); ++It) { - It->convertToReg(RegResult); - State.addLoc(*It); + for (CCValAssign &PendingMember : PendingMembers) { + PendingMember.convertToReg(RegResult); + State.addLoc(PendingMember); ++RegResult; } PendingMembers.clear(); diff --git a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp index c2ca4708c208..a2a4f1f3bdfd 100644 --- a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -310,8 +310,7 @@ void ARMConstantIslands::verify() { BBInfo[RHS.getNumber()].postOffset(); })); LLVM_DEBUG(dbgs() << "Verifying " << CPUsers.size() << " CP users.\n"); - for (unsigned i = 0, e = CPUsers.size(); i != e; ++i) { - CPUser &U = CPUsers[i]; + for (CPUser &U : CPUsers) { unsigned UserOffset = getUserOffset(U); // Verify offset using the real max displacement without the safety // adjustment. @@ -697,10 +696,9 @@ ARMConstantIslands::findConstPoolEntry(unsigned CPI, std::vector<CPEntry> &CPEs = CPEntries[CPI]; // Number of entries per constpool index should be small, just do a // linear search. 
- for (unsigned i = 0, e = CPEs.size(); i != e; ++i) { - if (CPEs[i].CPEMI == CPEMI) - return &CPEs[i]; - } + for (CPEntry &CPE : CPEs) + if (CPE.CPEMI == CPEMI) + return &CPE; return nullptr; } @@ -1234,27 +1232,27 @@ int ARMConstantIslands::findInRangeCPEntry(CPUser& U, unsigned UserOffset) { // No. Look for previously created clones of the CPE that are in range. unsigned CPI = getCombinedIndex(CPEMI); std::vector<CPEntry> &CPEs = CPEntries[CPI]; - for (unsigned i = 0, e = CPEs.size(); i != e; ++i) { + for (CPEntry &CPE : CPEs) { // We already tried this one - if (CPEs[i].CPEMI == CPEMI) + if (CPE.CPEMI == CPEMI) continue; // Removing CPEs can leave empty entries, skip - if (CPEs[i].CPEMI == nullptr) + if (CPE.CPEMI == nullptr) continue; - if (isCPEntryInRange(UserMI, UserOffset, CPEs[i].CPEMI, U.getMaxDisp(), - U.NegOk)) { - LLVM_DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#" - << CPEs[i].CPI << "\n"); + if (isCPEntryInRange(UserMI, UserOffset, CPE.CPEMI, U.getMaxDisp(), + U.NegOk)) { + LLVM_DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#" << CPE.CPI + << "\n"); // Point the CPUser node to the replacement - U.CPEMI = CPEs[i].CPEMI; + U.CPEMI = CPE.CPEMI; // Change the CPI in the instruction operand to refer to the clone. for (MachineOperand &MO : UserMI->operands()) if (MO.isCPI()) { - MO.setIndex(CPEs[i].CPI); + MO.setIndex(CPE.CPI); break; } // Adjust the refcount of the clone... - CPEs[i].RefCount++; + CPE.RefCount++; // ...and the original. If we didn't remove the old entry, none of the // addresses changed, so we don't need another pass. return decrementCPEReferenceCount(CPI, CPEMI) ? 2 : 1; @@ -1675,15 +1673,14 @@ void ARMConstantIslands::removeDeadCPEMI(MachineInstr *CPEMI) { /// are zero. 
bool ARMConstantIslands::removeUnusedCPEntries() { unsigned MadeChange = false; - for (unsigned i = 0, e = CPEntries.size(); i != e; ++i) { - std::vector<CPEntry> &CPEs = CPEntries[i]; - for (unsigned j = 0, ee = CPEs.size(); j != ee; ++j) { - if (CPEs[j].RefCount == 0 && CPEs[j].CPEMI) { - removeDeadCPEMI(CPEs[j].CPEMI); - CPEs[j].CPEMI = nullptr; - MadeChange = true; - } + for (std::vector<CPEntry> &CPEs : CPEntries) { + for (CPEntry &CPE : CPEs) { + if (CPE.RefCount == 0 && CPE.CPEMI) { + removeDeadCPEMI(CPE.CPEMI); + CPE.CPEMI = nullptr; + MadeChange = true; } + } } return MadeChange; } @@ -1829,8 +1826,7 @@ bool ARMConstantIslands::optimizeThumb2Instructions() { bool MadeChange = false; // Shrink ADR and LDR from constantpool. - for (unsigned i = 0, e = CPUsers.size(); i != e; ++i) { - CPUser &U = CPUsers[i]; + for (CPUser &U : CPUsers) { unsigned Opcode = U.MI->getOpcode(); unsigned NewOpc = 0; unsigned Scale = 1; diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp index 7a35f252b22a..fa244786a80d 100644 --- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -2160,6 +2160,11 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, return true; } case ARM::tBXNS_RET: { + // For v8.0-M.Main we need to authenticate LR before clearing FPRs, which + // uses R12 as a scratch register. + if (!STI->hasV8_1MMainlineOps() && AFI->shouldSignReturnAddress()) + BuildMI(MBB, MBBI, DebugLoc(), TII->get(ARM::t2AUT)); + MachineBasicBlock &AfterBB = CMSEClearFPRegs(MBB, MBBI); if (STI->hasV8_1MMainlineOps()) { @@ -2169,6 +2174,9 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, .addReg(ARM::SP) .addImm(4) .add(predOps(ARMCC::AL)); + + if (AFI->shouldSignReturnAddress()) + BuildMI(AfterBB, AfterBB.end(), DebugLoc(), TII->get(ARM::t2AUT)); } // Clear all GPR that are not a use of the return instruction. 
@@ -3073,6 +3081,22 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MI.eraseFromParent(); return true; } + case ARM::t2CALL_BTI: { + MachineFunction &MF = *MI.getMF(); + MachineInstrBuilder MIB = + BuildMI(MF, MI.getDebugLoc(), TII->get(ARM::tBL)); + MIB.cloneMemRefs(MI); + for (unsigned i = 0; i < MI.getNumOperands(); ++i) + MIB.add(MI.getOperand(i)); + if (MI.isCandidateForCallSiteEntry()) + MF.moveCallSiteInfo(&MI, MIB.getInstr()); + MIBundleBuilder Bundler(MBB, MI); + Bundler.append(MIB); + Bundler.append(BuildMI(MF, MI.getDebugLoc(), TII->get(ARM::t2BTI))); + finalizeBundle(MBB, Bundler.begin(), Bundler.end()); + MI.eraseFromParent(); + return true; + } case ARM::LOADDUAL: case ARM::STOREDUAL: { Register PairReg = MI.getOperand(0).getReg(); diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp index b866cf952ff1..4b59f9cb94ce 100644 --- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp +++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp @@ -503,20 +503,12 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, StackAdjustingInsts DefCFAOffsetCandidates; bool HasFP = hasFP(MF); - // Allocate the vararg register save area. 
- if (ArgRegsSaveSize) { - emitSPUpdate(isARM, MBB, MBBI, dl, TII, -ArgRegsSaveSize, - MachineInstr::FrameSetup); - DefCFAOffsetCandidates.addInst(std::prev(MBBI), ArgRegsSaveSize, true); - } - if (!AFI->hasStackFrame() && (!STI.isTargetWindows() || !WindowsRequiresStackProbe(MF, NumBytes))) { - if (NumBytes - ArgRegsSaveSize != 0) { - emitSPUpdate(isARM, MBB, MBBI, dl, TII, -(NumBytes - ArgRegsSaveSize), + if (NumBytes != 0) { + emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes, MachineInstr::FrameSetup); - DefCFAOffsetCandidates.addInst(std::prev(MBBI), - NumBytes - ArgRegsSaveSize, true); + DefCFAOffsetCandidates.addInst(std::prev(MBBI), NumBytes, true); } DefCFAOffsetCandidates.emitDefCFAOffsets(MBB, dl, TII, HasFP); return; @@ -562,13 +554,26 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, } } - // Move past FPCXT area. MachineBasicBlock::iterator LastPush = MBB.end(), GPRCS1Push, GPRCS2Push; + + // Move past the PAC computation. + if (AFI->shouldSignReturnAddress()) + LastPush = MBBI++; + + // Move past FPCXT area. if (FPCXTSaveSize > 0) { LastPush = MBBI++; DefCFAOffsetCandidates.addInst(LastPush, FPCXTSaveSize, true); } + // Allocate the vararg register save area. + if (ArgRegsSaveSize) { + emitSPUpdate(isARM, MBB, MBBI, dl, TII, -ArgRegsSaveSize, + MachineInstr::FrameSetup); + LastPush = std::prev(MBBI); + DefCFAOffsetCandidates.addInst(LastPush, ArgRegsSaveSize, true); + } + // Move past area 1. if (GPRCS1Size > 0) { GPRCS1Push = LastPush = MBBI++; @@ -788,7 +793,8 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, case ARM::R11: case ARM::R12: if (STI.splitFramePushPop(MF)) { - unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); + unsigned DwarfReg = MRI->getDwarfRegNum( + Reg == ARM::R12 ? 
(unsigned)ARM::RA_AUTH_CODE : Reg, true); unsigned Offset = MFI.getObjectOffset(FI); unsigned CFIIndex = MF.addFrameInst( MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset)); @@ -923,8 +929,9 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); if (!AFI->hasStackFrame()) { - if (NumBytes - ReservedArgStack != 0) - emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes - ReservedArgStack, + if (NumBytes + IncomingArgStackToRestore != 0) + emitSPUpdate(isARM, MBB, MBBI, dl, TII, + NumBytes + IncomingArgStackToRestore, MachineInstr::FrameDestroy); } else { // Unwind MBBI to point to first LDR / VLDRD. @@ -1007,15 +1014,21 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, if (AFI->getGPRCalleeSavedArea2Size()) MBBI++; if (AFI->getGPRCalleeSavedArea1Size()) MBBI++; - if (AFI->getFPCXTSaveAreaSize()) MBBI++; - } - if (ReservedArgStack || IncomingArgStackToRestore) { - assert((int)ReservedArgStack + IncomingArgStackToRestore >= 0 && - "attempting to restore negative stack amount"); - emitSPUpdate(isARM, MBB, MBBI, dl, TII, - ReservedArgStack + IncomingArgStackToRestore, - MachineInstr::FrameDestroy); + if (ReservedArgStack || IncomingArgStackToRestore) { + assert((int)ReservedArgStack + IncomingArgStackToRestore >= 0 && + "attempting to restore negative stack amount"); + emitSPUpdate(isARM, MBB, MBBI, dl, TII, + ReservedArgStack + IncomingArgStackToRestore, + MachineInstr::FrameDestroy); + } + + // Validate PAC, It should have been already popped into R12. For CMSE entry + // function, the validation instruction is emitted during expansion of the + // tBXNS_RET, since the validation must use the value of SP at function + // entry, before saving, resp. after restoring, FPCXTNS. 
+ if (AFI->shouldSignReturnAddress() && !AFI->isCmseNSEntryFunction()) + BuildMI(MBB, MBBI, DebugLoc(), STI.getInstrInfo()->get(ARM::t2AUT)); } } @@ -1199,6 +1212,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + bool hasPAC = AFI->shouldSignReturnAddress(); DebugLoc DL; bool isTailCall = false; bool isInterrupt = false; @@ -1231,7 +1245,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, continue; if (Reg == ARM::LR && !isTailCall && !isVarArg && !isInterrupt && !isCmseEntry && !isTrap && AFI->getArgumentStackToRestore() == 0 && - STI.hasV5TOps() && MBB.succ_empty()) { + STI.hasV5TOps() && MBB.succ_empty() && !hasPAC) { Reg = ARM::PC; // Fold the return instruction into the LDM. DeleteRet = true; @@ -1580,6 +1594,11 @@ bool ARMFrameLowering::spillCalleeSavedRegisters( ARM::t2STR_PRE : ARM::STR_PRE_IMM; unsigned FltOpc = ARM::VSTMDDB_UPD; unsigned NumAlignedDPRCS2Regs = AFI->getNumAlignedDPRCS2Regs(); + // Compute PAC in R12. + if (AFI->shouldSignReturnAddress()) { + BuildMI(MBB, MI, DebugLoc(), STI.getInstrInfo()->get(ARM::t2PAC)) + .setMIFlags(MachineInstr::FrameSetup); + } // Save the non-secure floating point context. if (llvm::any_of(CSI, [](const CalleeSavedInfo &C) { return C.getReg() == ARM::FPCXTNS; @@ -1789,6 +1808,13 @@ bool ARMFrameLowering::enableShrinkWrapping(const MachineFunction &MF) const { MF.getInfo<ARMFunctionInfo>()->isCmseNSEntryFunction()) return false; + // We are disabling shrinkwrapping for now when PAC is enabled, as + // shrinkwrapping can cause clobbering of r12 when the PAC code is + // generated. A follow-up patch will fix this in a more performant manner. 
+ if (MF.getInfo<ARMFunctionInfo>()->shouldSignReturnAddress( + false /*SpillsLR */)) + return false; + return true; } @@ -2315,6 +2341,26 @@ bool ARMFrameLowering::assignCalleeSavedSpillSlots( CSI.back().setRestored(false); } + // For functions, which sign their return address, upon function entry, the + // return address PAC is computed in R12. Treat R12 as a callee-saved register + // in this case. + const auto &AFI = *MF.getInfo<ARMFunctionInfo>(); + if (AFI.shouldSignReturnAddress()) { + // The order of register must match the order we push them, because the + // PEI assigns frame indices in that order. When compiling for return + // address sign and authenication, we use split push, therefore the orders + // we want are: + // LR, R7, R6, R5, R4, <R12>, R11, R10, R9, R8, D15-D8 + CSI.insert(find_if(CSI, + [=](const auto &CS) { + unsigned Reg = CS.getReg(); + return Reg == ARM::R10 || Reg == ARM::R11 || + Reg == ARM::R8 || Reg == ARM::R9 || + ARM::DPRRegClass.contains(Reg); + }), + CalleeSavedInfo(ARM::R12)); + } + return false; } diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 33d115945614..3d45db349644 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -391,6 +391,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Legal); } setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal); @@ -428,7 +429,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { } // Predicate types - const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1}; + const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1}; for (auto VT : pTypes) { addRegisterClass(VT, &ARM::VCCRRegClass); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); @@ -445,6 +446,16 @@ 
void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::SELECT, VT, Expand); } + setOperationAction(ISD::SETCC, MVT::v2i1, Expand); + setOperationAction(ISD::TRUNCATE, MVT::v2i1, Expand); + setOperationAction(ISD::AND, MVT::v2i1, Expand); + setOperationAction(ISD::OR, MVT::v2i1, Expand); + setOperationAction(ISD::XOR, MVT::v2i1, Expand); + setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Expand); + setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Expand); + setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Expand); + setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Expand); + setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); @@ -1647,6 +1658,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(ARMISD::CALL_PRED) MAKE_CASE(ARMISD::CALL_NOLINK) MAKE_CASE(ARMISD::tSECALL) + MAKE_CASE(ARMISD::t2CALL_BTI) MAKE_CASE(ARMISD::BRCOND) MAKE_CASE(ARMISD::BR_JT) MAKE_CASE(ARMISD::BR2_JT) @@ -1853,8 +1865,10 @@ EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, // MVE has a predicate register. if ((Subtarget->hasMVEIntegerOps() && - (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8)) || - (Subtarget->hasMVEFloatOps() && (VT == MVT::v4f32 || VT == MVT::v8f16))) + (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 || + VT == MVT::v16i8)) || + (Subtarget->hasMVEFloatOps() && + (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16))) return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount()); return VT.changeVectorElementTypeToInteger(); } @@ -2308,6 +2322,12 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool isCmseNSCall = false; bool isSibCall = false; bool PreferIndirect = false; + bool GuardWithBTI = false; + + // Lower 'returns_twice' calls to a pseudo-instruction. 
+ if (CLI.CB && CLI.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) && + !Subtarget->getNoBTIAtReturnTwice()) + GuardWithBTI = AFI->branchTargetEnforcement(); // Determine whether this is a non-secure function call. if (CLI.CB && CLI.CB->getAttributes().hasFnAttr("cmse_nonsecure_call")) @@ -2713,7 +2733,9 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // FIXME: handle tail calls differently. unsigned CallOpc; if (Subtarget->isThumb()) { - if (isCmseNSCall) + if (GuardWithBTI) + CallOpc = ARMISD::t2CALL_BTI; + else if (isCmseNSCall) CallOpc = ARMISD::tSECALL; else if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) CallOpc = ARMISD::CALL_NOLINK; @@ -2930,9 +2952,17 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization( // Indirect tail calls cannot be optimized for Thumb1 if the args // to the call take up r0-r3. The reason is that there are no legal registers // left to hold the pointer to the function to be called. - if (Subtarget->isThumb1Only() && Outs.size() >= 4 && - (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect)) - return false; + // Similarly, if the function uses return address sign and authentication, + // r12 is needed to hold the PAC and is not available to hold the callee + // address. + if (Outs.size() >= 4 && + (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect)) { + if (Subtarget->isThumb1Only()) + return false; + // Conservatively assume the function spills LR. + if (MF.getInfo<ARMFunctionInfo>()->shouldSignReturnAddress(true)) + return false; + } // Look for obvious safe cases to perform tail call optimization that do not // require ABI changes. This is what gcc calls sibcall. 
@@ -7616,7 +7646,10 @@ static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG, unsigned NumElts = VT.getVectorNumElements(); unsigned BoolMask; unsigned BitsPerBool; - if (NumElts == 4) { + if (NumElts == 2) { + BitsPerBool = 8; + BoolMask = 0xff; + } else if (NumElts == 4) { BitsPerBool = 4; BoolMask = 0xf; } else if (NumElts == 8) { @@ -7699,6 +7732,46 @@ static SDValue LowerBUILD_VECTORToVIDUP(SDValue Op, SelectionDAG &DAG, DAG.getConstant(N, DL, MVT::i32)); } +// Returns true if the operation N can be treated as qr instruction variant at +// operand Op. +static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op) { + switch (N->getOpcode()) { + case ISD::ADD: + case ISD::MUL: + case ISD::SADDSAT: + case ISD::UADDSAT: + return true; + case ISD::SUB: + case ISD::SSUBSAT: + case ISD::USUBSAT: + return N->getOperand(1).getNode() == Op; + case ISD::INTRINSIC_WO_CHAIN: + switch (N->getConstantOperandVal(0)) { + case Intrinsic::arm_mve_add_predicated: + case Intrinsic::arm_mve_mul_predicated: + case Intrinsic::arm_mve_qadd_predicated: + case Intrinsic::arm_mve_vhadd: + case Intrinsic::arm_mve_hadd_predicated: + case Intrinsic::arm_mve_vqdmulh: + case Intrinsic::arm_mve_qdmulh_predicated: + case Intrinsic::arm_mve_vqrdmulh: + case Intrinsic::arm_mve_qrdmulh_predicated: + case Intrinsic::arm_mve_vqdmull: + case Intrinsic::arm_mve_vqdmull_predicated: + return true; + case Intrinsic::arm_mve_sub_predicated: + case Intrinsic::arm_mve_qsub_predicated: + case Intrinsic::arm_mve_vhsub: + case Intrinsic::arm_mve_hsub_predicated: + return N->getOperand(2).getNode() == Op; + default: + return false; + } + default: + return false; + } +} + // If this is a case we can't handle, return null and let the default // expansion code take care of it. 
SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, @@ -7720,6 +7793,20 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, if (SplatUndef.isAllOnes()) return DAG.getUNDEF(VT); + // If all the users of this constant splat are qr instruction variants, + // generate a vdup of the constant. + if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == SplatBitSize && + (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32) && + all_of(BVN->uses(), + [BVN](const SDNode *U) { return IsQRMVEInstruction(U, BVN); })) { + EVT DupVT = SplatBitSize == 32 ? MVT::v4i32 + : SplatBitSize == 16 ? MVT::v8i16 + : MVT::v16i8; + SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32); + SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const); + return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup); + } + if ((ST->hasNEON() && SplatBitSize <= 64) || (ST->hasMVEIntegerOps() && SplatBitSize <= 64)) { // Check if an immediate VMOV works. @@ -8313,9 +8400,8 @@ static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, SDLoc DL(Op); SmallVector<SDValue, 8> VTBLMask; - for (ArrayRef<int>::iterator - I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I) - VTBLMask.push_back(DAG.getConstant(*I, DL, MVT::i32)); + for (int I : ShuffleMask) + VTBLMask.push_back(DAG.getConstant(I, DL, MVT::i32)); if (V2.getNode()->isUndef()) return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1, @@ -8346,6 +8432,8 @@ static SDValue LowerReverse_VECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { static EVT getVectorTyFromPredicateVector(EVT VT) { switch (VT.getSimpleVT().SimpleTy) { + case MVT::v2i1: + return MVT::v2f64; case MVT::v4i1: return MVT::v4i32; case MVT::v8i1: @@ -8427,7 +8515,14 @@ static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG, DAG.getUNDEF(NewVT), ShuffleMask); // Now return the result of comparing the shuffled vector with zero, - // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. 
+ // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. For a v2i1 + // we convert to a v4i1 compare to fill in the two halves of the i64 as i32s. + if (VT == MVT::v2i1) { + SDValue BC = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Shuffled); + SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, BC, + DAG.getConstant(ARMCC::NE, dl, MVT::i32)); + return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp); + } return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled, DAG.getConstant(ARMCC::NE, dl, MVT::i32)); } @@ -8927,8 +9022,15 @@ static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG, ConVec = ExtractInto(NewV1, ConVec, j); ConVec = ExtractInto(NewV2, ConVec, j); - // Now return the result of comparing the subvector with zero, - // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. + // Now return the result of comparing the subvector with zero, which will + // generate a real predicate, i.e. v4i1, v8i1 or v16i1. For a v2i1 we + // convert to a v4i1 compare to fill in the two halves of the i64 as i32s. 
+ if (VT == MVT::v2i1) { + SDValue BC = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, ConVec); + SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, BC, + DAG.getConstant(ARMCC::NE, dl, MVT::i32)); + return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp); + } return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec, DAG.getConstant(ARMCC::NE, dl, MVT::i32)); }; @@ -8993,6 +9095,22 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT(); + if (NumElts == 2) { + EVT SubVT = MVT::v4i32; + SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT); + for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j += 2) { + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1, + DAG.getIntPtrConstant(i, dl)); + SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt, + DAG.getConstant(j, dl, MVT::i32)); + SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt, + DAG.getConstant(j + 1, dl, MVT::i32)); + } + SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, SubVec, + DAG.getConstant(ARMCC::NE, dl, MVT::i32)); + return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp); + } + EVT SubVT = MVT::getVectorVT(ElType, NumElts); SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT); for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) { @@ -9839,16 +9957,17 @@ void ARMTargetLowering::ExpandDIV_Windows( static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) { LoadSDNode *LD = cast<LoadSDNode>(Op.getNode()); EVT MemVT = LD->getMemoryVT(); - assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) && + assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || + MemVT == MVT::v16i1) && "Expected a predicate type!"); assert(MemVT == Op.getValueType()); assert(LD->getExtensionType() == ISD::NON_EXTLOAD && "Expected a non-extending load"); assert(LD->isUnindexed() && "Expected a unindexed 
load"); - // The basic MVE VLDR on a v4i1/v8i1 actually loads the entire 16bit + // The basic MVE VLDR on a v2i1/v4i1/v8i1 actually loads the entire 16bit // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We - // need to make sure that 8/4 bits are actually loaded into the correct + // need to make sure that 8/4/2 bits are actually loaded into the correct // place, which means loading the value and then shuffling the values into // the bottom bits of the predicate. // Equally, VLDR for an v16i1 will actually load 32bits (so will be incorrect @@ -9895,14 +10014,15 @@ void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results, static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) { StoreSDNode *ST = cast<StoreSDNode>(Op.getNode()); EVT MemVT = ST->getMemoryVT(); - assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) && + assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || + MemVT == MVT::v16i1) && "Expected a predicate type!"); assert(MemVT == ST->getValue().getValueType()); assert(!ST->isTruncatingStore() && "Expected a non-extending store"); assert(ST->isUnindexed() && "Expected a unindexed store"); - // Only store the v4i1 or v8i1 worth of bits, via a buildvector with top bits - // unset and a scalar store. + // Only store the v2i1 or v4i1 or v8i1 worth of bits, via a buildvector with + // top bits unset and a scalar store. 
SDLoc dl(Op); SDValue Build = ST->getValue(); if (MemVT != MVT::v16i1) { @@ -9953,7 +10073,7 @@ static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG, {ST->getChain(), Lo, Hi, ST->getBasePtr()}, MemVT, ST->getMemOperand()); } else if (Subtarget->hasMVEIntegerOps() && - ((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || + ((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1))) { return LowerPredicateStore(Op, DAG); } @@ -10561,25 +10681,23 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, // associated with. DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad; unsigned MaxCSNum = 0; - for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E; - ++BB) { - if (!BB->isEHPad()) continue; + for (MachineBasicBlock &BB : *MF) { + if (!BB.isEHPad()) + continue; // FIXME: We should assert that the EH_LABEL is the first MI in the landing // pad. - for (MachineBasicBlock::iterator - II = BB->begin(), IE = BB->end(); II != IE; ++II) { - if (!II->isEHLabel()) continue; + for (MachineInstr &II : BB) { + if (!II.isEHLabel()) + continue; - MCSymbol *Sym = II->getOperand(0).getMCSymbol(); + MCSymbol *Sym = II.getOperand(0).getMCSymbol(); if (!MF->hasCallSiteLandingPad(Sym)) continue; SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym); - for (SmallVectorImpl<unsigned>::iterator - CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end(); - CSI != CSE; ++CSI) { - CallSiteNumToLPad[*CSI].push_back(&*BB); - MaxCSNum = std::max(MaxCSNum, *CSI); + for (unsigned Idx : CallSiteIdxs) { + CallSiteNumToLPad[Idx].push_back(&BB); + MaxCSNum = std::max(MaxCSNum, Idx); } break; } @@ -14002,8 +14120,8 @@ static SDValue PerformANDCombine(SDNode *N, EVT VT = N->getValueType(0); SelectionDAG &DAG = DCI.DAG; - if (!DAG.getTargetLoweringInfo().isTypeLegal(VT) || VT == MVT::v4i1 || - VT == MVT::v8i1 || VT == MVT::v16i1) + if (!DAG.getTargetLoweringInfo().isTypeLegal(VT) || VT == MVT::v2i1 || + VT == 
MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1) return SDValue(); APInt SplatBits, SplatUndef; @@ -14298,8 +14416,8 @@ static SDValue PerformORCombine(SDNode *N, if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); - if (Subtarget->hasMVEIntegerOps() && - (VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1)) + if (Subtarget->hasMVEIntegerOps() && (VT == MVT::v2i1 || VT == MVT::v4i1 || + VT == MVT::v8i1 || VT == MVT::v16i1)) return PerformORCombine_i1(N, DAG, Subtarget); APInt SplatBits, SplatUndef; @@ -14569,6 +14687,15 @@ static SDValue IsCMPZCSINC(SDNode *Cmp, ARMCC::CondCodes &CC) { if (Cmp->getOpcode() != ARMISD::CMPZ || !isNullConstant(Cmp->getOperand(1))) return SDValue(); SDValue CSInc = Cmp->getOperand(0); + + // Ignore any `And 1` nodes that may not yet have been removed. We are + // looking for a value that produces 1/0, so these have no effect on the + // code. + while (CSInc.getOpcode() == ISD::AND && + isa<ConstantSDNode>(CSInc.getOperand(1)) && + CSInc.getConstantOperandVal(1) == 1 && CSInc->hasOneUse()) + CSInc = CSInc.getOperand(0); + if (CSInc.getOpcode() != ARMISD::CSINC || !isNullConstant(CSInc.getOperand(0)) || !isNullConstant(CSInc.getOperand(1)) || !CSInc->hasOneUse()) @@ -17897,6 +18024,23 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { if (!VT.isInteger()) return SDValue(); + // Fold away an unneccessary CMPZ/CMOV + // CMOV A, B, C1, $cpsr, (CMPZ (CMOV 1, 0, C2, D), 0) -> + // if C1==EQ -> CMOV A, B, C2, $cpsr, D + // if C1==NE -> CMOV A, B, NOT(C2), $cpsr, D + if (N->getConstantOperandVal(2) == ARMCC::EQ || + N->getConstantOperandVal(2) == ARMCC::NE) { + ARMCC::CondCodes Cond; + if (SDValue C = IsCMPZCSINC(N->getOperand(4).getNode(), Cond)) { + if (N->getConstantOperandVal(2) == ARMCC::NE) + Cond = ARMCC::getOppositeCondition(Cond); + return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0), + N->getOperand(1), + DAG.getTargetConstant(Cond, SDLoc(N), MVT::i32), + 
N->getOperand(3), C); + } + } + // Materialize a boolean comparison for integers so we can avoid branching. if (isNullConstant(FalseVal)) { if (CC == ARMCC::EQ && isOneConstant(TrueVal)) { @@ -18564,7 +18708,8 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, return false; // These are for predicates - if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1)) { + if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1 || + Ty == MVT::v2i1)) { if (Fast) *Fast = true; return true; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index e3b422358cae..1c5f8389f57c 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -69,6 +69,7 @@ class VectorType; CALL_PRED, // Function call that's predicable. CALL_NOLINK, // Function call with branch not branch-and-link. tSECALL, // CMSE non-secure function call. + t2CALL_BTI, // Thumb function call followed by BTI instruction. BRCOND, // Conditional branch. BR_JT, // Jumptable branch. BR2_JT, // Jumptable branch (2 level - jumptable entry is a jump). diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td index f53814a80e01..1ae0354ffc37 100644 --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -254,13 +254,6 @@ class MVEVectorVTInfo<ValueType vec, ValueType dblvec, // An LLVM ValueType representing a corresponding vector of // predicate bits, for use in ISel patterns that handle an IR // intrinsic describing the predicated form of the instruction. - // - // Usually, for a vector of N things, this will be vNi1. But for - // vectors of 2 values, we make an exception, and use v4i1 instead - // of v2i1. Rationale: MVE codegen doesn't support doing all the - // auxiliary operations on v2i1 (vector shuffles etc), and also, - // there's no MVE compare instruction that will _generate_ v2i1 - // directly. 
ValueType Pred = pred; // Same as Pred but for DblVec rather than Vec. @@ -294,25 +287,25 @@ class MVEVectorVTInfo<ValueType vec, ValueType dblvec, // Integer vector types that don't treat signed and unsigned differently. def MVE_v16i8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, v8i1, 0b00, "i", ?>; def MVE_v8i16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, v4i1, 0b01, "i", ?>; -def MVE_v4i32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, v4i1, 0b10, "i", ?>; -def MVE_v2i64 : MVEVectorVTInfo<v2i64, ?, v4i1, ?, 0b11, "i", ?>; +def MVE_v4i32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, v2i1, 0b10, "i", ?>; +def MVE_v2i64 : MVEVectorVTInfo<v2i64, ?, v2i1, ?, 0b11, "i", ?>; // Explicitly signed and unsigned integer vectors. They map to the // same set of LLVM ValueTypes as above, but are represented // differently in assembly and instruction encodings. def MVE_v16s8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, v8i1, 0b00, "s", 0b0>; def MVE_v8s16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, v4i1, 0b01, "s", 0b0>; -def MVE_v4s32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, v4i1, 0b10, "s", 0b0>; -def MVE_v2s64 : MVEVectorVTInfo<v2i64, ?, v4i1, ?, 0b11, "s", 0b0>; +def MVE_v4s32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, v2i1, 0b10, "s", 0b0>; +def MVE_v2s64 : MVEVectorVTInfo<v2i64, ?, v2i1, ?, 0b11, "s", 0b0>; def MVE_v16u8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, v8i1, 0b00, "u", 0b1>; def MVE_v8u16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, v4i1, 0b01, "u", 0b1>; -def MVE_v4u32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, v4i1, 0b10, "u", 0b1>; -def MVE_v2u64 : MVEVectorVTInfo<v2i64, ?, v4i1, ?, 0b11, "u", 0b1>; +def MVE_v4u32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, v2i1, 0b10, "u", 0b1>; +def MVE_v2u64 : MVEVectorVTInfo<v2i64, ?, v2i1, ?, 0b11, "u", 0b1>; // FP vector types. 
def MVE_v8f16 : MVEVectorVTInfo<v8f16, v4f32, v8i1, v4i1, 0b01, "f", ?>; -def MVE_v4f32 : MVEVectorVTInfo<v4f32, v2f64, v4i1, v4i1, 0b10, "f", ?>; -def MVE_v2f64 : MVEVectorVTInfo<v2f64, ?, v4i1, ?, 0b11, "f", ?>; +def MVE_v4f32 : MVEVectorVTInfo<v4f32, v2f64, v4i1, v2i1, 0b10, "f", ?>; +def MVE_v2f64 : MVEVectorVTInfo<v2f64, ?, v2i1, ?, 0b11, "f", ?>; // Polynomial vector types. def MVE_v16p8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, v8i1, 0b11, "p", 0b0>; @@ -2260,6 +2253,31 @@ let Predicates = [HasMVEInt] in { (v4i32 (ARMvmovImm (i32 1)))), (i32 1))), (MVE_VRHADDu32 MQPR:$Qm, MQPR:$Qn)>; + + def : Pat<(v16i8 (ARMvshrsImm (addnsw (addnsw (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)), + (v16i8 (ARMvdup (i32 1)))), + (i32 1))), + (MVE_VRHADDs8 MQPR:$Qm, MQPR:$Qn)>; + def : Pat<(v8i16 (ARMvshrsImm (addnsw (addnsw (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)), + (v8i16 (ARMvdup (i32 1)))), + (i32 1))), + (MVE_VRHADDs16 MQPR:$Qm, MQPR:$Qn)>; + def : Pat<(v4i32 (ARMvshrsImm (addnsw (addnsw (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)), + (v4i32 (ARMvdup (i32 1)))), + (i32 1))), + (MVE_VRHADDs32 MQPR:$Qm, MQPR:$Qn)>; + def : Pat<(v16i8 (ARMvshruImm (addnuw (addnuw (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)), + (v16i8 (ARMvdup (i32 1)))), + (i32 1))), + (MVE_VRHADDu8 MQPR:$Qm, MQPR:$Qn)>; + def : Pat<(v8i16 (ARMvshruImm (addnuw (addnuw (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)), + (v8i16 (ARMvdup (i32 1)))), + (i32 1))), + (MVE_VRHADDu16 MQPR:$Qm, MQPR:$Qn)>; + def : Pat<(v4i32 (ARMvshruImm (addnuw (addnuw (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)), + (v4i32 (ARMvdup (i32 1)))), + (i32 1))), + (MVE_VRHADDu32 MQPR:$Qm, MQPR:$Qn)>; } @@ -4450,6 +4468,11 @@ multiclass two_predops<SDPatternOperator opnode, Instruction insn> { (insn (i32 (COPY_TO_REGCLASS (v4i1 VCCR:$p1), rGPR)), (i32 (COPY_TO_REGCLASS (v4i1 VCCR:$p2), rGPR))), VCCR))>; + def v2i1 : Pat<(v2i1 (opnode (v2i1 VCCR:$p1), (v2i1 VCCR:$p2))), + (v2i1 (COPY_TO_REGCLASS + (insn (i32 (COPY_TO_REGCLASS (v2i1 VCCR:$p1), rGPR)), + (i32 (COPY_TO_REGCLASS (v2i1 VCCR:$p2), 
rGPR))), + VCCR))>; } let Predicates = [HasMVEInt] in { @@ -4469,20 +4492,20 @@ def load_align4 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ }]>; let Predicates = [HasMVEInt] in { - foreach VT = [ v4i1, v8i1, v16i1 ] in { + foreach VT = [ v2i1, v4i1, v8i1, v16i1 ] in { def : Pat<(i32 (predicate_cast (VT VCCR:$src))), (i32 (COPY_TO_REGCLASS (VT VCCR:$src), VCCR))>; def : Pat<(VT (predicate_cast (i32 VCCR:$src))), (VT (COPY_TO_REGCLASS (i32 VCCR:$src), VCCR))>; - foreach VT2 = [ v4i1, v8i1, v16i1 ] in + foreach VT2 = [ v2i1, v4i1, v8i1, v16i1 ] in def : Pat<(VT (predicate_cast (VT2 VCCR:$src))), (VT (COPY_TO_REGCLASS (VT2 VCCR:$src), VCCR))>; } // If we happen to be casting from a load we can convert that straight // into a predicate load, so long as the load is of the correct type. - foreach VT = [ v4i1, v8i1, v16i1 ] in { + foreach VT = [ v2i1, v4i1, v8i1, v16i1 ] in { def : Pat<(VT (predicate_cast (i32 (load_align4 taddrmode_imm7<2>:$addr)))), (VT (VLDR_P0_off taddrmode_imm7<2>:$addr))>; } @@ -5350,33 +5373,40 @@ class MVE_VxADDSUB_qr<string iname, string suffix, } multiclass MVE_VHADDSUB_qr_m<string iname, MVEVectorVTInfo VTI, bit subtract, - Intrinsic unpred_int, Intrinsic pred_int> { + Intrinsic unpred_int, Intrinsic pred_int, PatFrag add_op, + SDNode shift_op> { def "" : MVE_VxADDSUB_qr<iname, VTI.Suffix, VTI.Unsigned, VTI.Size, subtract, VTI.Size>; defm : MVE_vec_scalar_int_pat_m<!cast<Instruction>(NAME), VTI, unpred_int, pred_int, 1, 1>; + defvar Inst = !cast<Instruction>(NAME); + + let Predicates = [HasMVEInt] in { + def : Pat<(VTI.Vec (shift_op (add_op (VTI.Vec MQPR:$Qm), (VTI.Vec (ARMvdup rGPR:$Rn))), (i32 1))), + (Inst MQPR:$Qm, rGPR:$Rn)>; + } } -multiclass MVE_VHADD_qr_m<MVEVectorVTInfo VTI> : - MVE_VHADDSUB_qr_m<"vhadd", VTI, 0b0, int_arm_mve_vhadd, - int_arm_mve_hadd_predicated>; +multiclass MVE_VHADD_qr_m<MVEVectorVTInfo VTI, PatFrag add_op, SDNode shift_op> : + MVE_VHADDSUB_qr_m<"vhadd", VTI, 0b0, int_arm_mve_vhadd, int_arm_mve_hadd_predicated, 
+ add_op, shift_op>; -multiclass MVE_VHSUB_qr_m<MVEVectorVTInfo VTI> : - MVE_VHADDSUB_qr_m<"vhsub", VTI, 0b1, int_arm_mve_vhsub, - int_arm_mve_hsub_predicated>; +multiclass MVE_VHSUB_qr_m<MVEVectorVTInfo VTI, PatFrag add_op, SDNode shift_op> : + MVE_VHADDSUB_qr_m<"vhsub", VTI, 0b1, int_arm_mve_vhsub, int_arm_mve_hsub_predicated, + add_op, shift_op>; -defm MVE_VHADD_qr_s8 : MVE_VHADD_qr_m<MVE_v16s8>; -defm MVE_VHADD_qr_s16 : MVE_VHADD_qr_m<MVE_v8s16>; -defm MVE_VHADD_qr_s32 : MVE_VHADD_qr_m<MVE_v4s32>; -defm MVE_VHADD_qr_u8 : MVE_VHADD_qr_m<MVE_v16u8>; -defm MVE_VHADD_qr_u16 : MVE_VHADD_qr_m<MVE_v8u16>; -defm MVE_VHADD_qr_u32 : MVE_VHADD_qr_m<MVE_v4u32>; +defm MVE_VHADD_qr_s8 : MVE_VHADD_qr_m<MVE_v16s8, addnsw, ARMvshrsImm>; +defm MVE_VHADD_qr_s16 : MVE_VHADD_qr_m<MVE_v8s16, addnsw, ARMvshrsImm>; +defm MVE_VHADD_qr_s32 : MVE_VHADD_qr_m<MVE_v4s32, addnsw, ARMvshrsImm>; +defm MVE_VHADD_qr_u8 : MVE_VHADD_qr_m<MVE_v16u8, addnuw, ARMvshruImm>; +defm MVE_VHADD_qr_u16 : MVE_VHADD_qr_m<MVE_v8u16, addnuw, ARMvshruImm>; +defm MVE_VHADD_qr_u32 : MVE_VHADD_qr_m<MVE_v4u32, addnuw, ARMvshruImm>; -defm MVE_VHSUB_qr_s8 : MVE_VHSUB_qr_m<MVE_v16s8>; -defm MVE_VHSUB_qr_s16 : MVE_VHSUB_qr_m<MVE_v8s16>; -defm MVE_VHSUB_qr_s32 : MVE_VHSUB_qr_m<MVE_v4s32>; -defm MVE_VHSUB_qr_u8 : MVE_VHSUB_qr_m<MVE_v16u8>; -defm MVE_VHSUB_qr_u16 : MVE_VHSUB_qr_m<MVE_v8u16>; -defm MVE_VHSUB_qr_u32 : MVE_VHSUB_qr_m<MVE_v4u32>; +defm MVE_VHSUB_qr_s8 : MVE_VHSUB_qr_m<MVE_v16s8, subnsw, ARMvshrsImm>; +defm MVE_VHSUB_qr_s16 : MVE_VHSUB_qr_m<MVE_v8s16, subnsw, ARMvshrsImm>; +defm MVE_VHSUB_qr_s32 : MVE_VHSUB_qr_m<MVE_v4s32, subnsw, ARMvshrsImm>; +defm MVE_VHSUB_qr_u8 : MVE_VHSUB_qr_m<MVE_v16u8, subnuw, ARMvshruImm>; +defm MVE_VHSUB_qr_u16 : MVE_VHSUB_qr_m<MVE_v8u16, subnuw, ARMvshruImm>; +defm MVE_VHSUB_qr_u32 : MVE_VHSUB_qr_m<MVE_v4u32, subnuw, ARMvshruImm>; multiclass MVE_VADDSUB_qr_f<string iname, MVEVectorVTInfo VTI, bit subtract, SDNode Op, Intrinsic PredInt, SDPatternOperator IdentityVec> { @@ -6778,11 
+6808,15 @@ let Predicates = [HasMVEInt] in { (v8i16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred, zero_reg))>; def : Pat<(v4i32 (vselect (v4i1 VCCR:$pred), (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), (v4i32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred, zero_reg))>; + def : Pat<(v2i64 (vselect (v2i1 VCCR:$pred), (v2i64 MQPR:$v1), (v2i64 MQPR:$v2))), + (v2i64 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred, zero_reg))>; def : Pat<(v8f16 (vselect (v8i1 VCCR:$pred), (v8f16 MQPR:$v1), (v8f16 MQPR:$v2))), (v8f16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred, zero_reg))>; def : Pat<(v4f32 (vselect (v4i1 VCCR:$pred), (v4f32 MQPR:$v1), (v4f32 MQPR:$v2))), (v4f32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred, zero_reg))>; + def : Pat<(v2f64 (vselect (v2i1 VCCR:$pred), (v2f64 MQPR:$v1), (v2f64 MQPR:$v2))), + (v2f64 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred, zero_reg))>; def : Pat<(v16i8 (vselect (v16i8 MQPR:$pred), (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), (v16i8 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, @@ -6808,6 +6842,8 @@ let Predicates = [HasMVEInt] in { (v8i16 (MVE_VPSEL (MVE_VMOVimmi16 1), (MVE_VMOVimmi16 0), ARMVCCNone, VCCR:$pred, zero_reg))>; def : Pat<(v4i32 (zext (v4i1 VCCR:$pred))), (v4i32 (MVE_VPSEL (MVE_VMOVimmi32 1), (MVE_VMOVimmi32 0), ARMVCCNone, VCCR:$pred, zero_reg))>; + def : Pat<(v2i64 (zext (v2i1 VCCR:$pred))), + (v2i64 (MVE_VPSEL (MVE_VMOVimmi64 1), (MVE_VMOVimmi32 0), ARMVCCNone, VCCR:$pred, zero_reg))>; def : Pat<(v16i8 (sext (v16i1 VCCR:$pred))), (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi8 0), ARMVCCNone, VCCR:$pred, zero_reg))>; @@ -6815,6 +6851,8 @@ let Predicates = [HasMVEInt] in { (v8i16 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi16 0), ARMVCCNone, VCCR:$pred, zero_reg))>; def : Pat<(v4i32 (sext (v4i1 VCCR:$pred))), (v4i32 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi32 0), ARMVCCNone, VCCR:$pred, zero_reg))>; + def : Pat<(v2i64 (sext (v2i1 VCCR:$pred))), + (v2i64 (MVE_VPSEL (MVE_VMOVimmi8 
255), (MVE_VMOVimmi32 0), ARMVCCNone, VCCR:$pred, zero_reg))>; def : Pat<(v16i8 (anyext (v16i1 VCCR:$pred))), (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 1), (MVE_VMOVimmi8 0), ARMVCCNone, VCCR:$pred, zero_reg))>; @@ -6822,6 +6860,8 @@ let Predicates = [HasMVEInt] in { (v8i16 (MVE_VPSEL (MVE_VMOVimmi16 1), (MVE_VMOVimmi16 0), ARMVCCNone, VCCR:$pred, zero_reg))>; def : Pat<(v4i32 (anyext (v4i1 VCCR:$pred))), (v4i32 (MVE_VPSEL (MVE_VMOVimmi32 1), (MVE_VMOVimmi32 0), ARMVCCNone, VCCR:$pred, zero_reg))>; + def : Pat<(v2i64 (anyext (v2i1 VCCR:$pred))), + (v2i64 (MVE_VPSEL (MVE_VMOVimmi64 1), (MVE_VMOVimmi32 0), ARMVCCNone, VCCR:$pred, zero_reg))>; } let Predicates = [HasMVEFloat] in { @@ -6862,6 +6902,8 @@ def MVE_VPNOT : MVE_p<(outs VCCR:$P0), (ins VCCR:$P0_in), NoItinerary, } let Predicates = [HasMVEInt] in { + def : Pat<(v2i1 (xor (v2i1 VCCR:$pred), (v2i1 (predicate_cast (i32 65535))))), + (v2i1 (MVE_VPNOT (v2i1 VCCR:$pred)))>; def : Pat<(v4i1 (xor (v4i1 VCCR:$pred), (v4i1 (predicate_cast (i32 65535))))), (v4i1 (MVE_VPNOT (v4i1 VCCR:$pred)))>; def : Pat<(v8i1 (xor (v8i1 VCCR:$pred), (v8i1 (predicate_cast (i32 65535))))), diff --git a/llvm/lib/Target/ARM/ARMInstrThumb2.td b/llvm/lib/Target/ARM/ARMInstrThumb2.td index 4471317f4ea4..6e8e61ca2b8e 100644 --- a/llvm/lib/Target/ARM/ARMInstrThumb2.td +++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td @@ -5736,3 +5736,10 @@ def t2BTI : PACBTIHintSpaceNoOpsInst<"bti", 0b00001111>; def t2AUT : PACBTIHintSpaceUseInst<"aut", 0b00101101> { let hasSideEffects = 1; } + +def ARMt2CallBTI : SDNode<"ARMISD::t2CALL_BTI", SDT_ARMcall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; + +def t2CALL_BTI : PseudoInst<(outs), (ins pred:$p, thumb_bl_target:$func), + IIC_Br, [(ARMt2CallBTI tglobaladdr:$func)]>, + Requires<[IsThumb2]>, Sched<[WriteBrL]>; diff --git a/llvm/lib/Target/ARM/ARMInstrVFP.td b/llvm/lib/Target/ARM/ARMInstrVFP.td index 9d1bfa414dff..dc5f1b92a6c2 100644 --- a/llvm/lib/Target/ARM/ARMInstrVFP.td +++ 
b/llvm/lib/Target/ARM/ARMInstrVFP.td @@ -1076,6 +1076,9 @@ multiclass vrint_inst_anpm<string opc, bits<2> rm, } } + def : InstAlias<!strconcat("vrint", opc, ".f16.f16\t$Sd, $Sm"), + (!cast<Instruction>(NAME#"H") HPR:$Sd, HPR:$Sm), 0>, + Requires<[HasFullFP16]>; def : InstAlias<!strconcat("vrint", opc, ".f32.f32\t$Sd, $Sm"), (!cast<Instruction>(NAME#"S") SPR:$Sd, SPR:$Sm), 0>, Requires<[HasFPARMv8]>; diff --git a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index 3b10c60a0654..ef5fc12feb54 100644 --- a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -2121,7 +2121,7 @@ bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { bool Modified = false; for (MachineBasicBlock &MBB : Fn) { Modified |= LoadStoreMultipleOpti(MBB); - if (STI->hasV5TOps()) + if (STI->hasV5TOps() && !AFI->shouldSignReturnAddress()) Modified |= MergeReturnIntoLDM(MBB); if (isThumb1) Modified |= CombineMovBx(MBB); @@ -2349,9 +2349,8 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB, unsigned LastOpcode = 0; unsigned LastBytes = 0; unsigned NumMove = 0; - for (int i = Ops.size() - 1; i >= 0; --i) { + for (MachineInstr *Op : llvm::reverse(Ops)) { // Make sure each operation has the same kind. 
- MachineInstr *Op = Ops[i]; unsigned LSMOpcode = getLoadStoreMultipleOpcode(Op->getOpcode(), ARM_AM::ia); if (LastOpcode && LSMOpcode != LastOpcode) diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp index 3874db5792d6..f822672c4477 100644 --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -251,10 +251,7 @@ namespace { SetVector<MachineInstr *> &Predicates = PredicatedInsts[MI]->Predicates; if (Exclusive && Predicates.size() != 1) return false; - for (auto *PredMI : Predicates) - if (isVCTP(PredMI)) - return true; - return false; + return llvm::any_of(Predicates, isVCTP); } // Is the VPST, controlling the block entry, predicated upon a VCTP. @@ -351,10 +348,7 @@ namespace { } bool containsVCTP() const { - for (auto *MI : Insts) - if (isVCTP(MI)) - return true; - return false; + return llvm::any_of(Insts, isVCTP); } unsigned size() const { return Insts.size(); } @@ -1334,8 +1328,8 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { bool Changed = false; // Process inner loops first. - for (auto I = ML->begin(), E = ML->end(); I != E; ++I) - Changed |= ProcessLoop(*I); + for (MachineLoop *L : *ML) + Changed |= ProcessLoop(L); LLVM_DEBUG({ dbgs() << "ARM Loops: Processing loop containing:\n"; @@ -1699,7 +1693,7 @@ void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) { // If any of the instructions between the VCMP and VPST are predicated // then a different code path is expected to have merged the VCMP and // VPST already. 
- if (!std::any_of(++MachineBasicBlock::iterator(VCMP), + if (std::none_of(++MachineBasicBlock::iterator(VCMP), MachineBasicBlock::iterator(VPST), hasVPRUse) && RDA->hasSameReachingDef(VCMP, VPST, VCMP->getOperand(1).getReg()) && RDA->hasSameReachingDef(VCMP, VPST, VCMP->getOperand(2).getReg())) { diff --git a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h index 4077fc058217..d8d937055d23 100644 --- a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h +++ b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -289,7 +289,7 @@ public: return false; if (SignReturnAddressAll) return true; - return LRSpilled; + return SpillsLR; } bool branchTargetEnforcement() const { return BranchTargetEnforcement; } diff --git a/llvm/lib/Target/ARM/ARMRegisterInfo.td b/llvm/lib/Target/ARM/ARMRegisterInfo.td index 760a5a5a20cf..194d65cad8d1 100644 --- a/llvm/lib/Target/ARM/ARMRegisterInfo.td +++ b/llvm/lib/Target/ARM/ARMRegisterInfo.td @@ -211,6 +211,8 @@ def FPCXTS : ARMReg<15, "fpcxts">; def ZR : ARMReg<15, "zr">, DwarfRegNum<[15]>; +def RA_AUTH_CODE : ARMReg<12, "ra_auth_code">, DwarfRegNum<[143]>; + // Register classes. // // pc == Program Counter @@ -395,7 +397,7 @@ def CCR : RegisterClass<"ARM", [i32], 32, (add CPSR)> { } // MVE Condition code register. -def VCCR : RegisterClass<"ARM", [i32, v16i1, v8i1, v4i1], 32, (add VPR)> { +def VCCR : RegisterClass<"ARM", [i32, v16i1, v8i1, v4i1, v2i1], 32, (add VPR)> { // let CopyCost = -1; // Don't allow copying of status registers. 
} diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h index d51a888c951f..e61b90af31b0 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.h +++ b/llvm/lib/Target/ARM/ARMSubtarget.h @@ -18,6 +18,7 @@ #include "ARMConstantPoolValue.h" #include "ARMFrameLowering.h" #include "ARMISelLowering.h" +#include "ARMMachineFunctionInfo.h" #include "ARMSelectionDAGInfo.h" #include "llvm/ADT/Triple.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -534,6 +535,10 @@ protected: /// Selected instruction itineraries (one entry per itinerary class.) InstrItineraryData InstrItins; + /// NoBTIAtReturnTwice - Don't place a BTI instruction after + /// return-twice constructs (setjmp) + bool NoBTIAtReturnTwice = false; + /// Options passed via command line that could influence the target const TargetOptions &Options; @@ -840,6 +845,8 @@ public: /// to lr. This is always required on Thumb1-only targets, as the push and /// pop instructions can't access the high registers. bool splitFramePushPop(const MachineFunction &MF) const { + if (MF.getInfo<ARMFunctionInfo>()->shouldSignReturnAddress()) + return true; return (getFramePointerReg() == ARM::R7 && MF.getTarget().Options.DisableFramePointerElim(MF)) || isThumb1Only(); @@ -948,6 +955,8 @@ public: bool hardenSlsRetBr() const { return HardenSlsRetBr; } bool hardenSlsBlr() const { return HardenSlsBlr; } bool hardenSlsNoComdat() const { return HardenSlsNoComdat; } + + bool getNoBTIAtReturnTwice() const { return NoBTIAtReturnTwice; } }; } // end namespace llvm diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 39f407ba7149..bfe078b06861 100644 --- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -137,21 +137,18 @@ public: int getFPReg() const { return FPReg; } void emitFnStartLocNotes() const { - for (Locs::const_iterator FI = FnStartLocs.begin(), FE = FnStartLocs.end(); - FI != FE; ++FI) - 
Parser.Note(*FI, ".fnstart was specified here"); + for (const SMLoc &Loc : FnStartLocs) + Parser.Note(Loc, ".fnstart was specified here"); } void emitCantUnwindLocNotes() const { - for (Locs::const_iterator UI = CantUnwindLocs.begin(), - UE = CantUnwindLocs.end(); UI != UE; ++UI) - Parser.Note(*UI, ".cantunwind was specified here"); + for (const SMLoc &Loc : CantUnwindLocs) + Parser.Note(Loc, ".cantunwind was specified here"); } void emitHandlerDataLocNotes() const { - for (Locs::const_iterator HI = HandlerDataLocs.begin(), - HE = HandlerDataLocs.end(); HI != HE; ++HI) - Parser.Note(*HI, ".handlerdata was specified here"); + for (const SMLoc &Loc : HandlerDataLocs) + Parser.Note(Loc, ".handlerdata was specified here"); } void emitPersonalityLocNotes() const { @@ -452,7 +449,8 @@ class ARMAsmParser : public MCTargetAsmParser { int tryParseRegister(); bool tryParseRegisterWithWriteBack(OperandVector &); int tryParseShiftRegister(OperandVector &); - bool parseRegisterList(OperandVector &, bool EnforceOrder = true); + bool parseRegisterList(OperandVector &, bool EnforceOrder = true, + bool AllowRAAC = false); bool parseMemory(OperandVector &); bool parseOperand(OperandVector &, StringRef Mnemonic); bool parsePrefix(ARMMCExpr::VariantKind &RefKind); @@ -2572,17 +2570,15 @@ public: void addRegListOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); const SmallVectorImpl<unsigned> &RegList = getRegList(); - for (SmallVectorImpl<unsigned>::const_iterator - I = RegList.begin(), E = RegList.end(); I != E; ++I) - Inst.addOperand(MCOperand::createReg(*I)); + for (unsigned Reg : RegList) + Inst.addOperand(MCOperand::createReg(Reg)); } void addRegListWithAPSROperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); const SmallVectorImpl<unsigned> &RegList = getRegList(); - for (SmallVectorImpl<unsigned>::const_iterator - I = RegList.begin(), E = RegList.end(); I != E; ++I) - 
Inst.addOperand(MCOperand::createReg(*I)); + for (unsigned Reg : RegList) + Inst.addOperand(MCOperand::createReg(Reg)); } void addDPRRegListOperands(MCInst &Inst, unsigned N) const { @@ -4464,8 +4460,8 @@ insertNoDuplicates(SmallVectorImpl<std::pair<unsigned, unsigned>> &Regs, } /// Parse a register list. -bool ARMAsmParser::parseRegisterList(OperandVector &Operands, - bool EnforceOrder) { +bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder, + bool AllowRAAC) { MCAsmParser &Parser = getParser(); if (Parser.getTok().isNot(AsmToken::LCurly)) return TokError("Token is not a Left Curly Brace"); @@ -4478,7 +4474,8 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, int Reg = tryParseRegister(); if (Reg == -1) return Error(RegLoc, "register expected"); - + if (!AllowRAAC && Reg == ARM::RA_AUTH_CODE) + return Error(RegLoc, "pseudo-register not allowed"); // The reglist instructions have at most 16 registers, so reserve // space for that many. int EReg = 0; @@ -4492,7 +4489,8 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, ++Reg; } const MCRegisterClass *RC; - if (ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg)) + if (Reg == ARM::RA_AUTH_CODE || + ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg)) RC = &ARMMCRegisterClasses[ARM::GPRRegClassID]; else if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg)) RC = &ARMMCRegisterClasses[ARM::DPRRegClassID]; @@ -4513,11 +4511,15 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, while (Parser.getTok().is(AsmToken::Comma) || Parser.getTok().is(AsmToken::Minus)) { if (Parser.getTok().is(AsmToken::Minus)) { + if (Reg == ARM::RA_AUTH_CODE) + return Error(RegLoc, "pseudo-register not allowed"); Parser.Lex(); // Eat the minus. 
SMLoc AfterMinusLoc = Parser.getTok().getLoc(); int EndReg = tryParseRegister(); if (EndReg == -1) return Error(AfterMinusLoc, "register expected"); + if (EndReg == ARM::RA_AUTH_CODE) + return Error(AfterMinusLoc, "pseudo-register not allowed"); // Allow Q regs and just interpret them as the two D sub-registers. if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(EndReg)) EndReg = getDRegFromQReg(EndReg) + 1; @@ -4526,7 +4528,9 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, if (Reg == EndReg) continue; // The register must be in the same register class as the first. - if (!RC->contains(EndReg)) + if ((Reg == ARM::RA_AUTH_CODE && + RC != &ARMMCRegisterClasses[ARM::GPRRegClassID]) || + (Reg != ARM::RA_AUTH_CODE && !RC->contains(Reg))) return Error(AfterMinusLoc, "invalid register in register list"); // Ranges must go from low to high. if (MRI->getEncodingValue(Reg) > MRI->getEncodingValue(EndReg)) @@ -4551,13 +4555,15 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, Reg = tryParseRegister(); if (Reg == -1) return Error(RegLoc, "register expected"); + if (!AllowRAAC && Reg == ARM::RA_AUTH_CODE) + return Error(RegLoc, "pseudo-register not allowed"); // Allow Q regs and just interpret them as the two D sub-registers. bool isQReg = false; if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) { Reg = getDRegFromQReg(Reg); isQReg = true; } - if (!RC->contains(Reg) && + if (Reg != ARM::RA_AUTH_CODE && !RC->contains(Reg) && RC->getID() == ARMMCRegisterClasses[ARM::GPRRegClassID].getID() && ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID].contains(Reg)) { // switch the register classes, as GPRwithAPSRnospRegClassID is a partial @@ -4577,7 +4583,9 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, continue; } // The register must be in the same register class as the first. 
- if (!RC->contains(Reg)) + if ((Reg == ARM::RA_AUTH_CODE && + RC != &ARMMCRegisterClasses[ARM::GPRRegClassID]) || + (Reg != ARM::RA_AUTH_CODE && !RC->contains(Reg))) return Error(RegLoc, "invalid register in register list"); // In most cases, the list must be monotonically increasing. An // exception is CLRM, which is order-independent anyway, so @@ -7106,13 +7114,12 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, return Error(Loc, "too many conditions on VPT instruction"); } unsigned Mask = 8; - for (unsigned i = ITMask.size(); i != 0; --i) { - char pos = ITMask[i - 1]; - if (pos != 't' && pos != 'e') { + for (char Pos : llvm::reverse(ITMask)) { + if (Pos != 't' && Pos != 'e') { return Error(Loc, "illegal IT block condition mask '" + ITMask + "'"); } Mask >>= 1; - if (ITMask[i - 1] == 'e') + if (Pos == 'e') Mask |= 8; } Operands.push_back(ARMOperand::CreateITMask(Mask, Loc)); @@ -11685,7 +11692,7 @@ bool ARMAsmParser::parseDirectiveRegSave(SMLoc L, bool IsVector) { SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> Operands; // Parse the register list - if (parseRegisterList(Operands) || + if (parseRegisterList(Operands, true, true) || parseToken(AsmToken::EndOfStatement, "unexpected token in directive")) return true; ARMOperand &Op = (ARMOperand &)*Operands[0]; diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index 896b104e8d97..e060e59e3759 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -1289,34 +1289,65 @@ void ARMELFStreamer::emitPad(int64_t Offset) { PendingOffset -= Offset; } -void ARMELFStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList, - bool IsVector) { - // Collect the registers in the register list - unsigned Count = 0; +static std::pair<unsigned, unsigned> +collectHWRegs(const MCRegisterInfo &MRI, unsigned Idx, + const SmallVectorImpl<unsigned> &RegList, 
bool IsVector, + uint32_t &Mask_) { uint32_t Mask = 0; - const MCRegisterInfo *MRI = getContext().getRegisterInfo(); - for (size_t i = 0; i < RegList.size(); ++i) { - unsigned Reg = MRI->getEncodingValue(RegList[i]); + unsigned Count = 0; + while (Idx > 0) { + unsigned Reg = RegList[Idx - 1]; + if (Reg == ARM::RA_AUTH_CODE) + break; + Reg = MRI.getEncodingValue(Reg); assert(Reg < (IsVector ? 32U : 16U) && "Register out of range"); unsigned Bit = (1u << Reg); if ((Mask & Bit) == 0) { Mask |= Bit; ++Count; } + --Idx; } - // Track the change the $sp offset: For the .save directive, the - // corresponding push instruction will decrease the $sp by (4 * Count). - // For the .vsave directive, the corresponding vpush instruction will - // decrease $sp by (8 * Count). - SPOffset -= Count * (IsVector ? 8 : 4); + Mask_ = Mask; + return {Idx, Count}; +} - // Emit the opcode - FlushPendingOffset(); - if (IsVector) - UnwindOpAsm.EmitVFPRegSave(Mask); - else - UnwindOpAsm.EmitRegSave(Mask); +void ARMELFStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList, + bool IsVector) { + uint32_t Mask; + unsigned Idx, Count; + const MCRegisterInfo &MRI = *getContext().getRegisterInfo(); + + // Collect the registers in the register list. Issue unwinding instructions in + // three parts: ordinary hardware registers, return address authentication + // code pseudo register, the rest of the registers. The RA PAC is kept in an + // architectural register (usually r12), but we treat it as a special case in + // order to distinguish between that register containing RA PAC or a general + // value. + Idx = RegList.size(); + while (Idx > 0) { + std::tie(Idx, Count) = collectHWRegs(MRI, Idx, RegList, IsVector, Mask); + if (Count) { + // Track the change the $sp offset: For the .save directive, the + // corresponding push instruction will decrease the $sp by (4 * Count). + // For the .vsave directive, the corresponding vpush instruction will + // decrease $sp by (8 * Count). 
+ SPOffset -= Count * (IsVector ? 8 : 4); + + // Emit the opcode + FlushPendingOffset(); + if (IsVector) + UnwindOpAsm.EmitVFPRegSave(Mask); + else + UnwindOpAsm.EmitRegSave(Mask); + } else if (Idx > 0 && RegList[Idx - 1] == ARM::RA_AUTH_CODE) { + --Idx; + SPOffset -= 4; + FlushPendingOffset(); + UnwindOpAsm.EmitRegSave(0); + } + } } void ARMELFStreamer::emitUnwindRaw(int64_t Offset, diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp index 781627c3c425..50f416b23db2 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp @@ -64,8 +64,11 @@ namespace { } // end anonymous namespace void UnwindOpcodeAssembler::EmitRegSave(uint32_t RegSave) { - if (RegSave == 0u) + if (RegSave == 0u) { + // That's the special case for RA PAC. + EmitInt8(ARM::EHABI::UNWIND_OPCODE_POP_RA_AUTH_CODE); return; + } // One byte opcode to save register r14 and r11-r4 if (RegSave & (1u << 4)) { diff --git a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp index dc58b5427425..7e31ea77f4f5 100644 --- a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp +++ b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp @@ -366,7 +366,7 @@ bool MVETPAndVPTOptimisations::MergeLoopEnd(MachineLoop *ML) { while (!Worklist.empty()) { Register Reg = Worklist.pop_back_val(); for (MachineInstr &MI : MRI->use_nodbg_instructions(Reg)) { - if (count(ExpectedUsers, &MI)) + if (llvm::is_contained(ExpectedUsers, &MI)) continue; if (MI.getOpcode() != TargetOpcode::COPY || !MI.getOperand(0).getReg().isVirtual()) { diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp index 6a5bc9284266..0e6960bce32b 100644 --- a/llvm/lib/Target/ARM/MVETailPredication.cpp +++ b/llvm/lib/Target/ARM/MVETailPredication.cpp @@ -213,7 +213,8 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst 
*ActiveLaneMask, auto *TC = SE->getSCEV(TripCount); int VectorWidth = cast<FixedVectorType>(ActiveLaneMask->getType())->getNumElements(); - if (VectorWidth != 4 && VectorWidth != 8 && VectorWidth != 16) + if (VectorWidth != 2 && VectorWidth != 4 && VectorWidth != 8 && + VectorWidth != 16) return false; ConstantInt *ConstElemCount = nullptr; @@ -371,15 +372,10 @@ void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, switch (VectorWidth) { default: llvm_unreachable("unexpected number of lanes"); + case 2: VCTPID = Intrinsic::arm_mve_vctp64; break; case 4: VCTPID = Intrinsic::arm_mve_vctp32; break; case 8: VCTPID = Intrinsic::arm_mve_vctp16; break; case 16: VCTPID = Intrinsic::arm_mve_vctp8; break; - - // FIXME: vctp64 currently not supported because the predicate - // vector wants to be <2 x i1>, but v2i1 is not a legal MVE - // type, so problems happen at isel time. - // Intrinsic::arm_mve_vctp64 exists for ACLE intrinsics - // purposes, but takes a v4i1 instead of a v2i1. } Function *VCTP = Intrinsic::getDeclaration(M, VCTPID); Value *VCTPCall = Builder.CreateCall(VCTP, Processed); diff --git a/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp b/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp index 224c61b9f065..54e80a095dd4 100644 --- a/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -824,8 +824,8 @@ bool Thumb1FrameLowering::spillCalleeSavedRegisters( ARMRegSet CopyRegs; // Registers which can be used after pushing // LoRegs for saving HiRegs. 
- for (unsigned i = CSI.size(); i != 0; --i) { - unsigned Reg = CSI[i-1].getReg(); + for (const CalleeSavedInfo &I : llvm::reverse(CSI)) { + unsigned Reg = I.getReg(); if (ARM::tGPRRegClass.contains(Reg) || Reg == ARM::LR) { LoRegsToSave[Reg] = true; @@ -1021,8 +1021,7 @@ bool Thumb1FrameLowering::restoreCalleeSavedRegisters( BuildMI(MF, DL, TII.get(ARM::tPOP)).add(predOps(ARMCC::AL)); bool NeedsPop = false; - for (unsigned i = CSI.size(); i != 0; --i) { - CalleeSavedInfo &Info = CSI[i-1]; + for (CalleeSavedInfo &Info : llvm::reverse(CSI)) { unsigned Reg = Info.getReg(); // High registers (excluding lr) have already been dealt with @@ -1067,7 +1066,7 @@ bool Thumb1FrameLowering::restoreCalleeSavedRegisters( if (NeedsPop) MBB.insert(MI, &*MIB); else - MF.DeleteMachineInstr(MIB); + MF.deleteMachineInstr(MIB); return true; } diff --git a/llvm/lib/Target/AVR/AVRFrameLowering.cpp b/llvm/lib/Target/AVR/AVRFrameLowering.cpp index 672611ea2234..543d94875037 100644 --- a/llvm/lib/Target/AVR/AVRFrameLowering.cpp +++ b/llvm/lib/Target/AVR/AVRFrameLowering.cpp @@ -247,8 +247,8 @@ bool AVRFrameLowering::spillCalleeSavedRegisters( const TargetInstrInfo &TII = *STI.getInstrInfo(); AVRMachineFunctionInfo *AVRFI = MF.getInfo<AVRMachineFunctionInfo>(); - for (unsigned i = CSI.size(); i != 0; --i) { - unsigned Reg = CSI[i - 1].getReg(); + for (const CalleeSavedInfo &I : llvm::reverse(CSI)) { + unsigned Reg = I.getReg(); bool IsNotLiveIn = !MBB.isLiveIn(Reg); assert(TRI->getRegSizeInBits(*TRI->getMinimalPhysRegClass(Reg)) == 8 && diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.cpp b/llvm/lib/Target/AVR/AVRInstrInfo.cpp index 798d08393eae..51060018a5ca 100644 --- a/llvm/lib/Target/AVR/AVRInstrInfo.cpp +++ b/llvm/lib/Target/AVR/AVRInstrInfo.cpp @@ -571,8 +571,6 @@ void AVRInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, // See lib/CodeGen/RegisterRelaxation.cpp for details. // We end up here when a jump is too long for a RJMP instruction. 
BuildMI(&MBB, DL, get(AVR::JMPk)).addMBB(&NewDestBB); - - return; } } // end of namespace llvm diff --git a/llvm/lib/Target/BPF/BPFPreserveDIType.cpp b/llvm/lib/Target/BPF/BPFPreserveDIType.cpp index 0348e2200acb..36237b2fc4fd 100644 --- a/llvm/lib/Target/BPF/BPFPreserveDIType.cpp +++ b/llvm/lib/Target/BPF/BPFPreserveDIType.cpp @@ -93,8 +93,13 @@ static bool BPFPreserveDITypeImpl(Function &F) { Ty = DTy->getBaseType(); } - if (Ty->getName().empty()) - report_fatal_error("Empty type name for BTF_TYPE_ID_REMOTE reloc"); + if (Ty->getName().empty()) { + if (isa<DISubroutineType>(Ty)) + report_fatal_error( + "SubroutineType not supported for BTF_TYPE_ID_REMOTE reloc"); + else + report_fatal_error("Empty type name for BTF_TYPE_ID_REMOTE reloc"); + } MD = Ty; } diff --git a/llvm/lib/Target/CSKY/AsmParser/CSKYAsmParser.cpp b/llvm/lib/Target/CSKY/AsmParser/CSKYAsmParser.cpp index ebc04b40d428..29b99a84a6cd 100644 --- a/llvm/lib/Target/CSKY/AsmParser/CSKYAsmParser.cpp +++ b/llvm/lib/Target/CSKY/AsmParser/CSKYAsmParser.cpp @@ -11,6 +11,7 @@ #include "MCTargetDesc/CSKYMCTargetDesc.h" #include "TargetInfo/CSKYTargetInfo.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/CodeGen/Register.h" #include "llvm/MC/MCContext.h" @@ -25,11 +26,24 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +using namespace llvm; + #define DEBUG_TYPE "csky-asm-parser" -using namespace llvm; +// Include the auto-generated portion of the compress emitter. 
+#define GEN_COMPRESS_INSTR +#include "CSKYGenCompressInstEmitter.inc" + +STATISTIC(CSKYNumInstrsCompressed, + "Number of C-SKY Compressed instructions emitted"); + +static cl::opt<bool> + EnableCompressedInst("enable-csky-asm-compressed-inst", cl::Hidden, + cl::init(false), + cl::desc("Enable C-SKY asm compressed instruction")); namespace { struct CSKYOperand; @@ -55,6 +69,10 @@ class CSKYAsmParser : public MCTargetAsmParser { bool ParseDirective(AsmToken DirectiveID) override; + // Helper to actually emit an instruction to the MCStreamer. Also, when + // possible, compression of the instruction is performed. + void emitToStreamer(MCStreamer &S, const MCInst &Inst); + OperandMatchResultTy tryParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; @@ -264,12 +282,6 @@ public: bool isConstpool() const { return isConstPoolOp(); } bool isDataSymbol() const { return isConstPoolOp(); } - bool isSPOperand() const { - if (!isReg()) - return false; - return getReg() == CSKY::R14; - } - bool isPSRFlag() const { int64_t Imm; // Must be of 'immediate' type and a constant. 
@@ -755,10 +767,6 @@ bool CSKYAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, SMLoc ErrorLoc = ((CSKYOperand &)*Operands[ErrorInfo]).getStartLoc(); return Error(ErrorLoc, "register is out of range"); } - case Match_InvalidSPOperand: { - SMLoc ErrorLoc = ((CSKYOperand &)*Operands[ErrorInfo]).getStartLoc(); - return Error(ErrorLoc, "operand must be sp register"); - } case Match_RequiresSameSrcAndDst: { SMLoc ErrorLoc = ((CSKYOperand &)*Operands[ErrorInfo]).getStartLoc(); return Error(ErrorLoc, "src and dst operand must be same"); @@ -776,27 +784,62 @@ bool CSKYAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, OperandVector &Operands, MCStreamer &Out) { - if (Inst.getOpcode() == CSKY::LDQ32 || Inst.getOpcode() == CSKY::STQ32) { + switch (Inst.getOpcode()) { + default: + break; + case CSKY::LDQ32: + case CSKY::STQ32: if (Inst.getOperand(1).getReg() != CSKY::R4 || Inst.getOperand(2).getReg() != CSKY::R7) { return Error(IDLoc, "Register sequence is not valid. 'r4-r7' expected"); } Inst.setOpcode(Inst.getOpcode() == CSKY::LDQ32 ? 
CSKY::LDM32 : CSKY::STM32); - Out.emitInstruction(Inst, getSTI()); - return false; - } else if (Inst.getOpcode() == CSKY::SEXT32 || - Inst.getOpcode() == CSKY::ZEXT32) { + break; + case CSKY::SEXT32: + case CSKY::ZEXT32: if (Inst.getOperand(2).getImm() < Inst.getOperand(3).getImm()) return Error(IDLoc, "msb must be greater or equal to lsb"); - } else if (Inst.getOpcode() == CSKY::INS32) { + break; + case CSKY::INS32: if (Inst.getOperand(3).getImm() < Inst.getOperand(4).getImm()) return Error(IDLoc, "msb must be greater or equal to lsb"); - } else if (Inst.getOpcode() == CSKY::IDLY32) { + break; + case CSKY::IDLY32: if (Inst.getOperand(0).getImm() > 32 || Inst.getOperand(0).getImm() < 0) return Error(IDLoc, "n must be in range [0,32]"); + break; + case CSKY::ADDC32: + case CSKY::SUBC32: + case CSKY::ADDC16: + case CSKY::SUBC16: + Inst.erase(std::next(Inst.begin())); + Inst.erase(std::prev(Inst.end())); + Inst.insert(std::next(Inst.begin()), MCOperand::createReg(CSKY::C)); + Inst.insert(Inst.end(), MCOperand::createReg(CSKY::C)); + break; + case CSKY::CMPNEI32: + case CSKY::CMPNEI16: + case CSKY::CMPNE32: + case CSKY::CMPNE16: + case CSKY::CMPHSI32: + case CSKY::CMPHSI16: + case CSKY::CMPHS32: + case CSKY::CMPHS16: + case CSKY::CMPLTI32: + case CSKY::CMPLTI16: + case CSKY::CMPLT32: + case CSKY::CMPLT16: + case CSKY::BTSTI32: + Inst.erase(Inst.begin()); + Inst.insert(Inst.begin(), MCOperand::createReg(CSKY::C)); + break; + case CSKY::MVCV32: + Inst.erase(std::next(Inst.begin())); + Inst.insert(Inst.end(), MCOperand::createReg(CSKY::C)); + break; } - Out.emitInstruction(Inst, getSTI()); + emitToStreamer(Out, Inst); return false; } @@ -1422,6 +1465,16 @@ OperandMatchResultTy CSKYAsmParser::tryParseRegister(unsigned &RegNo, bool CSKYAsmParser::ParseDirective(AsmToken DirectiveID) { return true; } +void CSKYAsmParser::emitToStreamer(MCStreamer &S, const MCInst &Inst) { + MCInst CInst; + bool Res = false; + if (EnableCompressedInst) + Res = compressInst(CInst, Inst, 
getSTI(), S.getContext()); + if (Res) + ++CSKYNumInstrsCompressed; + S.emitInstruction((Res ? CInst : Inst), getSTI()); +} + extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeCSKYAsmParser() { RegisterMCAsmParser<CSKYAsmParser> X(getTheCSKYTarget()); } diff --git a/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp b/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp index 1c38c5d1fde6..85129f78e726 100644 --- a/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp +++ b/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp @@ -30,6 +30,9 @@ using namespace llvm; #define DEBUG_TYPE "csky-asm-printer" +STATISTIC(CSKYNumInstrsCompressed, + "Number of C-SKY Compressed instructions emitted"); + CSKYAsmPrinter::CSKYAsmPrinter(llvm::TargetMachine &TM, std::unique_ptr<llvm::MCStreamer> Streamer) : AsmPrinter(TM, std::move(Streamer)), MCInstLowering(OutContext, *this) {} @@ -39,6 +42,16 @@ bool CSKYAsmPrinter::runOnMachineFunction(MachineFunction &MF) { return AsmPrinter::runOnMachineFunction(MF); } +#define GEN_COMPRESS_INSTR +#include "CSKYGenCompressInstEmitter.inc" +void CSKYAsmPrinter::EmitToStreamer(MCStreamer &S, const MCInst &Inst) { + MCInst CInst; + bool Res = compressInst(CInst, Inst, *Subtarget, OutStreamer->getContext()); + if (Res) + ++CSKYNumInstrsCompressed; + AsmPrinter::EmitToStreamer(*OutStreamer, Res ? CInst : Inst); +} + // Simple pseudo-instructions have their lowering (with expansion to real // instructions) auto-generated. #include "CSKYGenMCPseudoLowering.inc" diff --git a/llvm/lib/Target/CSKY/CSKYAsmPrinter.h b/llvm/lib/Target/CSKY/CSKYAsmPrinter.h index f0f5d8657c04..b30311e0ca64 100644 --- a/llvm/lib/Target/CSKY/CSKYAsmPrinter.h +++ b/llvm/lib/Target/CSKY/CSKYAsmPrinter.h @@ -26,6 +26,8 @@ public: StringRef getPassName() const override { return "CSKY Assembly Printer"; } + void EmitToStreamer(MCStreamer &S, const MCInst &Inst); + /// tblgen'erated driver function for lowering simple MI->MC /// pseudo instructions. 
bool emitPseudoExpansionLowering(MCStreamer &OutStreamer, diff --git a/llvm/lib/Target/CSKY/CSKYCallingConv.td b/llvm/lib/Target/CSKY/CSKYCallingConv.td index 87e2e6b9dc31..91102e3714df 100644 --- a/llvm/lib/Target/CSKY/CSKYCallingConv.td +++ b/llvm/lib/Target/CSKY/CSKYCallingConv.td @@ -79,4 +79,4 @@ def RetCC_CSKY_ABIV2_FP : CallingConv<[ CCIfType<[i32], CCAssignToReg<[R0, R1]>>, CCIfType<[f32], CCAssignToReg<[F0_32]>>, CCIfType<[f64], CCAssignToReg<[F0_64]>> -]>;
\ No newline at end of file +]>; diff --git a/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp b/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp index 9b22c95cfe21..3a8ee5713584 100644 --- a/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp +++ b/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp @@ -54,4 +54,4 @@ void CSKYFrameLowering::emitPrologue(MachineFunction &MF, void CSKYFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { // FIXME: Implement this when we have function calls -}
\ No newline at end of file +} diff --git a/llvm/lib/Target/CSKY/CSKYISelDAGToDAG.cpp b/llvm/lib/Target/CSKY/CSKYISelDAGToDAG.cpp index fc9ef8bfd9d9..8dc91904b8cc 100644 --- a/llvm/lib/Target/CSKY/CSKYISelDAGToDAG.cpp +++ b/llvm/lib/Target/CSKY/CSKYISelDAGToDAG.cpp @@ -40,6 +40,8 @@ public: } void Select(SDNode *N) override; + bool selectAddCarry(SDNode *N); + bool selectSubCarry(SDNode *N); #include "CSKYGenDAGISel.inc" }; @@ -60,7 +62,12 @@ void CSKYDAGToDAGISel::Select(SDNode *N) { switch (Opcode) { default: break; - // FIXME: Add selection nodes needed later. + case ISD::ADDCARRY: + IsSelected = selectAddCarry(N); + break; + case ISD::SUBCARRY: + IsSelected = selectSubCarry(N); + break; } if (IsSelected) @@ -70,6 +77,86 @@ void CSKYDAGToDAGISel::Select(SDNode *N) { SelectCode(N); } +bool CSKYDAGToDAGISel::selectAddCarry(SDNode *N) { + MachineSDNode *NewNode = nullptr; + auto Type0 = N->getValueType(0); + auto Type1 = N->getValueType(1); + auto Op0 = N->getOperand(0); + auto Op1 = N->getOperand(1); + auto Op2 = N->getOperand(2); + + SDLoc Dl(N); + + if (isNullConstant(Op2)) { + auto *CA = CurDAG->getMachineNode( + Subtarget->has2E3() ? CSKY::CLRC32 : CSKY::CLRC16, Dl, Type1); + NewNode = CurDAG->getMachineNode( + Subtarget->has2E3() ? CSKY::ADDC32 : CSKY::ADDC16, Dl, {Type0, Type1}, + {Op0, Op1, SDValue(CA, 0)}); + } else if (isOneConstant(Op2)) { + auto *CA = CurDAG->getMachineNode( + Subtarget->has2E3() ? CSKY::SETC32 : CSKY::SETC16, Dl, Type1); + NewNode = CurDAG->getMachineNode( + Subtarget->has2E3() ? CSKY::ADDC32 : CSKY::ADDC16, Dl, {Type0, Type1}, + {Op0, Op1, SDValue(CA, 0)}); + } else { + NewNode = CurDAG->getMachineNode(Subtarget->has2E3() ? CSKY::ADDC32 + : CSKY::ADDC16, + Dl, {Type0, Type1}, {Op0, Op1, Op2}); + } + ReplaceNode(N, NewNode); + return true; +} + +static SDValue InvertCarryFlag(const CSKYSubtarget *Subtarget, + SelectionDAG *DAG, SDLoc Dl, SDValue OldCarry) { + auto NewCarryReg = + DAG->getMachineNode(Subtarget->has2E3() ? 
CSKY::MVCV32 : CSKY::MVCV16, Dl, + MVT::i32, OldCarry); + auto NewCarry = + DAG->getMachineNode(Subtarget->hasE2() ? CSKY::BTSTI32 : CSKY::BTSTI16, + Dl, OldCarry.getValueType(), SDValue(NewCarryReg, 0), + DAG->getTargetConstant(0, Dl, MVT::i32)); + return SDValue(NewCarry, 0); +} + +bool CSKYDAGToDAGISel::selectSubCarry(SDNode *N) { + MachineSDNode *NewNode = nullptr; + auto Type0 = N->getValueType(0); + auto Type1 = N->getValueType(1); + auto Op0 = N->getOperand(0); + auto Op1 = N->getOperand(1); + auto Op2 = N->getOperand(2); + + SDLoc Dl(N); + + if (isNullConstant(Op2)) { + auto *CA = CurDAG->getMachineNode( + Subtarget->has2E3() ? CSKY::SETC32 : CSKY::SETC16, Dl, Type1); + NewNode = CurDAG->getMachineNode( + Subtarget->has2E3() ? CSKY::SUBC32 : CSKY::SUBC16, Dl, {Type0, Type1}, + {Op0, Op1, SDValue(CA, 0)}); + } else if (isOneConstant(Op2)) { + auto *CA = CurDAG->getMachineNode( + Subtarget->has2E3() ? CSKY::CLRC32 : CSKY::CLRC16, Dl, Type1); + NewNode = CurDAG->getMachineNode( + Subtarget->has2E3() ? CSKY::SUBC32 : CSKY::SUBC16, Dl, {Type0, Type1}, + {Op0, Op1, SDValue(CA, 0)}); + } else { + auto CarryIn = InvertCarryFlag(Subtarget, CurDAG, Dl, Op2); + NewNode = CurDAG->getMachineNode(Subtarget->has2E3() ? 
CSKY::SUBC32 + : CSKY::SUBC16, + Dl, {Type0, Type1}, {Op0, Op1, CarryIn}); + } + auto CarryOut = InvertCarryFlag(Subtarget, CurDAG, Dl, SDValue(NewNode, 1)); + + ReplaceUses(SDValue(N, 0), SDValue(NewNode, 0)); + ReplaceUses(SDValue(N, 1), CarryOut); + CurDAG->RemoveDeadNode(N); + + return true; +} + FunctionPass *llvm::createCSKYISelDag(CSKYTargetMachine &TM) { return new CSKYDAGToDAGISel(TM); } diff --git a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp index ac6d069e592c..a1f7cc685d4c 100644 --- a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp +++ b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp @@ -37,6 +37,46 @@ CSKYTargetLowering::CSKYTargetLowering(const TargetMachine &TM, // Register Class addRegisterClass(MVT::i32, &CSKY::GPRRegClass); + setOperationAction(ISD::ADDCARRY, MVT::i32, Legal); + setOperationAction(ISD::SUBCARRY, MVT::i32, Legal); + setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); + + setOperationAction(ISD::SREM, MVT::i32, Expand); + setOperationAction(ISD::UREM, MVT::i32, Expand); + setOperationAction(ISD::UDIVREM, MVT::i32, Expand); + setOperationAction(ISD::SDIVREM, MVT::i32, Expand); + setOperationAction(ISD::CTTZ, MVT::i32, Expand); + setOperationAction(ISD::CTPOP, MVT::i32, Expand); + setOperationAction(ISD::ROTR, MVT::i32, Expand); + setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand); + setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand); + setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand); + setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); + setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); + setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); + setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); + setOperationAction(ISD::MULHS, MVT::i32, Expand); + setOperationAction(ISD::MULHU, MVT::i32, Expand); + + setLoadExtAction(ISD::EXTLOAD, MVT::i32, MVT::i1, Promote); + setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MVT::i1, 
Promote); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, MVT::i1, Promote); + + if (!Subtarget.hasE2()) { + setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MVT::i8, Expand); + setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MVT::i16, Expand); + setOperationAction(ISD::CTLZ, MVT::i32, Expand); + setOperationAction(ISD::BSWAP, MVT::i32, Expand); + } + + if (!Subtarget.has2E3()) { + setOperationAction(ISD::ABS, MVT::i32, Expand); + setOperationAction(ISD::BITREVERSE, MVT::i32, Expand); + setOperationAction(ISD::SDIV, MVT::i32, Expand); + setOperationAction(ISD::UDIV, MVT::i32, Expand); + } + // Compute derived properties from the register classes. computeRegisterProperties(STI.getRegisterInfo()); diff --git a/llvm/lib/Target/CSKY/CSKYInstrFormats16Instr.td b/llvm/lib/Target/CSKY/CSKYInstrFormats16Instr.td index 6d42bddcdd78..ea0761d97545 100644 --- a/llvm/lib/Target/CSKY/CSKYInstrFormats16Instr.td +++ b/llvm/lib/Target/CSKY/CSKYInstrFormats16Instr.td @@ -88,6 +88,19 @@ class R16_XZ_UNOP<bits<4> op, bits<2> sop, string opstr> : CSKY16Inst< let Inst{1, 0} = sop; } +class R16_Z_UNOP<bits<4> op, bits<2> sop, string opstr> : CSKY16Inst< + AddrModeNone, (outs sGPR:$rz), (ins sGPR:$rx), !strconcat(opstr, "\t$rz"), + []> { + bits<4> rz; + bits<4> rx; + let Inst{15, 14} = 0b01; + let Inst{13 - 10} = op; + let Inst{9 - 6} = rz; + let Inst{5 - 2} = rx; + let Inst{1, 0} = sop; + let Constraints = "$rz = $rx"; +} + class R16_XY_CMP<bits<2> sop, string opstr> : CSKY16Inst< AddrModeNone, (outs CARRY:$ca), (ins sGPR:$rx, sGPR:$ry), !strconcat(opstr, "\t$rx, $ry"), []> { @@ -146,7 +159,7 @@ class I16_X_CMP<bits<3> sop, string opstr, Operand Immoperand> : CSKY16Inst< } class I16_SP_IMM7<bits<3> sop, string opstr> : CSKY16Inst< - AddrModeNone, (outs SPOp:$sp2), (ins SPOp:$sp1, uimm7_2:$imm7), + AddrModeNone, (outs GPRSP:$sp2), (ins GPRSP:$sp1, uimm7_2:$imm7), !strconcat(opstr, "\t$sp2, $sp1, $imm7"), []> { bits<7> imm7; let Inst{15, 14} = 0b00; diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp 
b/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp index e12235cf9478..6fcb136cd99b 100644 --- a/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp +++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp @@ -11,6 +11,8 @@ //===----------------------------------------------------------------------===// #include "CSKYInstrInfo.h" +#include "CSKYMachineFunctionInfo.h" +#include "CSKYTargetMachine.h" #include "llvm/MC/MCContext.h" #define DEBUG_TYPE "csky-instr-info" @@ -23,3 +25,289 @@ using namespace llvm; CSKYInstrInfo::CSKYInstrInfo(CSKYSubtarget &STI) : CSKYGenInstrInfo(CSKY::ADJCALLSTACKDOWN, CSKY::ADJCALLSTACKUP), STI(STI) { } + +Register CSKYInstrInfo::movImm(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, int64_t Val, + MachineInstr::MIFlag Flag) const { + assert(isUInt<32>(Val) && "should be uint32"); + + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + Register DstReg; + if (STI.hasE2()) { + DstReg = MRI.createVirtualRegister(&CSKY::GPRRegClass); + + if (isUInt<16>(Val)) { + BuildMI(MBB, MBBI, DL, get(CSKY::MOVI32), DstReg) + .addImm(Val & 0xFFFF) + .setMIFlags(Flag); + } else if (isShiftedUInt<16, 16>(Val)) { + BuildMI(MBB, MBBI, DL, get(CSKY::MOVIH32), DstReg) + .addImm((Val >> 16) & 0xFFFF) + .setMIFlags(Flag); + } else { + BuildMI(MBB, MBBI, DL, get(CSKY::MOVIH32), DstReg) + .addImm((Val >> 16) & 0xFFFF) + .setMIFlags(Flag); + BuildMI(MBB, MBBI, DL, get(CSKY::ORI32), DstReg) + .addReg(DstReg) + .addImm(Val & 0xFFFF) + .setMIFlags(Flag); + } + + } else { + DstReg = MRI.createVirtualRegister(&CSKY::mGPRRegClass); + if (isUInt<8>(Val)) { + BuildMI(MBB, MBBI, DL, get(CSKY::MOVI16), DstReg) + .addImm(Val & 0xFF) + .setMIFlags(Flag); + } else if (isUInt<16>(Val)) { + BuildMI(MBB, MBBI, DL, get(CSKY::MOVI16), DstReg) + .addImm((Val >> 8) & 0xFF) + .setMIFlags(Flag); + BuildMI(MBB, MBBI, DL, get(CSKY::LSLI16), DstReg) + .addReg(DstReg) + .addImm(8) + .setMIFlags(Flag); + if ((Val & 0xFF) != 0) + BuildMI(MBB, MBBI, DL, get(CSKY::ADDI16), DstReg) + 
.addReg(DstReg) + .addImm(Val & 0xFF) + .setMIFlags(Flag); + } else if (isUInt<24>(Val)) { + BuildMI(MBB, MBBI, DL, get(CSKY::MOVI16), DstReg) + .addImm((Val >> 16) & 0xFF) + .setMIFlags(Flag); + BuildMI(MBB, MBBI, DL, get(CSKY::LSLI16), DstReg) + .addReg(DstReg) + .addImm(8) + .setMIFlags(Flag); + if (((Val >> 8) & 0xFF) != 0) + BuildMI(MBB, MBBI, DL, get(CSKY::ADDI16), DstReg) + .addReg(DstReg) + .addImm((Val >> 8) & 0xFF) + .setMIFlags(Flag); + BuildMI(MBB, MBBI, DL, get(CSKY::LSLI16), DstReg) + .addReg(DstReg) + .addImm(8) + .setMIFlags(Flag); + if ((Val & 0xFF) != 0) + BuildMI(MBB, MBBI, DL, get(CSKY::ADDI16), DstReg) + .addReg(DstReg) + .addImm(Val & 0xFF) + .setMIFlags(Flag); + } else { + BuildMI(MBB, MBBI, DL, get(CSKY::MOVI16), DstReg) + .addImm((Val >> 24) & 0xFF) + .setMIFlags(Flag); + BuildMI(MBB, MBBI, DL, get(CSKY::LSLI16), DstReg) + .addReg(DstReg) + .addImm(8) + .setMIFlags(Flag); + if (((Val >> 16) & 0xFF) != 0) + BuildMI(MBB, MBBI, DL, get(CSKY::ADDI16), DstReg) + .addReg(DstReg) + .addImm((Val >> 16) & 0xFF) + .setMIFlags(Flag); + BuildMI(MBB, MBBI, DL, get(CSKY::LSLI16), DstReg) + .addReg(DstReg) + .addImm(8) + .setMIFlags(Flag); + if (((Val >> 8) & 0xFF) != 0) + BuildMI(MBB, MBBI, DL, get(CSKY::ADDI16), DstReg) + .addReg(DstReg) + .addImm((Val >> 8) & 0xFF) + .setMIFlags(Flag); + BuildMI(MBB, MBBI, DL, get(CSKY::LSLI16), DstReg) + .addReg(DstReg) + .addImm(8) + .setMIFlags(Flag); + if ((Val & 0xFF) != 0) + BuildMI(MBB, MBBI, DL, get(CSKY::ADDI16), DstReg) + .addReg(DstReg) + .addImm(Val & 0xFF) + .setMIFlags(Flag); + } + } + + return DstReg; +} + +unsigned CSKYInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, + int &FrameIndex) const { + switch (MI.getOpcode()) { + default: + return 0; + case CSKY::LD16B: + case CSKY::LD16H: + case CSKY::LD16W: + case CSKY::LD32B: + case CSKY::LD32BS: + case CSKY::LD32H: + case CSKY::LD32HS: + case CSKY::LD32W: + case CSKY::RESTORE_CARRY: + break; + } + + if (MI.getOperand(1).isFI() && 
MI.getOperand(2).isImm() && + MI.getOperand(2).getImm() == 0) { + FrameIndex = MI.getOperand(1).getIndex(); + return MI.getOperand(0).getReg(); + } + + return 0; +} + +unsigned CSKYInstrInfo::isStoreToStackSlot(const MachineInstr &MI, + int &FrameIndex) const { + switch (MI.getOpcode()) { + default: + return 0; + case CSKY::ST16B: + case CSKY::ST16H: + case CSKY::ST16W: + case CSKY::ST32B: + case CSKY::ST32H: + case CSKY::ST32W: + case CSKY::SPILL_CARRY: + break; + } + + if (MI.getOperand(1).isFI() && MI.getOperand(2).isImm() && + MI.getOperand(2).getImm() == 0) { + FrameIndex = MI.getOperand(1).getIndex(); + return MI.getOperand(0).getReg(); + } + + return 0; +} + +void CSKYInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + Register SrcReg, bool IsKill, int FI, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + DebugLoc DL; + if (I != MBB.end()) + DL = I->getDebugLoc(); + + MachineFunction &MF = *MBB.getParent(); + CSKYMachineFunctionInfo *CFI = MF.getInfo<CSKYMachineFunctionInfo>(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + + unsigned Opcode = 0; + + if (CSKY::GPRRegClass.hasSubClassEq(RC)) { + Opcode = CSKY::ST32W; // Optimize for 16bit + } else if (CSKY::CARRYRegClass.hasSubClassEq(RC)) { + Opcode = CSKY::SPILL_CARRY; + CFI->setSpillsCR(); + } else { + llvm_unreachable("Unknown RegisterClass"); + } + + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore, + MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); + + BuildMI(MBB, I, DL, get(Opcode)) + .addReg(SrcReg, getKillRegState(IsKill)) + .addFrameIndex(FI) + .addImm(0) + .addMemOperand(MMO); +} + +void CSKYInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + Register DestReg, int FI, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + DebugLoc DL; + if (I != MBB.end()) + DL = I->getDebugLoc(); + + MachineFunction &MF 
= *MBB.getParent(); + CSKYMachineFunctionInfo *CFI = MF.getInfo<CSKYMachineFunctionInfo>(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + + unsigned Opcode = 0; + + if (CSKY::GPRRegClass.hasSubClassEq(RC)) { + Opcode = CSKY::LD32W; + } else if (CSKY::CARRYRegClass.hasSubClassEq(RC)) { + Opcode = CSKY::RESTORE_CARRY; + CFI->setSpillsCR(); + } else { + llvm_unreachable("Unknown RegisterClass"); + } + + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad, + MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); + + BuildMI(MBB, I, DL, get(Opcode), DestReg) + .addFrameIndex(FI) + .addImm(0) + .addMemOperand(MMO); +} + +void CSKYInstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const DebugLoc &DL, MCRegister DestReg, + MCRegister SrcReg, bool KillSrc) const { + + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + if (CSKY::GPRRegClass.contains(SrcReg) && + CSKY::CARRYRegClass.contains(DestReg)) { + if (STI.hasE2()) { + BuildMI(MBB, I, DL, get(CSKY::BTSTI32), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)) + .addImm(0); + } else { + assert(SrcReg < CSKY::R8); + BuildMI(MBB, I, DL, get(CSKY::BTSTI16), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)) + .addImm(0); + } + return; + } + + if (CSKY::CARRYRegClass.contains(SrcReg) && + CSKY::GPRRegClass.contains(DestReg)) { + + if (STI.hasE2()) { + BuildMI(MBB, I, DL, get(CSKY::MVC32), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } else { + assert(DestReg < CSKY::R16); + assert(DestReg < CSKY::R8); + BuildMI(MBB, I, DL, get(CSKY::MOVI16), DestReg).addImm(0); + BuildMI(MBB, I, DL, get(CSKY::ADDC16)) + .addReg(DestReg, RegState::Define) + .addReg(SrcReg, RegState::Define) + .addReg(DestReg, getKillRegState(true)) + .addReg(DestReg, getKillRegState(true)) + .addReg(SrcReg, getKillRegState(true)); + BuildMI(MBB, I, DL, get(CSKY::BTSTI16)) + .addReg(SrcReg, RegState::Define | getDeadRegState(KillSrc)) + 
.addReg(DestReg) + .addImm(0); + } + return; + } + + unsigned Opcode = 0; + if (CSKY::GPRRegClass.contains(DestReg, SrcReg)) + Opcode = CSKY::MOV32; + else { + LLVM_DEBUG(dbgs() << "src = " << SrcReg << ", dst = " << DestReg); + LLVM_DEBUG(I->dump()); + llvm_unreachable("Unknown RegisterClass"); + } + + BuildMI(MBB, I, DL, get(Opcode), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); +} diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.h b/llvm/lib/Target/CSKY/CSKYInstrInfo.h index 04be9da27b57..450641d96b74 100644 --- a/llvm/lib/Target/CSKY/CSKYInstrInfo.h +++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.h @@ -29,6 +29,31 @@ protected: public: explicit CSKYInstrInfo(CSKYSubtarget &STI); + + unsigned isLoadFromStackSlot(const MachineInstr &MI, + int &FrameIndex) const override; + unsigned isStoreToStackSlot(const MachineInstr &MI, + int &FrameIndex) const override; + + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, Register SrcReg, + bool IsKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, Register DestReg, + int FrameIndex, const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + + void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, + bool KillSrc) const override; + + // Materializes the given integer Val into DstReg. 
+ Register movImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, int64_t Val, + MachineInstr::MIFlag Flag = MachineInstr::NoFlags) const; }; } // namespace llvm diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.td b/llvm/lib/Target/CSKY/CSKYInstrInfo.td index 9dda3159e446..30d9206eec68 100644 --- a/llvm/lib/Target/CSKY/CSKYInstrInfo.td +++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.td @@ -52,6 +52,11 @@ class OImmAsmOperand<int width, string suffix = ""> : ImmAsmOperand<"O", width, suffix> { } +def to_tframeindex : SDNodeXForm<frameindex, [{ + auto FI = cast<FrameIndexSDNode>(N); + return CurDAG->getTargetFrameIndex(FI->getIndex(), TLI->getPointerTy(CurDAG->getDataLayout())); +}]>; + class oimm<int num> : Operand<i32>, ImmLeaf<i32, "return isUInt<"#num#">(Imm - 1);"> { let EncoderMethod = "getOImmOpValue"; @@ -166,9 +171,23 @@ def bare_symbol : Operand<iPTR> { let OperandType = "OPERAND_PCREL"; } -def oimm3 : oimm<3>; +def oimm3 : oimm<3> { + let MCOperandPredicate = [{ + int64_t Imm; + if (MCOp.evaluateAsConstantImm(Imm)) + return isUInt<3>(Imm - 1); + return MCOp.isBareSymbolRef(); + }]; +} def oimm4 : oimm<4>; -def oimm5 : oimm<5>; +def oimm5 : oimm<5> { + let MCOperandPredicate = [{ + int64_t Imm; + if (MCOp.evaluateAsConstantImm(Imm)) + return isUInt<5>(Imm - 1); + return MCOp.isBareSymbolRef(); + }]; +} def oimm6 : oimm<6>; def imm5_idly : Operand<i32>, ImmLeaf<i32, @@ -177,9 +196,30 @@ def imm5_idly : Operand<i32>, ImmLeaf<i32, let DecoderMethod = "decodeOImmOperand<5>"; } -def oimm8 : oimm<8>; -def oimm12 : oimm<12>; -def oimm16 : oimm<16>; +def oimm8 : oimm<8> { + let MCOperandPredicate = [{ + int64_t Imm; + if (MCOp.evaluateAsConstantImm(Imm)) + return isUInt<8>(Imm - 1); + return MCOp.isBareSymbolRef(); + }]; +} +def oimm12 : oimm<12> { + let MCOperandPredicate = [{ + int64_t Imm; + if (MCOp.evaluateAsConstantImm(Imm)) + return isUInt<12>(Imm - 1); + return MCOp.isBareSymbolRef(); + }]; +} +def oimm16 : oimm<16> { + let 
MCOperandPredicate = [{ + int64_t Imm; + if (MCOp.evaluateAsConstantImm(Imm)) + return isUInt<16>(Imm - 1); + return MCOp.isBareSymbolRef(); + }]; +} def nimm12 : nimm<12>; @@ -195,28 +235,98 @@ def uimm2_jmpix : Operand<i32>, def uimm3 : uimm<3>; def uimm4 : uimm<4>; -def uimm5 : uimm<5>; +def uimm5 : uimm<5> { + let MCOperandPredicate = [{ + int64_t Imm; + if (MCOp.evaluateAsConstantImm(Imm)) + return isShiftedUInt<5, 0>(Imm); + return MCOp.isBareSymbolRef(); + }]; +} def uimm5_msb_size : uimm<5> { let EncoderMethod = "getImmOpValueMSBSize"; } -def uimm5_1 : uimm<5, 1>; -def uimm5_2 : uimm<5, 2>; +def uimm5_1 : uimm<5, 1> { + let MCOperandPredicate = [{ + int64_t Imm; + if (MCOp.evaluateAsConstantImm(Imm)) + return isShiftedUInt<5, 1>(Imm); + return MCOp.isBareSymbolRef(); + }]; +} +def uimm5_2 : uimm<5, 2> { + let MCOperandPredicate = [{ + int64_t Imm; + if (MCOp.evaluateAsConstantImm(Imm)) + return isShiftedUInt<5, 2>(Imm); + return MCOp.isBareSymbolRef(); + }]; +} def uimm6 : uimm<6>; def uimm7 : uimm<7>; def uimm7_1 : uimm<7, 1>; -def uimm7_2 : uimm<7, 2>; +def uimm7_2 : uimm<7, 2>{ + let MCOperandPredicate = [{ + int64_t Imm; + if (MCOp.evaluateAsConstantImm(Imm)) + return isShiftedUInt<7, 2>(Imm); + return MCOp.isBareSymbolRef(); + }]; +} def uimm7_3 : uimm<7, 3>; -def uimm8 : uimm<8>; -def uimm8_2 : uimm<8, 2>; +def uimm8 : uimm<8> { + let MCOperandPredicate = [{ + int64_t Imm; + if (MCOp.evaluateAsConstantImm(Imm)) + return isShiftedUInt<8, 0>(Imm); + return MCOp.isBareSymbolRef(); + }]; +} +def uimm8_2 : uimm<8, 2> { + let MCOperandPredicate = [{ + int64_t Imm; + if (MCOp.evaluateAsConstantImm(Imm)) + return isShiftedUInt<8, 2>(Imm); + return MCOp.isBareSymbolRef(); + }]; +} def uimm8_3 : uimm<8, 3>; def uimm8_8 : uimm<8, 8>; def uimm8_16 : uimm<8, 16>; def uimm8_24 : uimm<8, 24>; -def uimm12 : uimm<12>; -def uimm12_1 : uimm<12, 1>; -def uimm12_2 : uimm<12, 2>; -def uimm16 : uimm<16>; +def uimm12 : uimm<12> { + let MCOperandPredicate = [{ + int64_t Imm; 
+ if (MCOp.evaluateAsConstantImm(Imm)) + return isShiftedUInt<12, 0>(Imm); + return MCOp.isBareSymbolRef(); + }]; +} +def uimm12_1 : uimm<12, 1> { + let MCOperandPredicate = [{ + int64_t Imm; + if (MCOp.evaluateAsConstantImm(Imm)) + return isShiftedUInt<12, 1>(Imm); + return MCOp.isBareSymbolRef(); + }]; +} +def uimm12_2 : uimm<12, 2> { + let MCOperandPredicate = [{ + int64_t Imm; + if (MCOp.evaluateAsConstantImm(Imm)) + return isShiftedUInt<12, 2>(Imm); + return MCOp.isBareSymbolRef(); + }]; +} +def uimm16 : uimm<16> { + let MCOperandPredicate = [{ + int64_t Imm; + if (MCOp.evaluateAsConstantImm(Imm)) + return isShiftedUInt<16, 0>(Imm); + return MCOp.isBareSymbolRef(); + }]; +} def uimm16_8 : uimm<16, 8>; def uimm16_16 : uimm<16, 16>; def uimm20 : uimm<20>; @@ -642,11 +752,6 @@ def BSR32_BR : J<0x38, (outs), (ins call_symbol:$offset), "bsr32", []>{ let Defs = [ R15 ]; } -let Predicates = [iHasE2], isCodeGenOnly = 1 in { - def RTS32 : I_16_RET<0x6, 0xF, "rts32", [(CSKY_RET)]>; -} - - //===----------------------------------------------------------------------===// // Symbol address instructions. //===----------------------------------------------------------------------===// @@ -872,6 +977,102 @@ def TRAP32 : CSKY32Inst<AddrModeNone, 0x30, (outs), (ins uimm2:$imm2), "trap32 $ } +//===----------------------------------------------------------------------===// +// Instruction Patterns. 
+//===----------------------------------------------------------------------===// + +// Load & Store Patterns +multiclass LdPat<PatFrag LoadOp, ImmLeaf imm_type, Instruction Inst, ValueType Type> { + def : Pat<(Type (LoadOp GPR:$rs1)), (Inst GPR:$rs1, 0)>; + def : Pat<(Type (LoadOp (i32 frameindex:$rs1))), (Inst (i32 (to_tframeindex tframeindex:$rs1)), 0)>; + def : Pat<(Type (LoadOp (add GPR:$rs1, imm_type:$uimm))), + (Inst GPR:$rs1, imm_type:$uimm)>; + def : Pat<(Type (LoadOp (add frameindex:$rs1, imm_type:$uimm))), + (Inst (i32 (to_tframeindex tframeindex:$rs1)), imm_type:$uimm)>; + def : Pat<(Type (LoadOp (eqToAdd frameindex:$rs1, imm_type:$uimm))), + (Inst (i32 (to_tframeindex tframeindex:$rs1)), imm_type:$uimm)>; + def : Pat<(Type (LoadOp (add GPR:$rs1, tglobaladdr:$gd))), + (Inst GPR:$rs1, tglobaladdr:$gd)>; +} + +defm : LdPat<extloadi8, uimm12, LD32B, i32>; +defm : LdPat<zextloadi8, uimm12, LD32B, i32>; +let Predicates = [iHasE2] in { + defm : LdPat<sextloadi8, uimm12, LD32BS, i32>; +} +defm : LdPat<extloadi16, uimm12_1, LD32H, i32>; +defm : LdPat<zextloadi16, uimm12_1, LD32H, i32>; +let Predicates = [iHasE2] in { +defm : LdPat<sextloadi16, uimm12_1, LD32HS, i32>; +} +defm : LdPat<load, uimm12_2, LD32W, i32>; + +multiclass LdrPat<PatFrag LoadOp, Instruction Inst, ValueType Type> { + def : Pat<(Type (LoadOp (add GPR:$rs1, GPR:$rs2))), (Inst GPR:$rs1, GPR:$rs2, 0)>; + def : Pat<(Type (LoadOp (add GPR:$rs1, (shl GPR:$rs2, (i32 1))))), (Inst GPR:$rs1, GPR:$rs2, 1)>; + def : Pat<(Type (LoadOp (add GPR:$rs1, (shl GPR:$rs2, (i32 2))))), (Inst GPR:$rs1, GPR:$rs2, 2)>; + def : Pat<(Type (LoadOp (add GPR:$rs1, (shl GPR:$rs2, (i32 3))))), (Inst GPR:$rs1, GPR:$rs2, 3)>; +} + +let Predicates = [iHas2E3] in { + defm : LdrPat<zextloadi8, LDR32B, i32>; + defm : LdrPat<sextloadi8, LDR32BS, i32>; + defm : LdrPat<extloadi8, LDR32BS, i32>; + defm : LdrPat<zextloadi16, LDR32H, i32>; + defm : LdrPat<sextloadi16, LDR32HS, i32>; + defm : LdrPat<extloadi16, LDR32HS, i32>; + defm : 
LdrPat<load, LDR32W, i32>; +} + +multiclass StPat<PatFrag StoreOp, ValueType Type, ImmLeaf imm_type, Instruction Inst> { + def : Pat<(StoreOp Type:$rs2, GPR:$rs1), (Inst Type:$rs2, GPR:$rs1, 0)>; + def : Pat<(StoreOp Type:$rs2, frameindex:$rs1), (Inst Type:$rs2, (i32 (to_tframeindex tframeindex:$rs1)), 0)>; + def : Pat<(StoreOp Type:$rs2, (add GPR:$rs1, imm_type:$uimm12)), + (Inst Type:$rs2, GPR:$rs1, imm_type:$uimm12)>; + def : Pat<(StoreOp Type:$rs2, (add frameindex:$rs1, imm_type:$uimm12)), + (Inst Type:$rs2, (i32 (to_tframeindex tframeindex:$rs1)), imm_type:$uimm12)>; + def : Pat<(StoreOp Type:$rs2, (eqToAdd frameindex:$rs1, imm_type:$uimm12)), + (Inst Type:$rs2, (i32 (to_tframeindex tframeindex:$rs1)), imm_type:$uimm12)>; +} + +defm : StPat<truncstorei8, i32, uimm12, ST32B>; +defm : StPat<truncstorei16, i32, uimm12_1, ST32H>; +defm : StPat<store, i32, uimm12_2, ST32W>; + +multiclass StrPat<PatFrag StoreOp, ValueType Type, Instruction Inst> { + def : Pat<(StoreOp Type:$rz, (add GPR:$rs1, GPR:$rs2)), (Inst Type:$rz, GPR:$rs1, GPR:$rs2, 0)>; + def : Pat<(StoreOp Type:$rz, (add GPR:$rs1, (shl GPR:$rs2, (i32 1)))), (Inst Type:$rz, GPR:$rs1, GPR:$rs2, 1)>; + def : Pat<(StoreOp Type:$rz, (add GPR:$rs1, (shl GPR:$rs2, (i32 2)))), (Inst Type:$rz, GPR:$rs1, GPR:$rs2, 2)>; + def : Pat<(StoreOp Type:$rz, (add GPR:$rs1, (shl GPR:$rs2, (i32 3)))), (Inst Type:$rz, GPR:$rs1, GPR:$rs2, 3)>; +} + +let Predicates = [iHas2E3] in { + defm : StrPat<truncstorei8, i32, STR32B>; + defm : StrPat<truncstorei16, i32, STR32H>; + defm : StrPat<store, i32, STR32W>; + + // Sext & Zext Patterns + def : Pat<(sext_inreg GPR:$src, i1), (SEXT32 GPR:$src, 0, 0)>; + def : Pat<(and GPR:$src, 255), (ZEXT32 GPR:$src, 7, 0)>; + def : Pat<(and GPR:$src, 65535), (ZEXT32 GPR:$src, 15, 0)>; +} + +// Constant materialize patterns. +let Predicates = [iHasE2] in + def : Pat<(i32 imm:$imm), + (ORI32 (MOVIH32 (uimm32_hi16 imm:$imm)), (uimm32_lo16 imm:$imm))>; + + +// Other operations. 
+let Predicates = [iHasE2] in { + def : Pat<(rotl GPR:$rs1, GPR:$rs2), + (ROTL32 GPR:$rs1, (ANDI32 GPR:$rs2, 0x1f))>; + let Predicates = [iHas2E3] in { + def : Pat<(bitreverse GPR:$rx), (BREV32 GPR:$rx)>; + def : Pat<(bswap GPR:$rx), (REVB32 GPR:$rx)>; + } + def : Pat<(i32 (ctlz GPR:$rx)), (FF1 GPR:$rx)>; +} //===----------------------------------------------------------------------===// // Pseudo for assembly diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td b/llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td index c98f43622155..6a9dd03dfa1d 100644 --- a/llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td +++ b/llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td @@ -33,16 +33,6 @@ def br_symbol_16bit : Operand<iPTR> { let OperandType = "OPERAND_PCREL"; } -def SPOperand : AsmOperandClass { - let Name = "SPOperand"; - let RenderMethod = "addRegOperands"; - let DiagnosticType = !strconcat("Invalid", Name); -} - -def SPOp : RegisterOperand<GPR> { - let ParserMatchClass = SPOperand; -} - def constpool_symbol_16bit : Operand<iPTR> { let ParserMatchClass = Constpool; let EncoderMethod = @@ -83,7 +73,7 @@ let isCommutable = 1 in { def XOR16 : R16_XZ_BINOP<0b1011, 0b01, "xor16", BinOpFrag<(xor node:$LHS, node:$RHS)>>; def NOR16 : R16_XZ_BINOP<0b1011, 0b10, "nor16", BinOpFrag<(not (or node:$LHS, node:$RHS))>>; let isCodeGenOnly = 1 in - def NOT16 : R16_XZ_UNOP<0b1011, 0b10, "not16">; + def NOT16 : R16_Z_UNOP<0b1011, 0b10, "not16">; def MULT16 : R16_XZ_BINOP<0b1111, 0b00, "mult16", BinOpFrag<(mul node:$LHS, node:$RHS)>>; } def SUBU16XZ : R16_XZ_BINOP<0b1000, 0b10, "subu16", BinOpFrag<(sub node:$LHS, node:$RHS)>>; @@ -108,7 +98,7 @@ let Constraints = "$rZ = $rz", isReMaterializable = 1, isAsCheapAsAMove = 1 in { } let isAdd = 1 in -def ADDI16ZSP : I16_Z_8<0b011, (ins SPOp:$sp, uimm8_2:$imm8), +def ADDI16ZSP : I16_Z_8<0b011, (ins GPRSP:$sp, uimm8_2:$imm8), "addi16\t$rz, $sp, $imm8">; let isAdd = 1 in @@ -150,9 +140,9 @@ def ST16W : I16_XZ_LDST<AddrMode16W, 0b110, "st16.w", (outs), (ins 
mGPR:$rz, mGPR:$rx, uimm5_2:$imm)>; def LD16WSP : I16_ZSP_LDST<AddrMode16W, 0b011, "ld16.w", - (outs mGPR:$rz), (ins SPOp:$sp, uimm8_2:$addr)>; + (outs mGPR:$rz), (ins GPRSP:$sp, uimm8_2:$addr)>; def ST16WSP : I16_ZSP_LDST<AddrMode16W, 0b111, "st16.w", - (outs), (ins mGPR:$rz, SPOp:$sp, uimm8_2:$addr)>; + (outs), (ins mGPR:$rz, GPRSP:$sp, uimm8_2:$addr)>; //===----------------------------------------------------------------------===// // Compare instructions. @@ -450,3 +440,150 @@ def JBF16 : JBranchPseudo<(outs), let mayLoad = 1, Size = 2, isCodeGenOnly = 0 in def PseudoLRW16 : CSKYPseudo<(outs mGPR:$rz), (ins bare_symbol:$src), "lrw16 $rz, $src", []>; + + +//===----------------------------------------------------------------------===// +// Compress Instruction tablegen backend. +//===----------------------------------------------------------------------===// + +def : CompressPat<(ADDU32 sGPR:$rd, sGPR:$rd, sGPR:$rs2), + (ADDU16XZ sGPR:$rd, sGPR:$rs2)>; +def : CompressPat<(ADDU32 sGPR:$rd, sGPR:$rs1, sGPR:$rd), + (ADDU16XZ sGPR:$rd, sGPR:$rs1)>; +def : CompressPat<(ADDU32 mGPR:$rd, mGPR:$rs1, mGPR:$rs2), + (ADDU16 mGPR:$rd, mGPR:$rs1, mGPR:$rs2)>; +def : CompressPat<(SUBU32 sGPR:$rd, sGPR:$rd, sGPR:$rs2), + (SUBU16XZ sGPR:$rd, sGPR:$rs2)>; +def : CompressPat<(SUBU32 mGPR:$rd, mGPR:$rs1, mGPR:$rs2), + (SUBU16 mGPR:$rd, mGPR:$rs1, mGPR:$rs2)>; + +def : CompressPat< + (ADDC32 sGPR:$rd, CARRY:$cout, sGPR:$rd, sGPR:$rs2, CARRY:$cout), + (ADDC16 sGPR:$rd, CARRY:$cout, sGPR:$rs2, CARRY:$cout) + >; +def : CompressPat< + (SUBC32 sGPR:$rd, CARRY:$cout, sGPR:$rd, sGPR:$rs2, CARRY:$cout), + (SUBC16 sGPR:$rd, CARRY:$cout, sGPR:$rs2, CARRY:$cout) + >; + +def : CompressPat<(ADDI32 mGPR:$rd, mGPR:$rs, oimm3:$imm), + (ADDI16XZ mGPR:$rd, mGPR:$rs, oimm3:$imm)>; +def : CompressPat<(SUBI32 mGPR:$rd, mGPR:$rs, oimm3:$imm), + (SUBI16XZ mGPR:$rd, mGPR:$rs, oimm3:$imm)>; + +def : CompressPat<(ADDI32 mGPR:$rd, mGPR:$rd, oimm8:$imm), + (ADDI16 mGPR:$rd, oimm8:$imm)>; +def : 
CompressPat<(SUBI32 mGPR:$rd, mGPR:$rd, oimm8:$imm), + (SUBI16 mGPR:$rd, oimm8:$imm)>; + +def : CompressPat<(ADDI32 GPRSP:$sp, GPRSP:$sp, uimm7_2:$imm), + (ADDI16SPSP GPRSP:$sp, GPRSP:$sp, uimm7_2:$imm)>; +def : CompressPat<(SUBI32 GPRSP:$sp, GPRSP:$sp, uimm7_2:$imm), + (SUBI16SPSP GPRSP:$sp, GPRSP:$sp, uimm7_2:$imm)>; + +def : CompressPat<(ADDI32 mGPR:$rd, GPRSP:$sp, uimm8_2:$imm), + (ADDI16ZSP mGPR:$rd, GPRSP:$sp, uimm8_2:$imm)>; + +def : CompressPat<(MULT32 sGPR:$rd, sGPR:$rd, sGPR:$rs2), + (MULT16 sGPR:$rd, sGPR:$rs2)>; +def : CompressPat<(MULT32 sGPR:$rd, sGPR:$rs1, sGPR:$rd), + (MULT16 sGPR:$rd, sGPR:$rs1)>; +def : CompressPat<(AND32 sGPR:$rd, sGPR:$rd, sGPR:$rs2), + (AND16 sGPR:$rd, sGPR:$rs2)>; +def : CompressPat<(AND32 sGPR:$rd, sGPR:$rs1, sGPR:$rd), + (AND16 sGPR:$rd, sGPR:$rs1)>; +def : CompressPat<(OR32 sGPR:$rd, sGPR:$rd, sGPR:$rs2), + (OR16 sGPR:$rd, sGPR:$rs2)>; +def : CompressPat<(OR32 sGPR:$rd, sGPR:$rs1, sGPR:$rd), + (OR16 sGPR:$rd, sGPR:$rs1)>; +def : CompressPat<(XOR32 sGPR:$rd, sGPR:$rd, sGPR:$rs2), + (XOR16 sGPR:$rd, sGPR:$rs2)>; +def : CompressPat<(XOR32 sGPR:$rd, sGPR:$rs1, sGPR:$rd), + (XOR16 sGPR:$rd, sGPR:$rs1)>; + +def : CompressPat<(ANDN32 sGPR:$rd, sGPR:$rd, sGPR:$rs2), + (ANDN16 sGPR:$rd, sGPR:$rs2)>; +def : CompressPat<(NOR32 sGPR:$rd, sGPR:$rd, sGPR:$rs2), + (NOR16 sGPR:$rd, sGPR:$rs2)>; +def : CompressPat<(LSL32 sGPR:$rd, sGPR:$rd, sGPR:$rs2), + (LSL16 sGPR:$rd, sGPR:$rs2)>; +def : CompressPat<(LSR32 sGPR:$rd, sGPR:$rd, sGPR:$rs2), + (LSR16 sGPR:$rd, sGPR:$rs2)>; +def : CompressPat<(ASR32 sGPR:$rd, sGPR:$rd, sGPR:$rs2), + (ASR16 sGPR:$rd, sGPR:$rs2)>; +def : CompressPat<(ROTL32 sGPR:$rd, sGPR:$rd, sGPR:$rs2), + (ROTL16 sGPR:$rd, sGPR:$rs2)>; + +def : CompressPat<(NOT32 sGPR:$rd, sGPR:$rd), + (NOT16 sGPR:$rd)>; + +let Predicates = [iHas2E3] in +def : CompressPat<(REVB32 sGPR:$rd, sGPR:$rs), + (REVB16 sGPR:$rd, sGPR:$rs)>; + +def : CompressPat<(LSLI32 mGPR:$rd, mGPR:$rs, uimm5:$imm), + (LSLI16 mGPR:$rd, mGPR:$rs, uimm5:$imm)>; +def : 
CompressPat<(LSRI32 mGPR:$rd, mGPR:$rs, uimm5:$imm), + (LSRI16 mGPR:$rd, mGPR:$rs, uimm5:$imm)>; +def : CompressPat<(ASRI32 mGPR:$rd, mGPR:$rs, uimm5:$imm), + (ASRI16 mGPR:$rd, mGPR:$rs, uimm5:$imm)>; + +def : CompressPat<(CMPHS32 CARRY:$ca, sGPR:$rs1, sGPR:$rs2), + (CMPHS16 CARRY:$ca, sGPR:$rs1, sGPR:$rs2)>; +def : CompressPat<(CMPLT32 CARRY:$ca, sGPR:$rs1, sGPR:$rs2), + (CMPLT16 CARRY:$ca, sGPR:$rs1, sGPR:$rs2)>; +def : CompressPat<(CMPNE32 CARRY:$ca, sGPR:$rs1, sGPR:$rs2), + (CMPNE16 CARRY:$ca, sGPR:$rs1, sGPR:$rs2)>; + +def : CompressPat<(CMPHSI32 CARRY:$ca, mGPR:$rs, oimm5:$imm), + (CMPHSI16 CARRY:$ca, mGPR:$rs, oimm5:$imm)>; +def : CompressPat<(CMPLTI32 CARRY:$ca, mGPR:$rs, oimm5:$imm), + (CMPLTI16 CARRY:$ca, mGPR:$rs, oimm5:$imm)>; +def : CompressPat<(CMPNEI32 CARRY:$ca, mGPR:$rs, uimm5:$imm), + (CMPNEI16 CARRY:$ca, mGPR:$rs, uimm5:$imm)>; + +def : CompressPat<(JSR32 sGPR:$rd), + (JSR16 sGPR:$rd)>; + + +def : CompressPat<(MVCV32 sGPR:$rd, CARRY:$ca), + (MVCV16 sGPR:$rd, CARRY:$ca)>; +def : CompressPat<(MOV32 sGPR:$rd, sGPR:$ca), + (MOV16 sGPR:$rd, sGPR:$ca)>; +def : CompressPat<(MOVI32 mGPR:$rd, uimm8:$imm), + (MOVI16 mGPR:$rd, uimm8:$imm)>; + +def : CompressPat<(LD32B mGPR:$rd, mGPR:$rs, uimm5:$imm), + (LD16B mGPR:$rd, mGPR:$rs, uimm5:$imm)>; +def : CompressPat<(LD32H mGPR:$rd, mGPR:$rs, uimm5_1:$imm), + (LD16H mGPR:$rd, mGPR:$rs, uimm5_1:$imm)>; +def : CompressPat<(LD32W mGPR:$rd, mGPR:$rs, uimm5_2:$imm), + (LD16W mGPR:$rd, mGPR:$rs, uimm5_2:$imm)>; +def : CompressPat<(LD32W mGPR:$rd, GPRSP:$sp, uimm8_2:$imm), + (LD16WSP mGPR:$rd, GPRSP:$sp, uimm8_2:$imm)>; + +def : CompressPat<(ST32B mGPR:$rd, mGPR:$rs, uimm5:$imm), + (ST16B mGPR:$rd, mGPR:$rs, uimm5:$imm)>; +def : CompressPat<(ST32H mGPR:$rd, mGPR:$rs, uimm5_1:$imm), + (ST16H mGPR:$rd, mGPR:$rs, uimm5_1:$imm)>; +def : CompressPat<(ST32W mGPR:$rd, mGPR:$rs, uimm5_2:$imm), + (ST16W mGPR:$rd, mGPR:$rs, uimm5_2:$imm)>; +def : CompressPat<(ST32W mGPR:$rd, GPRSP:$sp, uimm8_2:$imm), + (ST16WSP mGPR:$rd, 
GPRSP:$sp, uimm8_2:$imm)>; + +let Predicates = [HasBTST16] in +def : CompressPat<(BTSTI32 CARRY:$ca, mGPR:$rs, uimm5:$imm), + (BTSTI16 CARRY:$ca, mGPR:$rs, uimm5:$imm)>; +def : CompressPat<(BCLRI32 mGPR:$rd, mGPR:$rd, uimm5:$imm), + (BCLRI16 mGPR:$rd, uimm5:$imm)>; +def : CompressPat<(BSETI32 mGPR:$rd, mGPR:$rd, uimm5:$imm), + (BSETI16 mGPR:$rd, uimm5:$imm)>; + +def : CompressPat<(ZEXTB32 sGPR:$rd, sGPR:$rs), + (ZEXTB16 sGPR:$rd, sGPR:$rs)>; +def : CompressPat<(ZEXTH32 sGPR:$rd, sGPR:$rs), + (ZEXTH16 sGPR:$rd, sGPR:$rs)>; +def : CompressPat<(SEXTB32 sGPR:$rd, sGPR:$rs), + (SEXTB16 sGPR:$rd, sGPR:$rs)>; +def : CompressPat<(SEXTH32 sGPR:$rd, sGPR:$rs), + (SEXTH16 sGPR:$rd, sGPR:$rs)>; diff --git a/llvm/lib/Target/CSKY/CSKYMCInstLower.cpp b/llvm/lib/Target/CSKY/CSKYMCInstLower.cpp index c42a56bfb04e..7e0b9bcd7549 100644 --- a/llvm/lib/Target/CSKY/CSKYMCInstLower.cpp +++ b/llvm/lib/Target/CSKY/CSKYMCInstLower.cpp @@ -114,4 +114,4 @@ bool CSKYMCInstLower::lowerOperand(const MachineOperand &MO, break; } return true; -}
\ No newline at end of file +} diff --git a/llvm/lib/Target/CSKY/CSKYRegisterInfo.cpp b/llvm/lib/Target/CSKY/CSKYRegisterInfo.cpp index a1d45fea534b..57b6ae3c27b5 100644 --- a/llvm/lib/Target/CSKY/CSKYRegisterInfo.cpp +++ b/llvm/lib/Target/CSKY/CSKYRegisterInfo.cpp @@ -88,8 +88,187 @@ CSKYRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_I32_SaveList; } +static bool IsLegalOffset(const CSKYInstrInfo *TII, MachineInstr *MI, + int &Offset) { + const MCInstrDesc &Desc = MI->getDesc(); + unsigned AddrMode = (Desc.TSFlags & CSKYII::AddrModeMask); + unsigned i = 0; + for (; !MI->getOperand(i).isFI(); ++i) { + assert(i + 1 < MI->getNumOperands() && + "Instr doesn't have FrameIndex operand!"); + } + + if (MI->getOpcode() == CSKY::ADDI32) { + if (!isUInt<12>(std::abs(Offset) - 1)) + return false; + if (Offset < 0) { + MI->setDesc(TII->get(CSKY::SUBI32)); + Offset = -Offset; + } + + return true; + } + + if (MI->getOpcode() == CSKY::ADDI16XZ) + return false; + + if (Offset < 0) + return false; + + unsigned NumBits = 0; + unsigned Scale = 1; + switch (AddrMode) { + case CSKYII::AddrMode32B: + Scale = 1; + NumBits = 12; + break; + case CSKYII::AddrMode32H: + Scale = 2; + NumBits = 12; + break; + case CSKYII::AddrMode32WD: + Scale = 4; + NumBits = 12; + break; + case CSKYII::AddrMode16B: + Scale = 1; + NumBits = 5; + break; + case CSKYII::AddrMode16H: + Scale = 2; + NumBits = 5; + break; + case CSKYII::AddrMode16W: + Scale = 4; + NumBits = 5; + break; + case CSKYII::AddrMode32SDF: + Scale = 4; + NumBits = 8; + break; + default: + llvm_unreachable("Unsupported addressing mode!"); + } + + // Cannot encode offset. + if ((Offset & (Scale - 1)) != 0) + return false; + + unsigned Mask = (1 << NumBits) - 1; + if ((unsigned)Offset <= Mask * Scale) + return true; + + // Offset out of range. 
+ return false; +} + void CSKYRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const { assert(SPAdj == 0 && "Unexpected non-zero SPAdj value"); -}
\ No newline at end of file + + MachineInstr *MI = &*II; + MachineBasicBlock &MBB = *MI->getParent(); + MachineFunction &MF = *MI->getParent()->getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const CSKYInstrInfo *TII = MF.getSubtarget<CSKYSubtarget>().getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + const CSKYSubtarget &STI = MF.getSubtarget<CSKYSubtarget>(); + + switch (MI->getOpcode()) { + default: + break; + case CSKY::RESTORE_CARRY: { + Register NewReg = STI.hasE2() + ? MRI.createVirtualRegister(&CSKY::GPRRegClass) + : MRI.createVirtualRegister(&CSKY::mGPRRegClass); + + auto *Temp = BuildMI(MBB, II, DL, TII->get(CSKY::LD32W), NewReg) + .add(MI->getOperand(1)) + .add(MI->getOperand(2)) + .getInstr(); + + BuildMI(MBB, II, DL, TII->get(STI.hasE2() ? CSKY::BTSTI32 : CSKY::BTSTI16), + MI->getOperand(0).getReg()) + .addReg(NewReg, getKillRegState(true)) + .addImm(0); + + MI = Temp; + + MBB.erase(II); + break; + } + case CSKY::SPILL_CARRY: { + Register NewReg; + if (STI.hasE2()) { + NewReg = MRI.createVirtualRegister(&CSKY::GPRRegClass); + BuildMI(MBB, II, DL, TII->get(CSKY::MVC32), NewReg) + .add(MI->getOperand(0)); + } else { + NewReg = MRI.createVirtualRegister(&CSKY::mGPRRegClass); + BuildMI(MBB, II, DL, TII->get(CSKY::MOVI16), NewReg).addImm(0); + BuildMI(MBB, II, DL, TII->get(CSKY::ADDC16)) + .addReg(NewReg, RegState::Define) + .addReg(MI->getOperand(0).getReg(), RegState::Define) + .addReg(NewReg, getKillRegState(true)) + .addReg(NewReg, getKillRegState(true)) + .addReg(MI->getOperand(0).getReg()); + + BuildMI(MBB, II, DL, TII->get(CSKY::BTSTI16), MI->getOperand(0).getReg()) + .addReg(NewReg) + .addImm(0); + } + + MI = BuildMI(MBB, II, DL, TII->get(CSKY::ST32W)) + .addReg(NewReg, getKillRegState(true)) + .add(MI->getOperand(1)) + .add(MI->getOperand(2)) + .getInstr(); + + MBB.erase(II); + + break; + } + } + + int FrameIndex = MI->getOperand(FIOperandNum).getIndex(); + Register FrameReg; + int Offset = getFrameLowering(MF) + 
->getFrameIndexReference(MF, FrameIndex, FrameReg) + .getFixed() + + MI->getOperand(FIOperandNum + 1).getImm(); + + if (!isInt<32>(Offset)) + report_fatal_error( + "Frame offsets outside of the signed 32-bit range not supported"); + + bool FrameRegIsKill = false; + MachineBasicBlock::iterator NewII(MI); + if (!IsLegalOffset(TII, MI, Offset)) { + assert(isInt<32>(Offset) && "Int32 expected"); + // The offset won't fit in an immediate, so use a scratch register instead + // Modify Offset and FrameReg appropriately + assert(Offset >= 0); + Register ScratchReg = TII->movImm(MBB, NewII, DL, Offset); + BuildMI(MBB, NewII, DL, + TII->get(STI.hasE2() ? CSKY::ADDU32 : CSKY::ADDU16XZ), ScratchReg) + .addReg(ScratchReg, RegState::Kill) + .addReg(FrameReg); + + Offset = 0; + FrameReg = ScratchReg; + FrameRegIsKill = true; + } + + if (Offset == 0 && + (MI->getOpcode() == CSKY::ADDI32 || MI->getOpcode() == CSKY::ADDI16XZ)) { + MI->setDesc(TII->get(TargetOpcode::COPY)); + MI->getOperand(FIOperandNum) + .ChangeToRegister(FrameReg, false, false, FrameRegIsKill); + MI->RemoveOperand(FIOperandNum + 1); + } else { + MI->getOperand(FIOperandNum) + .ChangeToRegister(FrameReg, false, false, FrameRegIsKill); + MI->getOperand(FIOperandNum + 1).ChangeToImmediate(Offset); + } +} diff --git a/llvm/lib/Target/CSKY/CSKYRegisterInfo.h b/llvm/lib/Target/CSKY/CSKYRegisterInfo.h index 779ea6493c7e..5b3b62ec0db2 100644 --- a/llvm/lib/Target/CSKY/CSKYRegisterInfo.h +++ b/llvm/lib/Target/CSKY/CSKYRegisterInfo.h @@ -38,6 +38,18 @@ public: void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const override; + + bool requiresFrameIndexScavenging(const MachineFunction &MF) const override { + return true; + } + + bool requiresRegisterScavenging(const MachineFunction &MF) const override { + return true; + } + + bool useFPForScavengingIndex(const MachineFunction &MF) const override { + return false; + } }; } // namespace llvm diff --git 
a/llvm/lib/Target/CSKY/CSKYRegisterInfo.td b/llvm/lib/Target/CSKY/CSKYRegisterInfo.td index 7548c22bb2c5..ade5c7f795af 100644 --- a/llvm/lib/Target/CSKY/CSKYRegisterInfo.td +++ b/llvm/lib/Target/CSKY/CSKYRegisterInfo.td @@ -168,6 +168,11 @@ def mGPR : RegisterClass<"CSKY", [i32], 32, let Size = 32; } +// Register class for SP only. +def GPRSP : RegisterClass<"CSKY", [i32], 32, (add R14)> { + let Size = 32; +} + def GPRPair : RegisterClass<"CSKY", [untyped], 32, (add GPRTuple)> { let Size = 64; } diff --git a/llvm/lib/Target/Hexagon/Hexagon.td b/llvm/lib/Target/Hexagon/Hexagon.td index 7518fd774a48..ae811b30434d 100644 --- a/llvm/lib/Target/Hexagon/Hexagon.td +++ b/llvm/lib/Target/Hexagon/Hexagon.td @@ -29,6 +29,8 @@ def ProcTinyCore: SubtargetFeature<"tinycore", "HexagonProcFamily", // Hexagon ISA Extensions def ExtensionZReg: SubtargetFeature<"zreg", "UseZRegOps", "true", "Hexagon ZReg extension instructions">; +def ExtensionHVXQFloat: SubtargetFeature<"hvx-qfloat", "UseHVXQFloatOps", + "true", "Hexagon HVX QFloating point instructions">; def ExtensionHVX: SubtargetFeature<"hvx", "HexagonHVXVersion", "Hexagon::ArchEnum::V60", "Hexagon HVX instructions">; @@ -52,6 +54,10 @@ def ExtensionHVXV68: SubtargetFeature<"hvxv68", "HexagonHVXVersion", "Hexagon::ArchEnum::V68", "Hexagon HVX instructions", [ExtensionHVXV60, ExtensionHVXV62, ExtensionHVXV65, ExtensionHVXV66, ExtensionHVXV67]>; +def ExtensionHVXV69: SubtargetFeature<"hvxv69", "HexagonHVXVersion", + "Hexagon::ArchEnum::V69", "Hexagon HVX instructions", + [ExtensionHVXV60, ExtensionHVXV62, ExtensionHVXV65, ExtensionHVXV66, + ExtensionHVXV67, ExtensionHVXV68]>; def ExtensionHVX64B: SubtargetFeature<"hvx-length64b", "UseHVX64BOps", "true", "Hexagon HVX 64B instructions", [ExtensionHVX]>; @@ -61,6 +67,9 @@ def ExtensionHVX128B: SubtargetFeature<"hvx-length128b", "UseHVX128BOps", def ExtensionAudio: SubtargetFeature<"audio", "UseAudioOps", "true", "Hexagon Audio extension instructions">; +def ExtensionHVXIEEEFP: 
SubtargetFeature<"hvx-ieee-fp", "UseHVXIEEEFPOps", + "true", "Hexagon HVX IEEE floating point instructions">; + def FeatureCompound: SubtargetFeature<"compound", "UseCompound", "true", "Use compound instructions">; def FeaturePackets: SubtargetFeature<"packets", "UsePackets", "true", @@ -88,6 +97,8 @@ def FeatureReservedR19: SubtargetFeature<"reserved-r19", "ReservedR19", def FeatureNoreturnStackElim: SubtargetFeature<"noreturn-stack-elim", "NoreturnStackElim", "true", "Eliminate stack allocation in a noreturn function when possible">; +def FeatureCabac: SubtargetFeature<"cabac", "UseCabac", "false", + "Emit the CABAC instruction">; //===----------------------------------------------------------------------===// // Hexagon Instruction Predicate Definitions. @@ -112,6 +123,8 @@ def UseHVXV67 : Predicate<"HST->useHVXV67Ops()">, AssemblerPredicate<(all_of ExtensionHVXV67)>; def UseHVXV68 : Predicate<"HST->useHVXV68Ops()">, AssemblerPredicate<(all_of ExtensionHVXV68)>; +def UseHVXV69 : Predicate<"HST->useHVXV69Ops()">, + AssemblerPredicate<(all_of ExtensionHVXV69)>; def UseAudio : Predicate<"HST->useAudioOps()">, AssemblerPredicate<(all_of ExtensionAudio)>; def UseZReg : Predicate<"HST->useZRegOps()">, @@ -119,6 +132,11 @@ def UseZReg : Predicate<"HST->useZRegOps()">, def UseCompound : Predicate<"HST->useCompound()">; def HasPreV65 : Predicate<"HST->hasPreV65()">, AssemblerPredicate<(all_of FeaturePreV65)>; +def UseHVXIEEEFP : Predicate<"HST->useHVXIEEEFPOps()">, + AssemblerPredicate<(all_of ExtensionHVXIEEEFP)>; +def UseHVXQFloat : Predicate<"HST->useHVXQFloatOps()">, + AssemblerPredicate<(all_of ExtensionHVXQFloat)>; +def UseHVXFloatingPoint: Predicate<"HST->useHVXFloatingPoint()">; def HasMemNoShuf : Predicate<"HST->hasMemNoShuf()">, AssemblerPredicate<(all_of FeatureMemNoShuf)>; def UseUnsafeMath : Predicate<"HST->useUnsafeMath()">; @@ -127,6 +145,8 @@ def NotOptTinyCore : Predicate<"!HST->isTinyCore() ||" let RecomputePerFunction = 1; } def UseSmallData : 
Predicate<"HST->useSmallData()">; +def UseCabac : Predicate<"HST->useCabac()">, + AssemblerPredicate<(any_of FeatureCabac)>; def Hvx64: HwMode<"+hvx-length64b">; def Hvx128: HwMode<"+hvx-length128b">; @@ -299,7 +319,7 @@ def changeAddrMode_rr_ur: InstrMapping { let ValueCols = [["BaseLongOffset"]]; } -def changeAddrMode_ur_rr : InstrMapping { +def changeAddrMode_ur_rr: InstrMapping { let FilterClass = "ImmRegShl"; let RowFields = ["CextOpcode", "PredSense", "PNewValue", "isNVStore"]; let ColFields = ["addrMode"]; @@ -370,40 +390,55 @@ class Proc<string Name, SchedMachineModel Model, def : Proc<"generic", HexagonModelV60, [ArchV5, ArchV55, ArchV60, FeatureCompound, FeatureDuplex, FeaturePreV65, FeatureMemops, - FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData]>; + FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData, + FeatureCabac]>; def : Proc<"hexagonv5", HexagonModelV5, [ArchV5, FeatureCompound, FeatureDuplex, FeaturePreV65, FeatureMemops, - FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData]>; + FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData, + FeatureCabac]>; def : Proc<"hexagonv55", HexagonModelV55, [ArchV5, ArchV55, FeatureCompound, FeatureDuplex, FeaturePreV65, FeatureMemops, - FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData]>; + FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData, + FeatureCabac]>; def : Proc<"hexagonv60", HexagonModelV60, [ArchV5, ArchV55, ArchV60, FeatureCompound, FeatureDuplex, FeaturePreV65, FeatureMemops, - FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData]>; + FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData, + FeatureCabac]>; def : Proc<"hexagonv62", HexagonModelV62, [ArchV5, ArchV55, ArchV60, ArchV62, FeatureCompound, FeatureDuplex, FeaturePreV65, FeatureMemops, - FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData]>; + FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData, + FeatureCabac]>; def : Proc<"hexagonv65", HexagonModelV65, [ArchV5, ArchV55, ArchV60, ArchV62, 
ArchV65, FeatureCompound, FeatureDuplex, FeatureMemNoShuf, FeatureMemops, - FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData]>; + FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData, + FeatureCabac]>; def : Proc<"hexagonv66", HexagonModelV66, [ArchV5, ArchV55, ArchV60, ArchV62, ArchV65, ArchV66, FeatureCompound, FeatureDuplex, FeatureMemNoShuf, FeatureMemops, - FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData]>; + FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData, + FeatureCabac]>; def : Proc<"hexagonv67", HexagonModelV67, [ArchV5, ArchV55, ArchV60, ArchV62, ArchV65, ArchV66, ArchV67, FeatureCompound, FeatureDuplex, FeatureMemNoShuf, FeatureMemops, - FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData]>; + FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData, + FeatureCabac]>; def : Proc<"hexagonv68", HexagonModelV68, [ArchV5, ArchV55, ArchV60, ArchV62, ArchV65, ArchV66, ArchV67, ArchV68, FeatureCompound, FeatureDuplex, FeatureMemNoShuf, FeatureMemops, - FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData]>; + FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData, + FeatureCabac]>; +def : Proc<"hexagonv69", HexagonModelV69, + [ArchV5, ArchV55, ArchV60, ArchV62, ArchV65, ArchV66, ArchV67, + ArchV68, ArchV69, + FeatureCompound, FeatureDuplex, FeatureMemNoShuf, FeatureMemops, + FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData, + FeatureCabac]>; // Need to update the correct features for tiny core. // Disable NewValueJumps since the packetizer is unable to handle a packet with // a new value jump and another SLOT0 instruction. 
diff --git a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp index 8e6a01e3a186..411078052e0f 100644 --- a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp +++ b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp @@ -773,6 +773,67 @@ void HexagonAsmPrinter::emitInstruction(const MachineInstr *MI) { OutStreamer->emitInstruction(MCB, getSubtargetInfo()); } +void HexagonAsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind) { + static const int8_t NoopsInSledCount = 4; + // We want to emit the following pattern: + // + // .L_xray_sled_N: + // <xray_sled_base>: + // { jump .Ltmp0 } + // { nop + // nop + // nop + // nop } + // .Ltmp0: + // + // We need the 4 nop words because at runtime, we'd be patching over the + // full 5 words with the following pattern: + // + // <xray_sled_n>: + // { immext(#...) // upper 26-bits of trampoline + // r6 = ##... // lower 6-bits of trampoline + // immext(#...) // upper 26-bits of func id + // r7 = ##... } // lower 6 bits of func id + // { callr r6 } + // + // + auto CurSled = OutContext.createTempSymbol("xray_sled_", true); + OutStreamer->emitLabel(CurSled); + + MCInst *SledJump = new (OutContext) MCInst(); + SledJump->setOpcode(Hexagon::J2_jump); + auto PostSled = OutContext.createTempSymbol(); + SledJump->addOperand(MCOperand::createExpr(HexagonMCExpr::create( + MCSymbolRefExpr::create(PostSled, OutContext), OutContext))); + + // Emit "jump PostSled" instruction, which jumps over the nop series. + MCInst SledJumpPacket; + SledJumpPacket.setOpcode(Hexagon::BUNDLE); + SledJumpPacket.addOperand(MCOperand::createImm(0)); + SledJumpPacket.addOperand(MCOperand::createInst(SledJump)); + + EmitToStreamer(*OutStreamer, SledJumpPacket); + + // FIXME: this will emit individual packets, we should + // special-case this and combine them into a single packet. 
+ emitNops(NoopsInSledCount); + + OutStreamer->emitLabel(PostSled); + recordSled(CurSled, MI, Kind, 0); +} + +void HexagonAsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI) { + EmitSled(MI, SledKind::FUNCTION_ENTER); +} + +void HexagonAsmPrinter::LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI) { + EmitSled(MI, SledKind::FUNCTION_EXIT); +} + +void HexagonAsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI) { + EmitSled(MI, SledKind::TAIL_CALL); +} + extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeHexagonAsmPrinter() { RegisterAsmPrinter<HexagonAsmPrinter> X(getTheHexagonTarget()); } diff --git a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h index 3932def87854..93d5f1dce7af 100644 --- a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h +++ b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h @@ -36,7 +36,11 @@ class TargetMachine; bool runOnMachineFunction(MachineFunction &Fn) override { Subtarget = &Fn.getSubtarget<HexagonSubtarget>(); - return AsmPrinter::runOnMachineFunction(Fn); + const bool Modified = AsmPrinter::runOnMachineFunction(Fn); + // Emit the XRay table for this function. + emitXRayTable(); + + return Modified; } StringRef getPassName() const override { @@ -47,6 +51,16 @@ class TargetMachine; const override; void emitInstruction(const MachineInstr *MI) override; + + //===------------------------------------------------------------------===// + // XRay implementation + //===------------------------------------------------------------------===// + // XRay-specific lowering for Hexagon. 
+ void LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI); + void LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI); + void LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI); + void EmitSled(const MachineInstr &MI, SledKind Kind); + void HexagonProcessInstruction(MCInst &Inst, const MachineInstr &MBB); void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O); diff --git a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp index 2c5ad3b589d2..428d25da6dbc 100644 --- a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp +++ b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp @@ -995,8 +995,8 @@ bool DeadCodeElimination::runOnNode(MachineDomTreeNode *N) { MachineBasicBlock *B = N->getBlock(); std::vector<MachineInstr*> Instrs; - for (auto I = B->rbegin(), E = B->rend(); I != E; ++I) - Instrs.push_back(&*I); + for (MachineInstr &MI : llvm::reverse(*B)) + Instrs.push_back(&MI); for (auto MI : Instrs) { unsigned Opc = MI->getOpcode(); @@ -3084,8 +3084,7 @@ void HexagonLoopRescheduling::moveGroup(InstrGroup &G, MachineBasicBlock &LB, .addMBB(&LB); RegMap.insert(std::make_pair(G.Inp.Reg, PhiR)); - for (unsigned i = G.Ins.size(); i > 0; --i) { - const MachineInstr *SI = G.Ins[i-1]; + for (const MachineInstr *SI : llvm::reverse(G.Ins)) { unsigned DR = getDefReg(SI); const TargetRegisterClass *RC = MRI->getRegClass(DR); Register NewDR = MRI->createVirtualRegister(RC); @@ -3156,20 +3155,20 @@ bool HexagonLoopRescheduling::processLoop(LoopCand &C) { // if that instruction could potentially be moved to the front of the loop: // the output of the loop cannot be used in a non-shuffling instruction // in this loop. 
- for (auto I = C.LB->rbegin(), E = C.LB->rend(); I != E; ++I) { - if (I->isTerminator()) + for (MachineInstr &MI : llvm::reverse(*C.LB)) { + if (MI.isTerminator()) continue; - if (I->isPHI()) + if (MI.isPHI()) break; RegisterSet Defs; - HBS::getInstrDefs(*I, Defs); + HBS::getInstrDefs(MI, Defs); if (Defs.count() != 1) continue; Register DefR = Defs.find_first(); if (!DefR.isVirtual()) continue; - if (!isBitShuffle(&*I, DefR)) + if (!isBitShuffle(&MI, DefR)) continue; bool BadUse = false; @@ -3183,8 +3182,7 @@ bool HexagonLoopRescheduling::processLoop(LoopCand &C) { if (UseI->getOperand(Idx+1).getMBB() != C.LB) BadUse = true; } else { - auto F = find(ShufIns, UseI); - if (F == ShufIns.end()) + if (!llvm::is_contained(ShufIns, UseI)) BadUse = true; } } else { @@ -3199,7 +3197,7 @@ bool HexagonLoopRescheduling::processLoop(LoopCand &C) { if (BadUse) continue; - ShufIns.push_back(&*I); + ShufIns.push_back(&MI); } // Partition the list of shuffling instructions into instruction groups, diff --git a/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp b/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp index 8c3b9572201e..a53efeb96961 100644 --- a/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp +++ b/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp @@ -1256,15 +1256,11 @@ void HexagonCommonGEP::removeDeadCode() { BO.push_back(DTN->getBlock()); } - for (unsigned i = BO.size(); i > 0; --i) { - BasicBlock *B = cast<BasicBlock>(BO[i-1]); - BasicBlock::InstListType &IL = B->getInstList(); - - using reverse_iterator = BasicBlock::InstListType::reverse_iterator; - + for (Value *V : llvm::reverse(BO)) { + BasicBlock *B = cast<BasicBlock>(V); ValueVect Ins; - for (reverse_iterator I = IL.rbegin(), E = IL.rend(); I != E; ++I) - Ins.push_back(&*I); + for (Instruction &I : llvm::reverse(*B)) + Ins.push_back(&I); for (ValueVect::iterator I = Ins.begin(), E = Ins.end(); I != E; ++I) { Instruction *In = cast<Instruction>(*I); if (isInstructionTriviallyDead(In)) diff --git 
a/llvm/lib/Target/Hexagon/HexagonDepArch.h b/llvm/lib/Target/Hexagon/HexagonDepArch.h index 7a43a4440b2d..56174dc7e136 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepArch.h +++ b/llvm/lib/Target/Hexagon/HexagonDepArch.h @@ -21,31 +21,32 @@ namespace llvm { namespace Hexagon { -enum class ArchEnum { NoArch, Generic, V5, V55, V60, V62, V65, V66, V67, V68 }; +enum class ArchEnum { NoArch, Generic, V5, V55, V60, V62, V65, V66, V67, V68, V69 }; -static constexpr unsigned ArchValsNumArray[] = {5, 55, 60, 62, 65, 66, 67, 68}; +static constexpr unsigned ArchValsNumArray[] = {5, 55, 60, 62, 65, 66, 67, 68, 69}; static constexpr ArrayRef<unsigned> ArchValsNum(ArchValsNumArray); -static constexpr StringLiteral ArchValsTextArray[] = { "v5", "v55", "v60", "v62", "v65", "v66", "v67", "v68" }; +static constexpr StringLiteral ArchValsTextArray[] = { "v5", "v55", "v60", "v62", "v65", "v66", "v67", "v68", "v69" }; static constexpr ArrayRef<StringLiteral> ArchValsText(ArchValsTextArray); -static constexpr StringLiteral CpuValsTextArray[] = { "hexagonv5", "hexagonv55", "hexagonv60", "hexagonv62", "hexagonv65", "hexagonv66", "hexagonv67", "hexagonv67t", "hexagonv68" }; +static constexpr StringLiteral CpuValsTextArray[] = { "hexagonv5", "hexagonv55", "hexagonv60", "hexagonv62", "hexagonv65", "hexagonv66", "hexagonv67", "hexagonv67t", "hexagonv68", "hexagonv69" }; static constexpr ArrayRef<StringLiteral> CpuValsText(CpuValsTextArray); -static constexpr StringLiteral CpuNickTextArray[] = { "v5", "v55", "v60", "v62", "v65", "v66", "v67", "v67t", "v68" }; +static constexpr StringLiteral CpuNickTextArray[] = { "v5", "v55", "v60", "v62", "v65", "v66", "v67", "v67t", "v68", "v69" }; static constexpr ArrayRef<StringLiteral> CpuNickText(CpuNickTextArray); static const std::map<std::string, ArchEnum> CpuTable{ - {"generic", Hexagon::ArchEnum::V5}, - {"hexagonv5", Hexagon::ArchEnum::V5}, - {"hexagonv55", Hexagon::ArchEnum::V55}, - {"hexagonv60", Hexagon::ArchEnum::V60}, - {"hexagonv62", 
Hexagon::ArchEnum::V62}, - {"hexagonv65", Hexagon::ArchEnum::V65}, - {"hexagonv66", Hexagon::ArchEnum::V66}, - {"hexagonv67", Hexagon::ArchEnum::V67}, - {"hexagonv67t", Hexagon::ArchEnum::V67}, - {"hexagonv68", Hexagon::ArchEnum::V68}, + {"generic", Hexagon::ArchEnum::V5}, + {"hexagonv5", Hexagon::ArchEnum::V5}, + {"hexagonv55", Hexagon::ArchEnum::V55}, + {"hexagonv60", Hexagon::ArchEnum::V60}, + {"hexagonv62", Hexagon::ArchEnum::V62}, + {"hexagonv65", Hexagon::ArchEnum::V65}, + {"hexagonv66", Hexagon::ArchEnum::V66}, + {"hexagonv67", Hexagon::ArchEnum::V67}, + {"hexagonv67t", Hexagon::ArchEnum::V67}, + {"hexagonv68", Hexagon::ArchEnum::V68}, + {"hexagonv69", Hexagon::ArchEnum::V69}, }; static const std::map<std::string, unsigned> ElfFlagsByCpuStr = { @@ -59,6 +60,7 @@ static const std::map<std::string, unsigned> ElfFlagsByCpuStr = { {"hexagonv67", llvm::ELF::EF_HEXAGON_MACH_V67}, {"hexagonv67t", llvm::ELF::EF_HEXAGON_MACH_V67T}, {"hexagonv68", llvm::ELF::EF_HEXAGON_MACH_V68}, + {"hexagonv69", llvm::ELF::EF_HEXAGON_MACH_V69}, }; static const std::map<unsigned, std::string> ElfArchByMachFlags = { {llvm::ELF::EF_HEXAGON_MACH_V5, "V5"}, @@ -70,6 +72,7 @@ static const std::map<unsigned, std::string> ElfArchByMachFlags = { {llvm::ELF::EF_HEXAGON_MACH_V67, "V67"}, {llvm::ELF::EF_HEXAGON_MACH_V67T, "V67T"}, {llvm::ELF::EF_HEXAGON_MACH_V68, "V68"}, + {llvm::ELF::EF_HEXAGON_MACH_V69, "V69"}, }; static const std::map<unsigned, std::string> ElfCpuByMachFlags = { {llvm::ELF::EF_HEXAGON_MACH_V5, "hexagonv5"}, @@ -81,6 +84,7 @@ static const std::map<unsigned, std::string> ElfCpuByMachFlags = { {llvm::ELF::EF_HEXAGON_MACH_V67, "hexagonv67"}, {llvm::ELF::EF_HEXAGON_MACH_V67T, "hexagonv67t"}, {llvm::ELF::EF_HEXAGON_MACH_V68, "hexagonv68"}, + {llvm::ELF::EF_HEXAGON_MACH_V69, "hexagonv69"}, }; } // namespace Hexagon diff --git a/llvm/lib/Target/Hexagon/HexagonDepArch.td b/llvm/lib/Target/Hexagon/HexagonDepArch.td index e743a291f1e5..e4f24e3c2e66 100644 --- 
a/llvm/lib/Target/Hexagon/HexagonDepArch.td +++ b/llvm/lib/Target/Hexagon/HexagonDepArch.td @@ -24,3 +24,5 @@ def ArchV67: SubtargetFeature<"v67", "HexagonArchVersion", "Hexagon::ArchEnum::V def HasV67 : Predicate<"HST->hasV67Ops()">, AssemblerPredicate<(all_of ArchV67)>; def ArchV68: SubtargetFeature<"v68", "HexagonArchVersion", "Hexagon::ArchEnum::V68", "Enable Hexagon V68 architecture">; def HasV68 : Predicate<"HST->hasV68Ops()">, AssemblerPredicate<(all_of ArchV68)>; +def ArchV69: SubtargetFeature<"v69", "HexagonArchVersion", "Hexagon::ArchEnum::V69", "Enable Hexagon V69 architecture">; +def HasV69 : Predicate<"HST->hasV69Ops()">, AssemblerPredicate<(all_of ArchV69)>; diff --git a/llvm/lib/Target/Hexagon/HexagonDepDecoders.inc b/llvm/lib/Target/Hexagon/HexagonDepDecoders.inc index 40f6e14aed13..7164af3ad5c6 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepDecoders.inc +++ b/llvm/lib/Target/Hexagon/HexagonDepDecoders.inc @@ -8,6 +8,7 @@ // Automatically generated file, do not edit! //===----------------------------------------------------------------------===// + #if defined(__clang__) #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wunused-function" diff --git a/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td b/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td index a1db3ae7239d..d195df918293 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td +++ b/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td @@ -11,6 +11,7 @@ def tc_04da405a : InstrItinClass; def tc_05ca8cfd : InstrItinClass; def tc_08a4f1b6 : InstrItinClass; +def tc_0afc8be9 : InstrItinClass; def tc_0b04c6c7 : InstrItinClass; def tc_0ec46cf9 : InstrItinClass; def tc_131f1c81 : InstrItinClass; @@ -21,6 +22,7 @@ def tc_191381c1 : InstrItinClass; def tc_1ad8a370 : InstrItinClass; def tc_1ba8a0cd : InstrItinClass; def tc_20a4bbec : InstrItinClass; +def tc_2120355e : InstrItinClass; def tc_257f6f7c : InstrItinClass; def tc_26a377fe : InstrItinClass; def tc_2b4c548e : InstrItinClass; @@ -28,15 +30,18 @@ 
def tc_2c745bb8 : InstrItinClass; def tc_2d4051cd : InstrItinClass; def tc_2e8f5f6e : InstrItinClass; def tc_309dbb4f : InstrItinClass; +def tc_37820f4c : InstrItinClass; def tc_3904b926 : InstrItinClass; def tc_3aacf4a8 : InstrItinClass; def tc_3ad719fb : InstrItinClass; def tc_3c56e5ce : InstrItinClass; +def tc_3c8c15d0 : InstrItinClass; def tc_3ce09744 : InstrItinClass; def tc_3e2aaafc : InstrItinClass; def tc_447d9895 : InstrItinClass; def tc_453fe68d : InstrItinClass; def tc_46d6c3e0 : InstrItinClass; +def tc_4942646a : InstrItinClass; def tc_51d0ecc3 : InstrItinClass; def tc_52447ecc : InstrItinClass; def tc_540c3da3 : InstrItinClass; @@ -46,6 +51,7 @@ def tc_56c4f9fe : InstrItinClass; def tc_56e64202 : InstrItinClass; def tc_58d21193 : InstrItinClass; def tc_5bf8afbb : InstrItinClass; +def tc_5cdf8c84 : InstrItinClass; def tc_61bf7c03 : InstrItinClass; def tc_649072c2 : InstrItinClass; def tc_660769f1 : InstrItinClass; @@ -57,6 +63,8 @@ def tc_71646d06 : InstrItinClass; def tc_7177e272 : InstrItinClass; def tc_718b5c53 : InstrItinClass; def tc_7273323b : InstrItinClass; +def tc_72e2b393 : InstrItinClass; +def tc_73efe966 : InstrItinClass; def tc_7417e785 : InstrItinClass; def tc_767c4e9d : InstrItinClass; def tc_7d68d5c2 : InstrItinClass; @@ -71,9 +79,11 @@ def tc_9d1dc972 : InstrItinClass; def tc_9f363d21 : InstrItinClass; def tc_a02a10a8 : InstrItinClass; def tc_a0dbea28 : InstrItinClass; +def tc_a19b9305 : InstrItinClass; def tc_a28f32b5 : InstrItinClass; def tc_a69eeee1 : InstrItinClass; def tc_a7e6707d : InstrItinClass; +def tc_aa047364 : InstrItinClass; def tc_ab23f776 : InstrItinClass; def tc_abe8c3b2 : InstrItinClass; def tc_ac4046bc : InstrItinClass; @@ -89,8 +99,10 @@ def tc_c4edf264 : InstrItinClass; def tc_c5dba46e : InstrItinClass; def tc_c7039829 : InstrItinClass; def tc_cd94bfe0 : InstrItinClass; +def tc_cda936da : InstrItinClass; def tc_d8287c14 : InstrItinClass; def tc_db5555f3 : InstrItinClass; +def tc_dcca380f : InstrItinClass; def 
tc_dd5b0695 : InstrItinClass; def tc_df80eeb0 : InstrItinClass; def tc_e2d2e9e5 : InstrItinClass; @@ -99,6 +111,7 @@ def tc_e3f68a46 : InstrItinClass; def tc_e675c45a : InstrItinClass; def tc_e699ae41 : InstrItinClass; def tc_e99d4c2e : InstrItinClass; +def tc_f175e046 : InstrItinClass; def tc_f1de44ef : InstrItinClass; def tc_f21e8abb : InstrItinClass; @@ -119,6 +132,11 @@ class DepHVXItinV55 { InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5], [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_0afc8be9, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_0b04c6c7, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 2], @@ -174,6 +192,10 @@ class DepHVXItinV55 { InstrStage<1, [CVI_ST]>], [3, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_2120355e, /*SLOT0123*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_257f6f7c, /*SLOT0123,VA*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7], @@ -209,6 +231,11 @@ class DepHVXItinV55 { InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2], [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_37820f4c, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_3904b926, /*SLOT01,LOAD*/ [InstrStage<1, [SLOT0, SLOT1], 0>, InstrStage<1, [CVI_LD]>], [9, 2, 1, 2], @@ -231,6 +258,11 @@ class DepHVXItinV55 { InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2], [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_3c8c15d0, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_3ce09744, /*SLOT0,STORE*/ [InstrStage<1, [SLOT0], 0>, InstrStage<1, [CVI_ST]>], [1, 2], @@ -259,6 +291,11 @@ class DepHVXItinV55 { 
InstrStage<1, [CVI_XLANE]>], [9, 5, 5], [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_4942646a, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_51d0ecc3, /*SLOT0123,VS*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_SHIFT]>], [9, 5], @@ -306,6 +343,11 @@ class DepHVXItinV55 { InstrStage<1, [CVI_XLANE]>], [9, 2], [HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5cdf8c84, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_61bf7c03, /*SLOT23,4SLOT_MPY*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_ALL_NOMEM]>], [9, 5, 2], @@ -363,6 +405,16 @@ class DepHVXItinV55 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7, 7], [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_72e2b393, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_73efe966, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_7417e785, /*SLOT0123,VS*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_SHIFT]>], [9, 5, 2], @@ -437,6 +489,11 @@ class DepHVXItinV55 { InstrStage<1, [CVI_ZW]>], [3, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_a19b9305, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_a28f32b5, /*SLOT1,LOAD,VA*/ [InstrStage<1, [SLOT1], 0>, InstrStage<1, [CVI_LD], 0>, @@ -456,6 +513,10 @@ class DepHVXItinV55 { InstrStage<1, [CVI_XLANE]>], [9, 1, 2], [HVX_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_aa047364, /*SLOT0123*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [9, 7, 7], + [HVX_FWD, 
HVX_FWD, HVX_FWD]>, + InstrItinData <tc_ab23f776, /*SLOT0,STORE*/ [InstrStage<1, [SLOT0], 0>, InstrStage<1, [CVI_ST]>], [1, 2, 5], @@ -537,6 +598,11 @@ class DepHVXItinV55 { InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5, 2], [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_cda936da, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_d8287c14, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5], @@ -547,6 +613,11 @@ class DepHVXItinV55 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7], [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_dcca380f, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_dd5b0695, /*SLOT01,ZW*/ [InstrStage<1, [SLOT0, SLOT1], 0>, InstrStage<1, [CVI_ZW]>], [2, 1, 2], @@ -589,6 +660,11 @@ class DepHVXItinV55 { InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrItinData <tc_f175e046, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_f1de44ef, /*SLOT2,VX_DV*/ [InstrStage<1, [SLOT2], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 2], @@ -620,6 +696,11 @@ class DepHVXItinV60 { InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5], [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_0afc8be9, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_0b04c6c7, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 2], @@ -675,6 +756,10 @@ class DepHVXItinV60 { InstrStage<1, [CVI_ST]>], [3, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_2120355e, /*SLOT0123*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + 
InstrItinData <tc_257f6f7c, /*SLOT0123,VA*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7], @@ -710,6 +795,11 @@ class DepHVXItinV60 { InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2], [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_37820f4c, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_3904b926, /*SLOT01,LOAD*/ [InstrStage<1, [SLOT0, SLOT1], 0>, InstrStage<1, [CVI_LD]>], [9, 2, 1, 2], @@ -732,6 +822,11 @@ class DepHVXItinV60 { InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2], [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_3c8c15d0, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_3ce09744, /*SLOT0,STORE*/ [InstrStage<1, [SLOT0], 0>, InstrStage<1, [CVI_ST]>], [1, 2], @@ -760,6 +855,11 @@ class DepHVXItinV60 { InstrStage<1, [CVI_XLANE]>], [9, 5, 5], [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_4942646a, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_51d0ecc3, /*SLOT0123,VS*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_SHIFT]>], [9, 5], @@ -807,6 +907,11 @@ class DepHVXItinV60 { InstrStage<1, [CVI_XLANE]>], [9, 2], [HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5cdf8c84, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_61bf7c03, /*SLOT23,4SLOT_MPY*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_ALL_NOMEM]>], [9, 5, 2], @@ -864,6 +969,16 @@ class DepHVXItinV60 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7, 7], [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_72e2b393, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 
0>, + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_73efe966, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_7417e785, /*SLOT0123,VS*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_SHIFT]>], [9, 5, 2], @@ -938,6 +1053,11 @@ class DepHVXItinV60 { InstrStage<1, [CVI_ZW]>], [3, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_a19b9305, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_a28f32b5, /*SLOT1,LOAD,VA*/ [InstrStage<1, [SLOT1], 0>, InstrStage<1, [CVI_LD], 0>, @@ -957,6 +1077,10 @@ class DepHVXItinV60 { InstrStage<1, [CVI_XLANE]>], [9, 1, 2], [HVX_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_aa047364, /*SLOT0123*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_ab23f776, /*SLOT0,STORE*/ [InstrStage<1, [SLOT0], 0>, InstrStage<1, [CVI_ST]>], [1, 2, 5], @@ -1038,6 +1162,11 @@ class DepHVXItinV60 { InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5, 2], [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_cda936da, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_d8287c14, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5], @@ -1048,6 +1177,11 @@ class DepHVXItinV60 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7], [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_dcca380f, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_dd5b0695, /*SLOT01,ZW*/ [InstrStage<1, [SLOT0, SLOT1], 0>, InstrStage<1, [CVI_ZW]>], [2, 1, 2], @@ -1090,6 +1224,11 @@ class DepHVXItinV60 { 
InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrItinData <tc_f175e046, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_f1de44ef, /*SLOT2,VX_DV*/ [InstrStage<1, [SLOT2], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 2], @@ -1121,6 +1260,11 @@ class DepHVXItinV62 { InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5], [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_0afc8be9, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_0b04c6c7, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 2], @@ -1176,6 +1320,10 @@ class DepHVXItinV62 { InstrStage<1, [CVI_ST]>], [3, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_2120355e, /*SLOT0123*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_257f6f7c, /*SLOT0123,VA*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7], @@ -1211,6 +1359,11 @@ class DepHVXItinV62 { InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2], [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_37820f4c, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_3904b926, /*SLOT01,LOAD*/ [InstrStage<1, [SLOT0, SLOT1], 0>, InstrStage<1, [CVI_LD]>], [9, 2, 1, 2], @@ -1233,6 +1386,11 @@ class DepHVXItinV62 { InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2], [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_3c8c15d0, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_3ce09744, /*SLOT0,STORE*/ [InstrStage<1, [SLOT0], 0>, InstrStage<1, [CVI_ST]>], [1, 2], @@ -1261,6 
+1419,11 @@ class DepHVXItinV62 { InstrStage<1, [CVI_XLANE]>], [9, 5, 5], [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_4942646a, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_51d0ecc3, /*SLOT0123,VS*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_SHIFT]>], [9, 5], @@ -1308,6 +1471,11 @@ class DepHVXItinV62 { InstrStage<1, [CVI_XLANE]>], [9, 2], [HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5cdf8c84, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_61bf7c03, /*SLOT23,4SLOT_MPY*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_ALL_NOMEM]>], [9, 5, 2], @@ -1365,6 +1533,16 @@ class DepHVXItinV62 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7, 7], [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_72e2b393, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_73efe966, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_7417e785, /*SLOT0123,VS*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_SHIFT]>], [9, 5, 2], @@ -1439,6 +1617,11 @@ class DepHVXItinV62 { InstrStage<1, [CVI_ZW]>], [3, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_a19b9305, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_a28f32b5, /*SLOT1,LOAD,VA*/ [InstrStage<1, [SLOT1], 0>, InstrStage<1, [CVI_LD], 0>, @@ -1458,6 +1641,10 @@ class DepHVXItinV62 { InstrStage<1, [CVI_XLANE]>], [9, 1, 2], [HVX_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_aa047364, /*SLOT0123*/ + [InstrStage<1, [SLOT0, SLOT1, 
SLOT2, SLOT3]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_ab23f776, /*SLOT0,STORE*/ [InstrStage<1, [SLOT0], 0>, InstrStage<1, [CVI_ST]>], [1, 2, 5], @@ -1539,6 +1726,11 @@ class DepHVXItinV62 { InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5, 2], [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_cda936da, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_d8287c14, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5], @@ -1549,6 +1741,11 @@ class DepHVXItinV62 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7], [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_dcca380f, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_dd5b0695, /*SLOT01,ZW*/ [InstrStage<1, [SLOT0, SLOT1], 0>, InstrStage<1, [CVI_ZW]>], [2, 1, 2], @@ -1591,6 +1788,11 @@ class DepHVXItinV62 { InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrItinData <tc_f175e046, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_f1de44ef, /*SLOT2,VX_DV*/ [InstrStage<1, [SLOT2], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 2], @@ -1622,6 +1824,11 @@ class DepHVXItinV65 { InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5], [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_0afc8be9, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_0b04c6c7, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 2], @@ -1677,6 +1884,10 @@ class DepHVXItinV65 { InstrStage<1, [CVI_ST]>], [3, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_2120355e, /*SLOT0123*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, 
SLOT3]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_257f6f7c, /*SLOT0123,VA*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7], @@ -1712,6 +1923,11 @@ class DepHVXItinV65 { InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2], [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_37820f4c, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_3904b926, /*SLOT01,LOAD*/ [InstrStage<1, [SLOT0, SLOT1], 0>, InstrStage<1, [CVI_LD]>], [9, 2, 1, 2], @@ -1734,6 +1950,11 @@ class DepHVXItinV65 { InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2], [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_3c8c15d0, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_3ce09744, /*SLOT0,STORE*/ [InstrStage<1, [SLOT0], 0>, InstrStage<1, [CVI_ST]>], [1, 2], @@ -1762,6 +1983,11 @@ class DepHVXItinV65 { InstrStage<1, [CVI_XLANE]>], [9, 5, 5], [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_4942646a, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_51d0ecc3, /*SLOT0123,VS*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_SHIFT]>], [9, 5], @@ -1809,6 +2035,11 @@ class DepHVXItinV65 { InstrStage<1, [CVI_XLANE]>], [9, 2], [HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5cdf8c84, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_61bf7c03, /*SLOT23,4SLOT_MPY*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_ALL_NOMEM]>], [9, 5, 2], @@ -1866,6 +2097,16 @@ class DepHVXItinV65 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7, 7], [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData 
<tc_72e2b393, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_73efe966, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_7417e785, /*SLOT0123,VS*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_SHIFT]>], [9, 5, 2], @@ -1940,6 +2181,11 @@ class DepHVXItinV65 { InstrStage<1, [CVI_ZW]>], [3, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_a19b9305, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_a28f32b5, /*SLOT1,LOAD,VA*/ [InstrStage<1, [SLOT1], 0>, InstrStage<1, [CVI_LD], 0>, @@ -1959,6 +2205,10 @@ class DepHVXItinV65 { InstrStage<1, [CVI_XLANE]>], [9, 1, 2], [HVX_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_aa047364, /*SLOT0123*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_ab23f776, /*SLOT0,STORE*/ [InstrStage<1, [SLOT0], 0>, InstrStage<1, [CVI_ST]>], [1, 2, 5], @@ -2040,6 +2290,11 @@ class DepHVXItinV65 { InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5, 2], [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_cda936da, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_d8287c14, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5], @@ -2050,6 +2305,11 @@ class DepHVXItinV65 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7], [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_dcca380f, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_dd5b0695, /*SLOT01,ZW*/ [InstrStage<1, [SLOT0, SLOT1], 0>, InstrStage<1, 
[CVI_ZW]>], [2, 1, 2], @@ -2092,6 +2352,11 @@ class DepHVXItinV65 { InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrItinData <tc_f175e046, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_f1de44ef, /*SLOT2,VX_DV*/ [InstrStage<1, [SLOT2], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 2], @@ -2123,6 +2388,11 @@ class DepHVXItinV66 { InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5], [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_0afc8be9, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_0b04c6c7, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 2], @@ -2178,6 +2448,10 @@ class DepHVXItinV66 { InstrStage<1, [CVI_ST]>], [3, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_2120355e, /*SLOT0123*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_257f6f7c, /*SLOT0123,VA*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7], @@ -2213,6 +2487,11 @@ class DepHVXItinV66 { InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2], [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_37820f4c, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_3904b926, /*SLOT01,LOAD*/ [InstrStage<1, [SLOT0, SLOT1], 0>, InstrStage<1, [CVI_LD]>], [9, 2, 1, 2], @@ -2235,6 +2514,11 @@ class DepHVXItinV66 { InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2], [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_3c8c15d0, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_3ce09744, /*SLOT0,STORE*/ 
[InstrStage<1, [SLOT0], 0>, InstrStage<1, [CVI_ST]>], [1, 2], @@ -2263,6 +2547,11 @@ class DepHVXItinV66 { InstrStage<1, [CVI_XLANE]>], [9, 5, 5], [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_4942646a, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_51d0ecc3, /*SLOT0123,VS*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_SHIFT]>], [9, 5], @@ -2310,6 +2599,11 @@ class DepHVXItinV66 { InstrStage<1, [CVI_XLANE]>], [9, 2], [HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5cdf8c84, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_61bf7c03, /*SLOT23,4SLOT_MPY*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_ALL_NOMEM]>], [9, 5, 2], @@ -2367,6 +2661,16 @@ class DepHVXItinV66 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7, 7], [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_72e2b393, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_73efe966, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_7417e785, /*SLOT0123,VS*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_SHIFT]>], [9, 5, 2], @@ -2441,6 +2745,11 @@ class DepHVXItinV66 { InstrStage<1, [CVI_ZW]>], [3, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_a19b9305, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_a28f32b5, /*SLOT1,LOAD,VA*/ [InstrStage<1, [SLOT1], 0>, InstrStage<1, [CVI_LD], 0>, @@ -2460,6 +2769,10 @@ class DepHVXItinV66 { InstrStage<1, [CVI_XLANE]>], [9, 1, 2], [HVX_FWD, Hex_FWD, Hex_FWD]>, + 
InstrItinData <tc_aa047364, /*SLOT0123*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_ab23f776, /*SLOT0,STORE*/ [InstrStage<1, [SLOT0], 0>, InstrStage<1, [CVI_ST]>], [1, 2, 5], @@ -2541,6 +2854,11 @@ class DepHVXItinV66 { InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5, 2], [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_cda936da, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_d8287c14, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5], @@ -2551,6 +2869,11 @@ class DepHVXItinV66 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7], [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_dcca380f, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_dd5b0695, /*SLOT01,ZW*/ [InstrStage<1, [SLOT0, SLOT1], 0>, InstrStage<1, [CVI_ZW]>], [2, 1, 2], @@ -2593,6 +2916,11 @@ class DepHVXItinV66 { InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrItinData <tc_f175e046, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_f1de44ef, /*SLOT2,VX_DV*/ [InstrStage<1, [SLOT2], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 2], @@ -2624,6 +2952,11 @@ class DepHVXItinV67 { InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5], [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_0afc8be9, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_0b04c6c7, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 2], @@ -2679,6 +3012,10 @@ class DepHVXItinV67 { InstrStage<1, [CVI_ST]>], [3, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, + 
InstrItinData <tc_2120355e, /*SLOT0123*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_257f6f7c, /*SLOT0123,VA*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7], @@ -2714,6 +3051,11 @@ class DepHVXItinV67 { InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2], [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_37820f4c, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_3904b926, /*SLOT01,LOAD*/ [InstrStage<1, [SLOT0, SLOT1], 0>, InstrStage<1, [CVI_LD]>], [9, 2, 1, 2], @@ -2736,6 +3078,11 @@ class DepHVXItinV67 { InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2], [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_3c8c15d0, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_3ce09744, /*SLOT0,STORE*/ [InstrStage<1, [SLOT0], 0>, InstrStage<1, [CVI_ST]>], [1, 2], @@ -2764,6 +3111,11 @@ class DepHVXItinV67 { InstrStage<1, [CVI_XLANE]>], [9, 5, 5], [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_4942646a, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_51d0ecc3, /*SLOT0123,VS*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_SHIFT]>], [9, 5], @@ -2811,6 +3163,11 @@ class DepHVXItinV67 { InstrStage<1, [CVI_XLANE]>], [9, 2], [HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5cdf8c84, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_61bf7c03, /*SLOT23,4SLOT_MPY*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_ALL_NOMEM]>], [9, 5, 2], @@ -2868,6 +3225,16 @@ class DepHVXItinV67 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], 
[1, 2, 7, 7], [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_72e2b393, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_73efe966, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_7417e785, /*SLOT0123,VS*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_SHIFT]>], [9, 5, 2], @@ -2942,6 +3309,11 @@ class DepHVXItinV67 { InstrStage<1, [CVI_ZW]>], [3, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_a19b9305, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_a28f32b5, /*SLOT1,LOAD,VA*/ [InstrStage<1, [SLOT1], 0>, InstrStage<1, [CVI_LD], 0>, @@ -2961,6 +3333,10 @@ class DepHVXItinV67 { InstrStage<1, [CVI_XLANE]>], [9, 1, 2], [HVX_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_aa047364, /*SLOT0123*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_ab23f776, /*SLOT0,STORE*/ [InstrStage<1, [SLOT0], 0>, InstrStage<1, [CVI_ST]>], [1, 2, 5], @@ -3042,6 +3418,11 @@ class DepHVXItinV67 { InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5, 2], [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_cda936da, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_d8287c14, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5], @@ -3052,6 +3433,11 @@ class DepHVXItinV67 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7], [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_dcca380f, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_dd5b0695, 
/*SLOT01,ZW*/ [InstrStage<1, [SLOT0, SLOT1], 0>, InstrStage<1, [CVI_ZW]>], [2, 1, 2], @@ -3094,6 +3480,11 @@ class DepHVXItinV67 { InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrItinData <tc_f175e046, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_f1de44ef, /*SLOT2,VX_DV*/ [InstrStage<1, [SLOT2], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 2], @@ -3125,6 +3516,575 @@ class DepHVXItinV68 { InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5], [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_0afc8be9, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_0b04c6c7, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_0ec46cf9, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_131f1c81, /*SLOT0,NOSLOT1,STORE,VP*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [SLOT1], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_XLANE]>], [2, 1, 2, 5], + [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_1381a97c, /*SLOT0123,4SLOT*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_ALL]>], [], + []>, + + InstrItinData <tc_15fdf750, /*SLOT23,VS_VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_16ff9ef8, /*SLOT0123,VS*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_191381c1, /*SLOT0,STORE,VA*/ + [InstrStage<1, 
[SLOT0], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 7, 1, 2, 7], + [Hex_FWD, HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_1ad8a370, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 2, 2], + [HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_1ba8a0cd, /*SLOT01,LOAD,VA*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_20a4bbec, /*SLOT0,STORE*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST]>], [3, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_2120355e, /*SLOT0123*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_257f6f7c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_26a377fe, /*SLOT23,4SLOT_MPY*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], + [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_2c745bb8, /*SLOT0123,VP_VS*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_XLSHF]>], [9, 7, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_2d4051cd, /*SLOT23,4SLOT_MPY*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 7, 5, 2], + [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_2e8f5f6e, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_309dbb4f, 
/*SLOT0123,VS*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_37820f4c, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_3904b926, /*SLOT01,LOAD*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD]>], [9, 2, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_3aacf4a8, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 7], + [HVX_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_3ad719fb, /*SLOT01,ZW*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_ZW]>], [3, 2, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_3c56e5ce, /*SLOT0,NOSLOT1,LOAD,VP*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_3c8c15d0, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_3ce09744, /*SLOT0,STORE*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST]>], [1, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_3e2aaafc, /*SLOT0,STORE,VA*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 1, 2, 7], + [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_447d9895, /*SLOT0,STORE,VA*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7], + [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_453fe68d, /*SLOT01,LOAD,VA*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, 
CVI_SHIFT, CVI_XLANE]>], [9, 3, 2, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_46d6c3e0, /*SLOT0123,VP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_XLANE]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_4942646a, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_51d0ecc3, /*SLOT0123,VS*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_52447ecc, /*SLOT01,LOAD*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD]>], [9, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_540c3da3, /*SLOT0,VA*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [4, 7, 1], + [Hex_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_54a0dc47, /*SLOT0,STORE,VA*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 2, 1, 2, 7], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_561aaa58, /*SLOT0123,VP_VS*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_XLSHF]>], [9, 9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_56c4f9fe, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_56e64202, /*SLOT0123,VP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_XLANE]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_58d21193, /*SLOT0,STORE,VA_DV*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], + [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, 
HVX_FWD]>, + + InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_XLANE]>], [9, 2], + [HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_5cdf8c84, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_61bf7c03, /*SLOT23,4SLOT_MPY*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_ALL_NOMEM]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_649072c2, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_660769f1, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_663c80a7, /*SLOT01,LOAD*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD]>], [9, 3, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_6942b6e0, /*SLOT0,STORE*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST]>], [3, 1, 2, 5], + [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_6e7fa133, /*SLOT0123,VP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_XLANE]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_7095ecba, /*SLOT01,LOAD,VA_DV*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7], + [Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_71646d06, /*SLOT0123,VA_DV*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_7177e272, /*SLOT0,STORE*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST]>], [2, 1, 2, 5], + [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_718b5c53, /*SLOT0123,VA_DV*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 
0>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9], + [HVX_FWD]>, + + InstrItinData <tc_7273323b, /*SLOT0,STORE,VA_DV*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7, 7], + [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_72e2b393, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_73efe966, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_7417e785, /*SLOT0123,VS*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_767c4e9d, /*SLOT0123,4SLOT*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_ALL]>], [3, 2], + [HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_7d68d5c2, /*SLOT01,LOAD,VA*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7], + [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_7e6a3e89, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_8772086c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_87adc037, /*SLOT0123,VP_VS*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_XLSHF]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_8e420e4d, /*SLOT0,STORE,VA*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7, 7], + [HVX_FWD, 
Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_90bcc1db, /*SLOT2,VX_DV*/ + [InstrStage<1, [SLOT2], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_933f2b39, /*SLOT23,4SLOT_MPY*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_ALL_NOMEM]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_946013d8, /*SLOT0123,VP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_9d1dc972, /*SLOT0123,VP_VS*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_9f363d21, /*SLOT0,STORE,VA*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7, 7], + [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_a02a10a8, /*SLOT0,STORE,VA*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [2, 1, 2, 7], + [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_a0dbea28, /*SLOT01,ZW*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_ZW]>], [3, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_a19b9305, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_a28f32b5, /*SLOT01,LOAD,VA*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7], + [Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_a69eeee1, /*SLOT01,LOAD,VA_DV*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7], + [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData 
<tc_a7e6707d, /*SLOT0,NOSLOT1,LOAD,VP*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_XLANE]>], [9, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_aa047364, /*SLOT0123*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_ab23f776, /*SLOT0,STORE*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST]>], [1, 2, 5], + [Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_abe8c3b2, /*SLOT01,LOAD,VA*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_ac4046bc, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_af25efd9, /*SLOT0123,VA_DV*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 2, 7, 7], + [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_b091f1c6, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_b28e51aa, /*SLOT0123,4SLOT*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_ALL]>], [2], + [Hex_FWD]>, + + InstrItinData <tc_b4416217, /*SLOT0123,VA_DV*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_b9db8205, /*SLOT01,LOAD*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD]>], [9, 3, 2, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_bb599486, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_c0749f3c, 
/*SLOT01,LOAD,VA*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_c127de3a, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_c4edf264, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 2], + [HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_c5dba46e, /*SLOT0,STORE,VA*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7], + [Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_c7039829, /*SLOT0,NOSLOT1,STORE,VP*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [SLOT1], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_XLANE]>], [3, 2, 1, 2, 5], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_cd94bfe0, /*SLOT23,VS_VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_cda936da, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_d8287c14, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_db5555f3, /*SLOT0123,VA_DV*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_dcca380f, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_dd5b0695, /*SLOT01,ZW*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_ZW]>], [2, 1, 2], + [Hex_FWD, 
Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_df80eeb0, /*SLOT0123,VP_VS*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_e2d2e9e5, /*SLOT0,NOSLOT1,STORE,VP*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [SLOT1], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_XLANE]>], [3, 1, 2, 5], + [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_e35c1e93, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_e3f68a46, /*SLOT0123,4SLOT*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_ALL]>], [3], + [HVX_FWD]>, + + InstrItinData <tc_e675c45a, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_e699ae41, /*SLOT01,ZW*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_ZW]>], [1, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_e99d4c2e, /*SLOT0,STORE*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_f175e046, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_f1de44ef, /*SLOT2,VX_DV*/ + [InstrStage<1, [SLOT2], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_f21e8abb, /*SLOT0,NOSLOT1,STORE,VP*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [SLOT1], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_XLANE]>], [1, 2, 5], + [Hex_FWD, Hex_FWD, HVX_FWD]> + ]; +} + +class DepHVXItinV69 { + list<InstrItinData> DepHVXItinV69_list = [ + InstrItinData <tc_04da405a, /*SLOT0123,VP_VS*/ + 
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_XLSHF]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_05ca8cfd, /*SLOT0123,VS*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_08a4f1b6, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_0afc8be9, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_0b04c6c7, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 2], @@ -3180,6 +4140,10 @@ class DepHVXItinV68 { InstrStage<1, [CVI_ST]>], [3, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_2120355e, /*SLOT0123*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_257f6f7c, /*SLOT0123,VA*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7], @@ -3215,6 +4179,11 @@ class DepHVXItinV68 { InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2], [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_37820f4c, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_3904b926, /*SLOT01,LOAD*/ [InstrStage<1, [SLOT0, SLOT1], 0>, InstrStage<1, [CVI_LD]>], [9, 2, 1, 2], @@ -3237,6 +4206,11 @@ class DepHVXItinV68 { InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2], [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_3c8c15d0, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_3ce09744, /*SLOT0,STORE*/ [InstrStage<1, [SLOT0], 0>, InstrStage<1, [CVI_ST]>], [1, 2], @@ -3265,6 +4239,11 @@ class DepHVXItinV68 { InstrStage<1, 
[CVI_XLANE]>], [9, 5, 5], [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_4942646a, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_51d0ecc3, /*SLOT0123,VS*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_SHIFT]>], [9, 5], @@ -3312,6 +4291,11 @@ class DepHVXItinV68 { InstrStage<1, [CVI_XLANE]>], [9, 2], [HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5cdf8c84, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_61bf7c03, /*SLOT23,4SLOT_MPY*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_ALL_NOMEM]>], [9, 5, 2], @@ -3369,6 +4353,16 @@ class DepHVXItinV68 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7, 7], [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_72e2b393, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_73efe966, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_7417e785, /*SLOT0123,VS*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_SHIFT]>], [9, 5, 2], @@ -3443,6 +4437,11 @@ class DepHVXItinV68 { InstrStage<1, [CVI_ZW]>], [3, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_a19b9305, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_a28f32b5, /*SLOT01,LOAD,VA*/ [InstrStage<1, [SLOT0, SLOT1], 0>, InstrStage<1, [CVI_LD], 0>, @@ -3462,6 +4461,10 @@ class DepHVXItinV68 { InstrStage<1, [CVI_XLANE]>], [9, 1, 2], [HVX_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_aa047364, /*SLOT0123*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [9, 7, 7], + 
[HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_ab23f776, /*SLOT0,STORE*/ [InstrStage<1, [SLOT0], 0>, InstrStage<1, [CVI_ST]>], [1, 2, 5], @@ -3543,6 +4546,11 @@ class DepHVXItinV68 { InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5, 2], [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_cda936da, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_d8287c14, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5], @@ -3553,6 +4561,11 @@ class DepHVXItinV68 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7], [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_dcca380f, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_dd5b0695, /*SLOT01,ZW*/ [InstrStage<1, [SLOT0, SLOT1], 0>, InstrStage<1, [CVI_ZW]>], [2, 1, 2], @@ -3595,6 +4608,11 @@ class DepHVXItinV68 { InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrItinData <tc_f175e046, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_f1de44ef, /*SLOT2,VX_DV*/ [InstrStage<1, [SLOT2], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 2], diff --git a/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td b/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td index a3766652794b..a979bafe8e33 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td +++ b/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td @@ -7338,3 +7338,771 @@ class DepScalarItinV68 { [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]> ]; } + +class DepScalarItinV69 { + list<InstrItinData> DepScalarItinV69_list = [ + InstrItinData <tc_011e0e9d, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [2, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_01d44cb2, /*tc_2*/ + [InstrStage<1, [SLOT2, 
SLOT3]>], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_01e1be3b, /*tc_3x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_02fe1c65, /*tc_4x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_0655b949, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [2, 3], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_075c8dd8, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_0a195f2c, /*tc_4x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_0a6c20ae, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [2, 1, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_0ba0d5da, /*tc_3stall*/ + [InstrStage<1, [SLOT2]>], [1], + [Hex_FWD]>, + + InstrItinData <tc_0dfac0a7, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_0fac1eb8, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [3, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_112d30d6, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2], + [Hex_FWD]>, + + InstrItinData <tc_1242dc2a, /*tc_ld*/ + [InstrStage<1, [SLOT0]>], [2], + [Hex_FWD]>, + + InstrItinData <tc_1248597c, /*tc_3x*/ + [InstrStage<1, [SLOT3]>], [2, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_14ab4f41, /*tc_newvjump*/ + [InstrStage<1, [SLOT0]>], [3, 3, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_151bf368, /*tc_1*/ + [InstrStage<1, [SLOT2, SLOT3]>], [3, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_158aa3f7, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_197dce51, /*tc_3x*/ + [InstrStage<1, [SLOT3]>], [4, 2, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_1981450d, /*tc_newvjump*/ + [InstrStage<1, [SLOT0]>], [3], + [Hex_FWD]>, + 
+ InstrItinData <tc_1c2c7a4a, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_1c7522a8, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_1d41f8b7, /*tc_1*/ + [InstrStage<1, [SLOT2, SLOT3]>], [3, 4, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_1fcb8495, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_1fe4ab69, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_20131976, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_2237d952, /*tc_ld*/ + [InstrStage<1, [SLOT0]>], [1, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_23708a21, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [], + []>, + + InstrItinData <tc_2471c1c8, /*tc_ld*/ + [InstrStage<1, [SLOT0]>], [4, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_24e109c7, /*tc_newvjump*/ + [InstrStage<1, [SLOT0]>], [3, 3, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_24f426ab, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_280f7fe1, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_28e55c6f, /*tc_3x*/ + [InstrStage<1, [SLOT3]>], [1, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_2c13e7f5, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_2c3e17fc, /*tc_3x*/ + [InstrStage<1, [SLOT3]>], [1], + [Hex_FWD]>, + + InstrItinData <tc_2f573607, /*tc_1*/ + [InstrStage<1, [SLOT2]>], [2, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_362b0be2, /*tc_3*/ + [InstrStage<1, [SLOT2]>], [1], + 
[Hex_FWD]>, + + InstrItinData <tc_38382228, /*tc_3x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_388f9897, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_38e0bae9, /*tc_3x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_3d14a17b, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1]>], [3, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_3edca78f, /*tc_2*/ + [InstrStage<1, [SLOT3]>], [4, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_3fbf1042, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1]>], [3], + [Hex_FWD]>, + + InstrItinData <tc_407e96f9, /*tc_1*/ + [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_40d64c94, /*tc_newvjump*/ + [InstrStage<1, [SLOT0]>], [3, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_4222e6bf, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_42ff66ba, /*tc_1*/ + [InstrStage<1, [SLOT2]>], [2, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_442395f3, /*tc_2latepred*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_449acf79, /*tc_latepredstaia*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 1], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_44d5a428, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [1, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_44fffc58, /*tc_3*/ + [InstrStage<1, [SLOT2, SLOT3]>], [2], + [Hex_FWD]>, + + InstrItinData <tc_45791fb8, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_45f9d1be, /*tc_2early*/ + [InstrStage<1, [SLOT2]>], [2], + [Hex_FWD]>, + + InstrItinData <tc_49fdfd4b, /*tc_3stall*/ + [InstrStage<1, [SLOT3]>], [4, 1], + [Hex_FWD, 
Hex_FWD]>, + + InstrItinData <tc_4a55d03c, /*tc_1*/ + [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_4abdbdc6, /*tc_3x*/ + [InstrStage<1, [SLOT3]>], [2, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_4ac61d92, /*tc_2latepred*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_4bf903b0, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [3], + [Hex_FWD]>, + + InstrItinData <tc_503ce0f3, /*tc_3x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_53c851ab, /*tc_3stall*/ + [InstrStage<1, [SLOT2]>], [4, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_5502c366, /*tc_1*/ + [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_55255f2b, /*tc_3stall*/ + [InstrStage<1, [SLOT3]>], [], + []>, + + InstrItinData <tc_556f6577, /*tc_3x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_55a9a350, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [1, 2, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_55b33fda, /*tc_1*/ + [InstrStage<1, [SLOT2, SLOT3]>], [3, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_56a124a7, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_57a55b54, /*tc_1*/ + [InstrStage<1, [SLOT3]>], [2, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_5944960d, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_59a7822c, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1]>], [2, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_5a4b5e58, /*tc_3x*/ + [InstrStage<1, [SLOT3]>], [4, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_5b347363, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_5ceb2f9e, /*tc_ld*/ + 
[InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_5da50c4b, /*tc_1*/ + [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_5deb5e47, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_5e4cf0e8, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_5f2afaf7, /*tc_latepredldaia*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 4, 3, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_60e324ff, /*tc_1*/ + [InstrStage<1, [SLOT2]>], [2], + [Hex_FWD]>, + + InstrItinData <tc_63567288, /*tc_2latepred*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4], + [Hex_FWD]>, + + InstrItinData <tc_64b00d8a, /*tc_ld*/ + [InstrStage<1, [SLOT0]>], [4, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_651cbe02, /*tc_1*/ + [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_65279839, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_65cbd974, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_69bfb303, /*tc_3*/ + [InstrStage<1, [SLOT2, SLOT3]>], [2, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_6ae3426b, /*tc_3x*/ + [InstrStage<1, [SLOT3]>], [4, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_6d861a95, /*tc_3x*/ + [InstrStage<1, [SLOT3]>], [2, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_6e20402a, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [2, 3], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_6f42bc60, /*tc_3stall*/ + [InstrStage<1, [SLOT0]>], [4, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_6fc5dbea, /*tc_1*/ + [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_711c805f, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, 
SLOT2, SLOT3]>], [2, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_713b66bf, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_7401744f, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_7476d766, /*tc_3stall*/ + [InstrStage<1, [SLOT3]>], [4, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_74a42bda, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_76bb5435, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_77f94a5e, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [], + []>, + + InstrItinData <tc_788b1d09, /*tc_3x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_7af3a37e, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [1, 3], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_7b9187d3, /*tc_newvjump*/ + [InstrStage<1, [SLOT0]>], [3, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_7c31e19a, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_7c6d32e4, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_7f7f45f5, /*tc_4x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_7f8ae742, /*tc_3x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_8035e91f, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_822c3c68, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_829d8a86, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [3, 1, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + 
InstrItinData <tc_838c4d7a, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_84a7500d, /*tc_2*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_86173609, /*tc_2latepred*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_887d1bb7, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_8a6d0d94, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_8a825db2, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_8b5bd4f5, /*tc_2*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_8e82e8ca, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_9124c04f, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_92240447, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [3, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_934753bb, /*tc_ld*/ + [InstrStage<1, [SLOT0]>], [3, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_937dd41c, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [], + []>, + + InstrItinData <tc_9406230a, /*tc_3x*/ + [InstrStage<1, [SLOT3]>], [2, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_95a33176, /*tc_2*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_96ef76ef, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [1, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_975a4e54, /*tc_newvjump*/ + [InstrStage<1, [SLOT0]>], [3, 3, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_9783714b, /*tc_4x*/ + 
[InstrStage<1, [SLOT2, SLOT3]>], [5, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_9b34f5e0, /*tc_3stall*/ + [InstrStage<1, [SLOT2]>], [], + []>, + + InstrItinData <tc_9b3c0462, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_9bcfb2ee, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_9c52f549, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_9e27f2f9, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_9e72dc89, /*tc_4x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_9edb7c77, /*tc_4x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_9edefe01, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_9f6cd987, /*tc_1*/ + [InstrStage<1, [SLOT2, SLOT3]>], [3, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_a08b630b, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_a1297125, /*tc_1*/ + [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_a154b476, /*tc_3x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_a2b365d2, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_a3070909, /*tc_3stall*/ + [InstrStage<1, [SLOT0]>], [1, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_a32e03e7, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_a38c45dc, /*tc_3x*/ + [InstrStage<1, [SLOT2, 
SLOT3]>], [4, 2, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_a4e22bbd, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_a4ee89db, /*tc_2early*/ + [InstrStage<1, [SLOT0]>], [], + []>, + + InstrItinData <tc_a7a13fac, /*tc_1*/ + [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_a7bdb22c, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_a9edeffa, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_abfd9a6d, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_ac65613f, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_addc37a8, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [3, 1, 2, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_ae5babd7, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_aee6250c, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_b1ae5f67, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [1], + [Hex_FWD]>, + + InstrItinData <tc_b4dc7630, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_b7c4062a, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_b837298f, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [], + []>, + + InstrItinData <tc_ba9255a6, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [2, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_bb07f2c5, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + 
+ InstrItinData <tc_bb831a7c, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_bf2ffc0f, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_c20701f0, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_c21d7447, /*tc_3x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_c57d9f39, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_c818ff7f, /*tc_newvjump*/ + [InstrStage<1, [SLOT0]>], [], + []>, + + InstrItinData <tc_ce59038e, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [3, 2, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_cfa0e29b, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [2, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_d03278fd, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_d33e5eee, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_d3632d88, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_d45ba9cd, /*tc_ld*/ + [InstrStage<1, [SLOT0]>], [1], + [Hex_FWD]>, + + InstrItinData <tc_d57d649c, /*tc_3stall*/ + [InstrStage<1, [SLOT2]>], [2], + [Hex_FWD]>, + + InstrItinData <tc_d61dfdc3, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_d68dca5c, /*tc_3stall*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_d7718fbe, /*tc_3x*/ + [InstrStage<1, [SLOT3]>], [1], + [Hex_FWD]>, + + InstrItinData <tc_db596beb, /*tc_3x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_db96aa6b, 
/*tc_st*/ + [InstrStage<1, [SLOT0]>], [1], + [Hex_FWD]>, + + InstrItinData <tc_dc51281d, /*tc_3*/ + [InstrStage<1, [SLOT2]>], [2, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_decdde8a, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2], + [Hex_FWD]>, + + InstrItinData <tc_df5d53f9, /*tc_newvjump*/ + [InstrStage<1, [SLOT0]>], [3, 2, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_e3d699e3, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_e9170fb7, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_ed03645c, /*tc_1*/ + [InstrStage<1, [SLOT2]>], [3, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_eed07714, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_eeda4109, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_ef921005, /*tc_1*/ + [InstrStage<1, [SLOT2, SLOT3]>], [3, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_f098b237, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_f0cdeccf, /*tc_3x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_f0e8e832, /*tc_4x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_f34c1c21, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_f38f92e1, /*tc_newvjump*/ + [InstrStage<1, [SLOT0]>], [2], + [Hex_FWD]>, + + InstrItinData <tc_f529831b, /*tc_latepredstaia*/ + [InstrStage<1, [SLOT0]>], [4, 3, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_f6e2aff9, /*tc_newvjump*/ + [InstrStage<1, [SLOT0]>], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_f7569068, /*tc_4x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1, 1], + 
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_f999c66e, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_fae9dfa5, /*tc_3x*/ + [InstrStage<1, [SLOT3]>], [4, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_fedb7e19, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]> + ]; +} diff --git a/llvm/lib/Target/Hexagon/HexagonDepInstrFormats.td b/llvm/lib/Target/Hexagon/HexagonDepInstrFormats.td index b3f1b6638193..65d36924ba48 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepInstrFormats.td +++ b/llvm/lib/Target/Hexagon/HexagonDepInstrFormats.td @@ -2288,6 +2288,12 @@ class Enc_a30110 : OpcodeHexagon { bits <5> Vd32; let Inst{4-0} = Vd32{4-0}; } +class Enc_a33d04 : OpcodeHexagon { + bits <5> Vuu32; + let Inst{12-8} = Vuu32{4-0}; + bits <5> Vd32; + let Inst{4-0} = Vd32{4-0}; +} class Enc_a42857 : OpcodeHexagon { bits <11> Ii; let Inst{21-20} = Ii{10-9}; @@ -3109,6 +3115,14 @@ class Enc_de0214 : OpcodeHexagon { bits <5> Rd32; let Inst{4-0} = Rd32{4-0}; } +class Enc_de5ea0 : OpcodeHexagon { + bits <5> Vuu32; + let Inst{12-8} = Vuu32{4-0}; + bits <5> Vv32; + let Inst{20-16} = Vv32{4-0}; + bits <5> Vd32; + let Inst{4-0} = Vd32{4-0}; +} class Enc_e07374 : OpcodeHexagon { bits <5> Rs32; let Inst{20-16} = Rs32{4-0}; diff --git a/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td b/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td index 4f00409c336c..c02988266584 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td +++ b/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td @@ -5824,8 +5824,8 @@ let Inst{31-21} = 0b01010100100; let hasNewValue = 1; let opNewValue = 0; let isSolo = 1; -let Uses = [GOSP]; -let Defs = [GOSP, PC]; +let Uses = [CCR, GOSP]; +let Defs = [CCR, GOSP, PC]; let hasSideEffects = 1; let Constraints = "$Rx32 = $Rx32in"; } @@ -8500,6 +8500,8 @@ let Inst{31-21} = 0b01010010101; let isTerminator = 1; let isIndirectBranch = 1; let isBranch = 1; +let 
cofRelax1 = 1; +let cofRelax2 = 1; let cofMax1 = 1; } def J4_jumpseti : HInst< @@ -18210,16 +18212,6 @@ let opExtentBits = 18; let opExtentAlign = 2; let opNewValue = 1; } -def PS_trap1 : HInst< -(outs), -(ins u8_0Imm:$Ii), -"trap1(#$Ii)", -tc_53c851ab, TypeJ>, Enc_a51a9a, Requires<[HasPreV65]> { -let Inst{1-0} = 0b00; -let Inst{7-5} = 0b000; -let Inst{13-13} = 0b0; -let Inst{31-16} = 0b0101010010000000; -} def R6_release_at_vi : HInst< (outs), (ins IntRegs:$Rs32), @@ -18964,7 +18956,7 @@ def S2_cabacdecbin : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = decbin($Rss32,$Rtt32)", -tc_db596beb, TypeS_3op>, Enc_a56825 { +tc_db596beb, TypeS_3op>, Enc_a56825, Requires<[UseCabac]> { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000001110; @@ -26883,17 +26875,6 @@ let isPseudo = 1; let isCodeGenOnly = 1; let DecoderNamespace = "EXT_mmvec"; } -def V6_ldntnt0 : HInst< -(outs HvxVR:$Vd32), -(ins IntRegs:$Rt32), -"$Vd32 = vmem($Rt32):nt", -PSEUDO, TypeMAPPING>, Requires<[HasV62]> { -let hasNewValue = 1; -let opNewValue = 0; -let isPseudo = 1; -let isCodeGenOnly = 1; -let DecoderNamespace = "EXT_mmvec"; -} def V6_ldp0 : HInst< (outs HvxVR:$Vd32), (ins PredRegs:$Pv4, IntRegs:$Rt32), @@ -27312,6 +27293,30 @@ let isPseudo = 1; let isCodeGenOnly = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_v10mpyubs10 : HInst< +(outs HvxWR:$Vdd32), +(ins HvxWR:$Vuu32, HvxWR:$Vvv32, u1_0Imm:$Ii), +"$Vdd32.w = v10mpy($Vuu32.ub,$Vvv32.b,#$Ii)", +tc_f175e046, TypeCVI_VX>, Requires<[UseHVXV69]> { +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let isPseudo = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_v10mpyubs10_vxx : HInst< +(outs HvxWR:$Vxx32), +(ins HvxWR:$Vxx32in, HvxWR:$Vuu32, HvxWR:$Vvv32, u1_0Imm:$Ii), +"$Vxx32.w += v10mpy($Vuu32.ub,$Vvv32.b,#$Ii)", +tc_4942646a, TypeCVI_VX>, Requires<[UseHVXV69]> { +let hasNewValue = 1; +let opNewValue = 0; +let isAccumulator = 1; +let isCVI = 1; +let isPseudo = 1; +let 
DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Vxx32 = $Vxx32in"; +} def V6_v6mpyhubs10 : HInst< (outs HvxWR:$Vdd32), (ins HvxWR:$Vuu32, HvxWR:$Vvv32, u2_0Imm:$Ii), @@ -27396,7 +27401,7 @@ def V6_vL32Ub_ai : HInst< (outs HvxVR:$Vd32), (ins IntRegs:$Rt32, s4_0Imm:$Ii), "$Vd32 = vmemu($Rt32+#$Ii)", -tc_a7e6707d, TypeCVI_VM_VP_LDU>, Enc_f3f408, Requires<[UseHVXV60]> { +tc_a7e6707d, TypeCVI_VM_VP_LDU>, Enc_f3f408, Requires<[UseHVXV60]>, PostInc_BaseImm { let Inst{7-5} = 0b111; let Inst{12-11} = 0b00; let Inst{31-21} = 0b00101000000; @@ -27408,13 +27413,15 @@ let isCVLoad = 1; let isCVI = 1; let mayLoad = 1; let isRestrictNoSlot1Store = 1; +let BaseOpcode = "V6_vL32Ub_ai"; +let CextOpcode = "V6_vL32Ub"; let DecoderNamespace = "EXT_mmvec"; } def V6_vL32Ub_pi : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii), "$Vd32 = vmemu($Rx32++#$Ii)", -tc_3c56e5ce, TypeCVI_VM_VP_LDU>, Enc_a255dc, Requires<[UseHVXV60]> { +tc_3c56e5ce, TypeCVI_VM_VP_LDU>, Enc_a255dc, Requires<[UseHVXV60]>, PostInc_BaseImm { let Inst{7-5} = 0b111; let Inst{13-11} = 0b000; let Inst{31-21} = 0b00101001000; @@ -27427,6 +27434,7 @@ let isCVI = 1; let mayLoad = 1; let isRestrictNoSlot1Store = 1; let BaseOpcode = "V6_vL32b_pi"; +let CextOpcode = "V6_vL32Ub"; let DecoderNamespace = "EXT_mmvec"; let Constraints = "$Rx32 = $Rx32in"; } @@ -27452,7 +27460,7 @@ def V6_vL32b_ai : HInst< (outs HvxVR:$Vd32), (ins IntRegs:$Rt32, s4_0Imm:$Ii), "$Vd32 = vmem($Rt32+#$Ii)", -tc_c0749f3c, TypeCVI_VM_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel { +tc_c0749f3c, TypeCVI_VM_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel, PostInc_BaseImm { let Inst{7-5} = 0b000; let Inst{12-11} = 0b00; let Inst{31-21} = 0b00101000000; @@ -27465,6 +27473,7 @@ let isCVI = 1; let mayLoad = 1; let isRestrictNoSlot1Store = 1; let BaseOpcode = "V6_vL32b_ai"; +let CextOpcode = "V6_vL32b"; let isCVLoadable = 1; let isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; @@ -27473,7 +27482,7 @@ def V6_vL32b_cur_ai : 
HInst< (outs HvxVR:$Vd32), (ins IntRegs:$Rt32, s4_0Imm:$Ii), "$Vd32.cur = vmem($Rt32+#$Ii)", -tc_c0749f3c, TypeCVI_VM_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel { +tc_c0749f3c, TypeCVI_VM_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel, PostInc_BaseImm { let Inst{7-5} = 0b001; let Inst{12-11} = 0b00; let Inst{31-21} = 0b00101000000; @@ -27487,6 +27496,7 @@ let CVINew = 1; let mayLoad = 1; let isRestrictNoSlot1Store = 1; let BaseOpcode = "V6_vL32b_cur_ai"; +let CextOpcode = "V6_vL32b_cur"; let isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; } @@ -27560,7 +27570,7 @@ def V6_vL32b_cur_pi : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii), "$Vd32.cur = vmem($Rx32++#$Ii)", -tc_1ba8a0cd, TypeCVI_VM_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel { +tc_1ba8a0cd, TypeCVI_VM_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel, PostInc_BaseImm { let Inst{7-5} = 0b001; let Inst{13-11} = 0b000; let Inst{31-21} = 0b00101001000; @@ -27574,6 +27584,7 @@ let CVINew = 1; let mayLoad = 1; let isRestrictNoSlot1Store = 1; let BaseOpcode = "V6_vL32b_cur_pi"; +let CextOpcode = "V6_vL32b_cur"; let isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; let Constraints = "$Rx32 = $Rx32in"; @@ -27729,7 +27740,7 @@ def V6_vL32b_nt_ai : HInst< (outs HvxVR:$Vd32), (ins IntRegs:$Rt32, s4_0Imm:$Ii), "$Vd32 = vmem($Rt32+#$Ii):nt", -tc_c0749f3c, TypeCVI_VM_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel { +tc_c0749f3c, TypeCVI_VM_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel, PostInc_BaseImm { let Inst{7-5} = 0b000; let Inst{12-11} = 0b00; let Inst{31-21} = 0b00101000010; @@ -27743,6 +27754,7 @@ let mayLoad = 1; let isNonTemporal = 1; let isRestrictNoSlot1Store = 1; let BaseOpcode = "V6_vL32b_nt_ai"; +let CextOpcode = "V6_vL32b_nt"; let isCVLoadable = 1; let isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; @@ -27751,7 +27763,7 @@ def V6_vL32b_nt_cur_ai : HInst< (outs HvxVR:$Vd32), (ins IntRegs:$Rt32, s4_0Imm:$Ii), "$Vd32.cur = vmem($Rt32+#$Ii):nt", 
-tc_c0749f3c, TypeCVI_VM_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel { +tc_c0749f3c, TypeCVI_VM_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel, PostInc_BaseImm { let Inst{7-5} = 0b001; let Inst{12-11} = 0b00; let Inst{31-21} = 0b00101000010; @@ -27766,6 +27778,7 @@ let mayLoad = 1; let isNonTemporal = 1; let isRestrictNoSlot1Store = 1; let BaseOpcode = "V6_vL32b_nt_cur_ai"; +let CextOpcode = "V6_vL32b_nt_cur"; let isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; } @@ -27842,7 +27855,7 @@ def V6_vL32b_nt_cur_pi : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii), "$Vd32.cur = vmem($Rx32++#$Ii):nt", -tc_1ba8a0cd, TypeCVI_VM_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel { +tc_1ba8a0cd, TypeCVI_VM_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel, PostInc_BaseImm { let Inst{7-5} = 0b001; let Inst{13-11} = 0b000; let Inst{31-21} = 0b00101001010; @@ -27857,6 +27870,7 @@ let mayLoad = 1; let isNonTemporal = 1; let isRestrictNoSlot1Store = 1; let BaseOpcode = "V6_vL32b_nt_cur_pi"; +let CextOpcode = "V6_vL32b_nt_cur"; let isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; let Constraints = "$Rx32 = $Rx32in"; @@ -28019,7 +28033,7 @@ def V6_vL32b_nt_pi : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii), "$Vd32 = vmem($Rx32++#$Ii):nt", -tc_1ba8a0cd, TypeCVI_VM_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel { +tc_1ba8a0cd, TypeCVI_VM_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel, PostInc_BaseImm { let Inst{7-5} = 0b000; let Inst{13-11} = 0b000; let Inst{31-21} = 0b00101001010; @@ -28033,6 +28047,7 @@ let mayLoad = 1; let isNonTemporal = 1; let isRestrictNoSlot1Store = 1; let BaseOpcode = "V6_vL32b_nt_pi"; +let CextOpcode = "V6_vL32b_nt"; let isCVLoadable = 1; let isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; @@ -28127,7 +28142,7 @@ def V6_vL32b_nt_tmp_ai : HInst< (outs HvxVR:$Vd32), (ins IntRegs:$Rt32, s4_0Imm:$Ii), "$Vd32.tmp = vmem($Rt32+#$Ii):nt", -tc_52447ecc, TypeCVI_VM_TMP_LD>, Enc_f3f408, 
Requires<[UseHVXV60]>, PredRel { +tc_52447ecc, TypeCVI_VM_TMP_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel, PostInc_BaseImm { let Inst{7-5} = 0b010; let Inst{12-11} = 0b00; let Inst{31-21} = 0b00101000010; @@ -28137,11 +28152,12 @@ let addrMode = BaseImmOffset; let accessSize = HVXVectorAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; +let hasHvxTmp = 1; let mayLoad = 1; let isNonTemporal = 1; let isRestrictNoSlot1Store = 1; let BaseOpcode = "V6_vL32b_nt_tmp_ai"; +let CextOpcode = "V6_vL32b_nt_tmp"; let isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; } @@ -28160,7 +28176,7 @@ let addrMode = BaseImmOffset; let accessSize = HVXVectorAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; +let hasHvxTmp = 1; let mayLoad = 1; let isNonTemporal = 1; let isRestrictNoSlot1Store = 1; @@ -28183,7 +28199,7 @@ let addrMode = PostInc; let accessSize = HVXVectorAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; +let hasHvxTmp = 1; let mayLoad = 1; let isNonTemporal = 1; let isRestrictNoSlot1Store = 1; @@ -28206,7 +28222,7 @@ let addrMode = PostInc; let accessSize = HVXVectorAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; +let hasHvxTmp = 1; let mayLoad = 1; let isNonTemporal = 1; let isRestrictNoSlot1Store = 1; @@ -28218,7 +28234,7 @@ def V6_vL32b_nt_tmp_pi : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii), "$Vd32.tmp = vmem($Rx32++#$Ii):nt", -tc_663c80a7, TypeCVI_VM_TMP_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel { +tc_663c80a7, TypeCVI_VM_TMP_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel, PostInc_BaseImm { let Inst{7-5} = 0b010; let Inst{13-11} = 0b000; let Inst{31-21} = 0b00101001010; @@ -28228,11 +28244,12 @@ let addrMode = PostInc; let accessSize = HVXVectorAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; +let hasHvxTmp = 1; let mayLoad = 1; let isNonTemporal = 1; let isRestrictNoSlot1Store = 1; let BaseOpcode = "V6_vL32b_nt_tmp_pi"; +let CextOpcode = "V6_vL32b_nt_tmp"; let 
isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; let Constraints = "$Rx32 = $Rx32in"; @@ -28250,7 +28267,7 @@ let addrMode = PostInc; let accessSize = HVXVectorAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; +let hasHvxTmp = 1; let mayLoad = 1; let isNonTemporal = 1; let isRestrictNoSlot1Store = 1; @@ -28273,7 +28290,7 @@ let addrMode = BaseImmOffset; let accessSize = HVXVectorAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; +let hasHvxTmp = 1; let mayLoad = 1; let isNonTemporal = 1; let isRestrictNoSlot1Store = 1; @@ -28295,7 +28312,7 @@ let addrMode = PostInc; let accessSize = HVXVectorAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; +let hasHvxTmp = 1; let mayLoad = 1; let isNonTemporal = 1; let isRestrictNoSlot1Store = 1; @@ -28317,7 +28334,7 @@ let addrMode = PostInc; let accessSize = HVXVectorAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; +let hasHvxTmp = 1; let mayLoad = 1; let isNonTemporal = 1; let isRestrictNoSlot1Store = 1; @@ -28329,7 +28346,7 @@ def V6_vL32b_pi : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii), "$Vd32 = vmem($Rx32++#$Ii)", -tc_1ba8a0cd, TypeCVI_VM_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel { +tc_1ba8a0cd, TypeCVI_VM_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel, PostInc_BaseImm { let Inst{7-5} = 0b000; let Inst{13-11} = 0b000; let Inst{31-21} = 0b00101001000; @@ -28342,6 +28359,7 @@ let isCVI = 1; let mayLoad = 1; let isRestrictNoSlot1Store = 1; let BaseOpcode = "V6_vL32b_pi"; +let CextOpcode = "V6_vL32b"; let isCVLoadable = 1; let isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; @@ -28432,7 +28450,7 @@ def V6_vL32b_tmp_ai : HInst< (outs HvxVR:$Vd32), (ins IntRegs:$Rt32, s4_0Imm:$Ii), "$Vd32.tmp = vmem($Rt32+#$Ii)", -tc_52447ecc, TypeCVI_VM_TMP_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel { +tc_52447ecc, TypeCVI_VM_TMP_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel, PostInc_BaseImm { let Inst{7-5} = 0b010; let Inst{12-11} = 0b00; let 
Inst{31-21} = 0b00101000000; @@ -28442,10 +28460,11 @@ let addrMode = BaseImmOffset; let accessSize = HVXVectorAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; +let hasHvxTmp = 1; let mayLoad = 1; let isRestrictNoSlot1Store = 1; let BaseOpcode = "V6_vL32b_tmp_ai"; +let CextOpcode = "V6_vL32b_tmp"; let isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; } @@ -28464,7 +28483,7 @@ let addrMode = BaseImmOffset; let accessSize = HVXVectorAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; +let hasHvxTmp = 1; let mayLoad = 1; let isRestrictNoSlot1Store = 1; let BaseOpcode = "V6_vL32b_tmp_ai"; @@ -28486,7 +28505,7 @@ let addrMode = PostInc; let accessSize = HVXVectorAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; +let hasHvxTmp = 1; let mayLoad = 1; let isRestrictNoSlot1Store = 1; let BaseOpcode = "V6_vL32b_tmp_pi"; @@ -28508,7 +28527,7 @@ let addrMode = PostInc; let accessSize = HVXVectorAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; +let hasHvxTmp = 1; let mayLoad = 1; let isRestrictNoSlot1Store = 1; let BaseOpcode = "V6_vL32b_tmp_ppu"; @@ -28519,7 +28538,7 @@ def V6_vL32b_tmp_pi : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii), "$Vd32.tmp = vmem($Rx32++#$Ii)", -tc_663c80a7, TypeCVI_VM_TMP_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel { +tc_663c80a7, TypeCVI_VM_TMP_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel, PostInc_BaseImm { let Inst{7-5} = 0b010; let Inst{13-11} = 0b000; let Inst{31-21} = 0b00101001000; @@ -28529,10 +28548,11 @@ let addrMode = PostInc; let accessSize = HVXVectorAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; +let hasHvxTmp = 1; let mayLoad = 1; let isRestrictNoSlot1Store = 1; let BaseOpcode = "V6_vL32b_tmp_pi"; +let CextOpcode = "V6_vL32b_tmp"; let isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; let Constraints = "$Rx32 = $Rx32in"; @@ -28550,7 +28570,7 @@ let addrMode = PostInc; let accessSize = HVXVectorAccess; let isCVLoad = 1; let isCVI = 1; 
-let hasTmpDst = 1; +let hasHvxTmp = 1; let mayLoad = 1; let isRestrictNoSlot1Store = 1; let BaseOpcode = "V6_vL32b_tmp_ppu"; @@ -28572,7 +28592,7 @@ let addrMode = BaseImmOffset; let accessSize = HVXVectorAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; +let hasHvxTmp = 1; let mayLoad = 1; let isRestrictNoSlot1Store = 1; let BaseOpcode = "V6_vL32b_tmp_ai"; @@ -28593,7 +28613,7 @@ let addrMode = PostInc; let accessSize = HVXVectorAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; +let hasHvxTmp = 1; let mayLoad = 1; let isRestrictNoSlot1Store = 1; let BaseOpcode = "V6_vL32b_tmp_pi"; @@ -28614,7 +28634,7 @@ let addrMode = PostInc; let accessSize = HVXVectorAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; +let hasHvxTmp = 1; let mayLoad = 1; let isRestrictNoSlot1Store = 1; let BaseOpcode = "V6_vL32b_tmp_ppu"; @@ -28625,7 +28645,7 @@ def V6_vS32Ub_ai : HInst< (outs), (ins IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Vs32), "vmemu($Rt32+#$Ii) = $Vs32", -tc_f21e8abb, TypeCVI_VM_STU>, Enc_c9e3bc, Requires<[UseHVXV60]>, NewValueRel { +tc_f21e8abb, TypeCVI_VM_STU>, Enc_c9e3bc, Requires<[UseHVXV60]>, NewValueRel, PostInc_BaseImm { let Inst{7-5} = 0b111; let Inst{12-11} = 0b00; let Inst{31-21} = 0b00101000001; @@ -28634,6 +28654,7 @@ let accessSize = HVXVectorAccess; let isCVI = 1; let mayStore = 1; let BaseOpcode = "V6_vS32Ub_ai"; +let CextOpcode = "V6_vS32Ub"; let isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; } @@ -28692,7 +28713,7 @@ def V6_vS32Ub_pi : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Vs32), "vmemu($Rx32++#$Ii) = $Vs32", -tc_e2d2e9e5, TypeCVI_VM_STU>, Enc_b62ef7, Requires<[UseHVXV60]>, NewValueRel { +tc_e2d2e9e5, TypeCVI_VM_STU>, Enc_b62ef7, Requires<[UseHVXV60]>, NewValueRel, PostInc_BaseImm { let Inst{7-5} = 0b111; let Inst{13-11} = 0b000; let Inst{31-21} = 0b00101001001; @@ -28701,6 +28722,7 @@ let accessSize = HVXVectorAccess; let isCVI = 1; let mayStore = 1; let BaseOpcode = "V6_vS32Ub_pi"; +let 
CextOpcode = "V6_vS32Ub"; let isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; let Constraints = "$Rx32 = $Rx32in"; @@ -28773,7 +28795,7 @@ def V6_vS32b_ai : HInst< (outs), (ins IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Vs32), "vmem($Rt32+#$Ii) = $Vs32", -tc_c5dba46e, TypeCVI_VM_ST>, Enc_c9e3bc, Requires<[UseHVXV60]>, NewValueRel { +tc_c5dba46e, TypeCVI_VM_ST>, Enc_c9e3bc, Requires<[UseHVXV60]>, NewValueRel, PostInc_BaseImm { let Inst{7-5} = 0b000; let Inst{12-11} = 0b00; let Inst{31-21} = 0b00101000001; @@ -28782,6 +28804,7 @@ let accessSize = HVXVectorAccess; let isCVI = 1; let mayStore = 1; let BaseOpcode = "V6_vS32b_ai"; +let CextOpcode = "V6_vS32b"; let isNVStorable = 1; let isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; @@ -28790,7 +28813,7 @@ def V6_vS32b_new_ai : HInst< (outs), (ins IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Os8), "vmem($Rt32+#$Ii) = $Os8.new", -tc_ab23f776, TypeCVI_VM_NEW_ST>, Enc_f77fbc, Requires<[UseHVXV60]>, NewValueRel { +tc_ab23f776, TypeCVI_VM_NEW_ST>, Enc_f77fbc, Requires<[UseHVXV60]>, NewValueRel, PostInc_BaseImm { let Inst{7-3} = 0b00100; let Inst{12-11} = 0b00; let Inst{31-21} = 0b00101000001; @@ -28802,6 +28825,7 @@ let CVINew = 1; let isNewValue = 1; let mayStore = 1; let BaseOpcode = "V6_vS32b_ai"; +let CextOpcode = "V6_vS32b_new"; let isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; let opNewValue = 2; @@ -28873,7 +28897,7 @@ def V6_vS32b_new_pi : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Os8), "vmem($Rx32++#$Ii) = $Os8.new", -tc_6942b6e0, TypeCVI_VM_NEW_ST>, Enc_1aaec1, Requires<[UseHVXV60]>, NewValueRel { +tc_6942b6e0, TypeCVI_VM_NEW_ST>, Enc_1aaec1, Requires<[UseHVXV60]>, NewValueRel, PostInc_BaseImm { let Inst{7-3} = 0b00100; let Inst{13-11} = 0b000; let Inst{31-21} = 0b00101001001; @@ -28885,6 +28909,7 @@ let CVINew = 1; let isNewValue = 1; let mayStore = 1; let BaseOpcode = "V6_vS32b_pi"; +let CextOpcode = "V6_vS32b_new"; let isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; let 
opNewValue = 3; @@ -29070,7 +29095,7 @@ def V6_vS32b_nt_ai : HInst< (outs), (ins IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Vs32), "vmem($Rt32+#$Ii):nt = $Vs32", -tc_c5dba46e, TypeCVI_VM_ST>, Enc_c9e3bc, Requires<[UseHVXV60]>, NewValueRel { +tc_c5dba46e, TypeCVI_VM_ST>, Enc_c9e3bc, Requires<[UseHVXV60]>, NewValueRel, PostInc_BaseImm { let Inst{7-5} = 0b000; let Inst{12-11} = 0b00; let Inst{31-21} = 0b00101000011; @@ -29080,6 +29105,7 @@ let isCVI = 1; let isNonTemporal = 1; let mayStore = 1; let BaseOpcode = "V6_vS32b_ai"; +let CextOpcode = "V6_vS32b_nt"; let isNVStorable = 1; let isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; @@ -29088,7 +29114,7 @@ def V6_vS32b_nt_new_ai : HInst< (outs), (ins IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Os8), "vmem($Rt32+#$Ii):nt = $Os8.new", -tc_ab23f776, TypeCVI_VM_NEW_ST>, Enc_f77fbc, Requires<[UseHVXV60]>, NewValueRel { +tc_ab23f776, TypeCVI_VM_NEW_ST>, Enc_f77fbc, Requires<[UseHVXV60]>, NewValueRel, PostInc_BaseImm { let Inst{7-3} = 0b00100; let Inst{12-11} = 0b00; let Inst{31-21} = 0b00101000011; @@ -29101,6 +29127,7 @@ let isNewValue = 1; let isNonTemporal = 1; let mayStore = 1; let BaseOpcode = "V6_vS32b_ai"; +let CextOpcode = "V6_vS32b_nt_new"; let isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; let opNewValue = 2; @@ -29175,7 +29202,7 @@ def V6_vS32b_nt_new_pi : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Os8), "vmem($Rx32++#$Ii):nt = $Os8.new", -tc_6942b6e0, TypeCVI_VM_NEW_ST>, Enc_1aaec1, Requires<[UseHVXV60]>, NewValueRel { +tc_6942b6e0, TypeCVI_VM_NEW_ST>, Enc_1aaec1, Requires<[UseHVXV60]>, NewValueRel, PostInc_BaseImm { let Inst{7-3} = 0b00100; let Inst{13-11} = 0b000; let Inst{31-21} = 0b00101001011; @@ -29188,6 +29215,7 @@ let isNewValue = 1; let isNonTemporal = 1; let mayStore = 1; let BaseOpcode = "V6_vS32b_pi"; +let CextOpcode = "V6_vS32b_nt_new"; let isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; let opNewValue = 3; @@ -29383,7 +29411,7 @@ def V6_vS32b_nt_pi : HInst< (outs 
IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Vs32), "vmem($Rx32++#$Ii):nt = $Vs32", -tc_3e2aaafc, TypeCVI_VM_ST>, Enc_b62ef7, Requires<[UseHVXV60]>, NewValueRel { +tc_3e2aaafc, TypeCVI_VM_ST>, Enc_b62ef7, Requires<[UseHVXV60]>, NewValueRel, PostInc_BaseImm { let Inst{7-5} = 0b000; let Inst{13-11} = 0b000; let Inst{31-21} = 0b00101001011; @@ -29393,6 +29421,7 @@ let isCVI = 1; let isNonTemporal = 1; let mayStore = 1; let BaseOpcode = "V6_vS32b_pi"; +let CextOpcode = "V6_vS32b_nt"; let isNVStorable = 1; let isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; @@ -29519,7 +29548,7 @@ def V6_vS32b_pi : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Vs32), "vmem($Rx32++#$Ii) = $Vs32", -tc_3e2aaafc, TypeCVI_VM_ST>, Enc_b62ef7, Requires<[UseHVXV60]>, NewValueRel { +tc_3e2aaafc, TypeCVI_VM_ST>, Enc_b62ef7, Requires<[UseHVXV60]>, NewValueRel, PostInc_BaseImm { let Inst{7-5} = 0b000; let Inst{13-11} = 0b000; let Inst{31-21} = 0b00101001001; @@ -29528,6 +29557,7 @@ let accessSize = HVXVectorAccess; let isCVI = 1; let mayStore = 1; let BaseOpcode = "V6_vS32b_pi"; +let CextOpcode = "V6_vS32b"; let isNVStorable = 1; let isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; @@ -29689,6 +29719,32 @@ let mayStore = 1; let DecoderNamespace = "EXT_mmvec"; let Constraints = "$Rx32 = $Rx32in"; } +def V6_vabs_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.hf = vabs($Vu32.hf)", +tc_5cdf8c84, TypeCVI_VX_LATE>, Enc_e7581c, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b100; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000000110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vabs_sf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.sf = vabs($Vu32.sf)", +tc_5cdf8c84, TypeCVI_VX_LATE>, Enc_e7581c, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b101; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000000110; +let hasNewValue = 1; +let opNewValue 
= 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vabsb : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32), @@ -29975,6 +30031,123 @@ let isPseudo = 1; let isCodeGenOnly = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_vadd_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.qf16 = vadd($Vu32.hf,$Vv32.hf)", +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b011; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111011; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vadd_hf_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.hf = vadd($Vu32.hf,$Vv32.hf)", +tc_c127de3a, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b111; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111101; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vadd_qf16 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.qf16 = vadd($Vu32.qf16,$Vv32.qf16)", +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b010; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111011; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vadd_qf16_mix : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.qf16 = vadd($Vu32.qf16,$Vv32.hf)", +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b100; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111011; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vadd_qf32 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.qf32 = vadd($Vu32.qf32,$Vv32.qf32)", +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b000; +let Inst{13-13} = 0b1; 
+let Inst{31-21} = 0b00011111101; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vadd_qf32_mix : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.qf32 = vadd($Vu32.qf32,$Vv32.sf)", +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b010; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111101; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vadd_sf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.qf32 = vadd($Vu32.sf,$Vv32.sf)", +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b001; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111101; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vadd_sf_hf : HInst< +(outs HvxWR:$Vdd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vdd32.sf = vadd($Vu32.hf,$Vv32.hf)", +tc_d8287c14, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b100; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vadd_sf_sf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.sf = vadd($Vu32.sf,$Vv32.sf)", +tc_c127de3a, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b110; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vaddb : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), @@ -31440,6 +31613,58 @@ let opNewValue = 0; let isCVI = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_vasrvuhubrndsat : HInst< +(outs HvxVR:$Vd32), +(ins HvxWR:$Vuu32, HvxVR:$Vv32), +"$Vd32.ub = vasr($Vuu32.uh,$Vv32.ub):rnd:sat", +tc_05ca8cfd, TypeCVI_VS>, 
Enc_de5ea0, Requires<[UseHVXV69]> { +let Inst{7-5} = 0b011; +let Inst{13-13} = 0b0; +let Inst{31-21} = 0b00011101000; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vasrvuhubsat : HInst< +(outs HvxVR:$Vd32), +(ins HvxWR:$Vuu32, HvxVR:$Vv32), +"$Vd32.ub = vasr($Vuu32.uh,$Vv32.ub):sat", +tc_05ca8cfd, TypeCVI_VS>, Enc_de5ea0, Requires<[UseHVXV69]> { +let Inst{7-5} = 0b010; +let Inst{13-13} = 0b0; +let Inst{31-21} = 0b00011101000; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vasrvwuhrndsat : HInst< +(outs HvxVR:$Vd32), +(ins HvxWR:$Vuu32, HvxVR:$Vv32), +"$Vd32.uh = vasr($Vuu32.w,$Vv32.uh):rnd:sat", +tc_05ca8cfd, TypeCVI_VS>, Enc_de5ea0, Requires<[UseHVXV69]> { +let Inst{7-5} = 0b001; +let Inst{13-13} = 0b0; +let Inst{31-21} = 0b00011101000; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vasrvwuhsat : HInst< +(outs HvxVR:$Vd32), +(ins HvxWR:$Vuu32, HvxVR:$Vv32), +"$Vd32.uh = vasr($Vuu32.w,$Vv32.uh):sat", +tc_05ca8cfd, TypeCVI_VS>, Enc_de5ea0, Requires<[UseHVXV69]> { +let Inst{7-5} = 0b000; +let Inst{13-13} = 0b0; +let Inst{31-21} = 0b00011101000; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vasrw : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, IntRegs:$Rt32), @@ -31597,6 +31822,33 @@ let opNewValue = 0; let isCVI = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_vassign_fp : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.w = vfmv($Vu32.w)", +tc_5cdf8c84, TypeCVI_VX_LATE>, Enc_e7581c, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b001; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000000110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vassign_tmp : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.tmp = $Vu32", +tc_2120355e, TypeCVI_VX>, 
Enc_e7581c, Requires<[UseHVXV69]> { +let Inst{7-5} = 0b110; +let Inst{13-13} = 0b0; +let Inst{31-16} = 0b0001111000000001; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let hasHvxTmp = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vassignp : HInst< (outs HvxWR:$Vdd32), (ins HvxWR:$Vuu32), @@ -32000,6 +32252,189 @@ let isCVI = 1; let isRegSequence = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_vcombine_tmp : HInst< +(outs HvxWR:$Vdd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vdd32.tmp = vcombine($Vu32,$Vv32)", +tc_aa047364, TypeCVI_VX>, Enc_71bb9b, Requires<[UseHVXV69]> { +let Inst{7-5} = 0b111; +let Inst{13-13} = 0b0; +let Inst{31-21} = 0b00011110101; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let hasHvxTmp = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vconv_hf_qf16 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.hf = $Vu32.qf16", +tc_51d0ecc3, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b011; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000000100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vconv_hf_qf32 : HInst< +(outs HvxVR:$Vd32), +(ins HvxWR:$Vuu32), +"$Vd32.hf = $Vuu32.qf32", +tc_51d0ecc3, TypeCVI_VS>, Enc_a33d04, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b110; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000000100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vconv_sf_qf32 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.sf = $Vu32.qf32", +tc_51d0ecc3, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b000; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000000100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vcvt_b_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.b = vcvt($Vu32.hf,$Vv32.hf)", 
+tc_c127de3a, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b110; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vcvt_h_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.h = vcvt($Vu32.hf)", +tc_3c8c15d0, TypeCVI_VX>, Enc_e7581c, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b000; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000000110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vcvt_hf_b : HInst< +(outs HvxWR:$Vdd32), +(ins HvxVR:$Vu32), +"$Vdd32.hf = vcvt($Vu32.b)", +tc_0afc8be9, TypeCVI_VX_DV>, Enc_dd766a, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b010; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000000100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vcvt_hf_h : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.hf = vcvt($Vu32.h)", +tc_3c8c15d0, TypeCVI_VX>, Enc_e7581c, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b111; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000000100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vcvt_hf_sf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.hf = vcvt($Vu32.sf,$Vv32.sf)", +tc_c127de3a, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b001; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111011; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vcvt_hf_ub : HInst< +(outs HvxWR:$Vdd32), +(ins HvxVR:$Vu32), +"$Vdd32.hf = vcvt($Vu32.ub)", +tc_0afc8be9, TypeCVI_VX_DV>, Enc_dd766a, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b001; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000000100; +let 
hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vcvt_hf_uh : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.hf = vcvt($Vu32.uh)", +tc_3c8c15d0, TypeCVI_VX>, Enc_e7581c, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b101; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000000100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vcvt_sf_hf : HInst< +(outs HvxWR:$Vdd32), +(ins HvxVR:$Vu32), +"$Vdd32.sf = vcvt($Vu32.hf)", +tc_0afc8be9, TypeCVI_VX_DV>, Enc_dd766a, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b100; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000000100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vcvt_ub_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.ub = vcvt($Vu32.hf,$Vv32.hf)", +tc_c127de3a, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b101; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vcvt_uh_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.uh = vcvt($Vu32.hf)", +tc_3c8c15d0, TypeCVI_VX>, Enc_e7581c, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b000; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000000101; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vd0 : HInst< (outs HvxVR:$Vd32), (ins), @@ -32141,6 +32576,34 @@ let opNewValue = 0; let isCVI = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_vdmpy_sf_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.sf = vdmpy($Vu32.hf,$Vv32.hf)", +tc_c127de3a, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b110; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111101; +let 
hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vdmpy_sf_hf_acc : HInst< +(outs HvxVR:$Vx32), +(ins HvxVR:$Vx32in, HvxVR:$Vu32, HvxVR:$Vv32), +"$Vx32.sf += vdmpy($Vu32.hf,$Vv32.hf)", +tc_a19b9305, TypeCVI_VX>, Enc_a7341a, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b011; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100010; +let hasNewValue = 1; +let opNewValue = 0; +let isAccumulator = 1; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Vx32 = $Vx32in"; +} def V6_vdmpybus : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, IntRegs:$Rt32), @@ -32415,7 +32878,7 @@ def V6_vdmpyhsat : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, IntRegs:$Rt32), "$Vd32.w = vdmpy($Vu32.h,$Rt32.h):sat", -tc_0b04c6c7, TypeCVI_VX_DV>, Enc_b087ac, Requires<[UseHVXV60]> { +tc_dcca380f, TypeCVI_VX>, Enc_b087ac, Requires<[UseHVXV60]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001001; @@ -32428,7 +32891,7 @@ def V6_vdmpyhsat_acc : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxVR:$Vu32, IntRegs:$Rt32), "$Vx32.w += vdmpy($Vu32.h,$Rt32.h):sat", -tc_660769f1, TypeCVI_VX_DV>, Enc_5138b3, Requires<[UseHVXV60]> { +tc_72e2b393, TypeCVI_VX>, Enc_5138b3, Requires<[UseHVXV60]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001001; @@ -32523,7 +32986,7 @@ def V6_vdmpyhsusat : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, IntRegs:$Rt32), "$Vd32.w = vdmpy($Vu32.h,$Rt32.uh):sat", -tc_0b04c6c7, TypeCVI_VX_DV>, Enc_b087ac, Requires<[UseHVXV60]> { +tc_dcca380f, TypeCVI_VX>, Enc_b087ac, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001001; @@ -32536,7 +32999,7 @@ def V6_vdmpyhsusat_acc : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxVR:$Vu32, IntRegs:$Rt32), "$Vx32.w += vdmpy($Vu32.h,$Rt32.uh):sat", -tc_660769f1, TypeCVI_VX_DV>, Enc_5138b3, Requires<[UseHVXV60]> { +tc_72e2b393, TypeCVI_VX>, Enc_5138b3, 
Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001001; @@ -32577,7 +33040,7 @@ def V6_vdmpyhvsat : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.w = vdmpy($Vu32.h,$Vv32.h):sat", -tc_d8287c14, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV60]> { +tc_73efe966, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100000; @@ -32831,6 +33294,84 @@ let isCVI = 1; let DecoderNamespace = "EXT_mmvec"; let Constraints = "$Qx4 = $Qx4in"; } +def V6_vfmax_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.hf = vfmax($Vu32.hf,$Vv32.hf)", +tc_cda936da, TypeCVI_VX_LATE>, Enc_45364e, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b010; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100011; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vfmax_sf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.sf = vfmax($Vu32.sf,$Vv32.sf)", +tc_cda936da, TypeCVI_VX_LATE>, Enc_45364e, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b011; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100011; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vfmin_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.hf = vfmin($Vu32.hf,$Vv32.hf)", +tc_cda936da, TypeCVI_VX_LATE>, Enc_45364e, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b000; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100011; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vfmin_sf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.sf = vfmin($Vu32.sf,$Vv32.sf)", +tc_cda936da, TypeCVI_VX_LATE>, Enc_45364e, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b001; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100011; 
+let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vfneg_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.hf = vfneg($Vu32.hf)", +tc_5cdf8c84, TypeCVI_VX_LATE>, Enc_e7581c, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b010; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000000110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vfneg_sf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.sf = vfneg($Vu32.sf)", +tc_5cdf8c84, TypeCVI_VX_LATE>, Enc_e7581c, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b011; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000000110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vgathermh : HInst< (outs), (ins IntRegs:$Rt32, ModRegs:$Mu2, HvxVR:$Vv32), @@ -32843,7 +33384,6 @@ let opNewValue = 0; let accessSize = HalfWordAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; let mayLoad = 1; let Defs = [VTMP]; let DecoderNamespace = "EXT_mmvec"; @@ -32860,7 +33400,6 @@ let opNewValue = 0; let accessSize = HalfWordAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; let mayLoad = 1; let Defs = [VTMP]; let DecoderNamespace = "EXT_mmvec"; @@ -32877,7 +33416,6 @@ let opNewValue = 0; let accessSize = HalfWordAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; let mayLoad = 1; let Defs = [VTMP]; let DecoderNamespace = "EXT_mmvec"; @@ -32894,7 +33432,6 @@ let opNewValue = 0; let accessSize = HalfWordAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; let mayLoad = 1; let Defs = [VTMP]; let DecoderNamespace = "EXT_mmvec"; @@ -32911,7 +33448,6 @@ let opNewValue = 0; let accessSize = WordAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; let mayLoad = 1; let Defs = [VTMP]; let DecoderNamespace = "EXT_mmvec"; @@ -32928,7 +33464,6 @@ let opNewValue = 0; let accessSize = WordAccess; let 
isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; let mayLoad = 1; let Defs = [VTMP]; let DecoderNamespace = "EXT_mmvec"; @@ -33033,6 +33568,106 @@ let isCVI = 1; let DecoderNamespace = "EXT_mmvec"; let Constraints = "$Qx4 = $Qx4in"; } +def V6_vgthf : HInst< +(outs HvxQR:$Qd4), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Qd4 = vcmp.gt($Vu32.hf,$Vv32.hf)", +tc_56c4f9fe, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV68,UseHVXFloatingPoint]> { +let Inst{7-2} = 0b011101; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vgthf_and : HInst< +(outs HvxQR:$Qx4), +(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), +"$Qx4 &= vcmp.gt($Vu32.hf,$Vv32.hf)", +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV68,UseHVXFloatingPoint]> { +let Inst{7-2} = 0b110011; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100100; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Qx4 = $Qx4in"; +} +def V6_vgthf_or : HInst< +(outs HvxQR:$Qx4), +(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), +"$Qx4 |= vcmp.gt($Vu32.hf,$Vv32.hf)", +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV68,UseHVXFloatingPoint]> { +let Inst{7-2} = 0b001101; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100100; +let isAccumulator = 1; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Qx4 = $Qx4in"; +} +def V6_vgthf_xor : HInst< +(outs HvxQR:$Qx4), +(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), +"$Qx4 ^= vcmp.gt($Vu32.hf,$Vv32.hf)", +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV68,UseHVXFloatingPoint]> { +let Inst{7-2} = 0b111011; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100100; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Qx4 = $Qx4in"; +} +def V6_vgtsf : HInst< +(outs HvxQR:$Qd4), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Qd4 = vcmp.gt($Vu32.sf,$Vv32.sf)", +tc_56c4f9fe, TypeCVI_VA>, Enc_95441f, 
Requires<[UseHVXV68,UseHVXFloatingPoint]> { +let Inst{7-2} = 0b011100; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vgtsf_and : HInst< +(outs HvxQR:$Qx4), +(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), +"$Qx4 &= vcmp.gt($Vu32.sf,$Vv32.sf)", +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV68,UseHVXFloatingPoint]> { +let Inst{7-2} = 0b110010; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100100; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Qx4 = $Qx4in"; +} +def V6_vgtsf_or : HInst< +(outs HvxQR:$Qx4), +(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), +"$Qx4 |= vcmp.gt($Vu32.sf,$Vv32.sf)", +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV68,UseHVXFloatingPoint]> { +let Inst{7-2} = 0b001100; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100100; +let isAccumulator = 1; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Qx4 = $Qx4in"; +} +def V6_vgtsf_xor : HInst< +(outs HvxQR:$Qx4), +(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), +"$Qx4 ^= vcmp.gt($Vu32.sf,$Vv32.sf)", +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV68,UseHVXFloatingPoint]> { +let Inst{7-2} = 0b111010; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100100; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Qx4 = $Qx4in"; +} def V6_vgtub : HInst< (outs HvxQR:$Qd4), (ins HvxVR:$Vu32, HvxVR:$Vv32), @@ -33552,6 +34187,32 @@ let opNewValue = 0; let isCVI = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_vmax_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.hf = vmax($Vu32.hf,$Vv32.hf)", +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b011; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vmax_sf : 
HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.sf = vmax($Vu32.sf,$Vv32.sf)", +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b001; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vmaxb : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), @@ -33677,6 +34338,32 @@ let isPseudo = 1; let isCodeGenOnly = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_vmin_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.hf = vmin($Vu32.hf,$Vv32.hf)", +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b100; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vmin_sf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.sf = vmin($Vu32.sf,$Vv32.sf)", +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b010; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vminb : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), @@ -34110,6 +34797,179 @@ let isCVI = 1; let DecoderNamespace = "EXT_mmvec"; let Constraints = "$Vx32 = $Vx32in"; } +def V6_vmpy_hf_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.hf = vmpy($Vu32.hf,$Vv32.hf)", +tc_c127de3a, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b011; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vmpy_hf_hf_acc : HInst< +(outs HvxVR:$Vx32), +(ins HvxVR:$Vx32in, HvxVR:$Vu32, HvxVR:$Vv32), +"$Vx32.hf += vmpy($Vu32.hf,$Vv32.hf)", 
+tc_a19b9305, TypeCVI_VX>, Enc_a7341a, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b010; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100010; +let hasNewValue = 1; +let opNewValue = 0; +let isAccumulator = 1; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Vx32 = $Vx32in"; +} +def V6_vmpy_qf16 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.qf16 = vmpy($Vu32.qf16,$Vv32.qf16)", +tc_d8287c14, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b011; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111111; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vmpy_qf16_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.qf16 = vmpy($Vu32.hf,$Vv32.hf)", +tc_d8287c14, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b100; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111111; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vmpy_qf16_mix_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.qf16 = vmpy($Vu32.qf16,$Vv32.hf)", +tc_d8287c14, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b101; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111111; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vmpy_qf32 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.qf32 = vmpy($Vu32.qf32,$Vv32.qf32)", +tc_d8287c14, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b000; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111111; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vmpy_qf32_hf : HInst< +(outs HvxWR:$Vdd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vdd32.qf32 = vmpy($Vu32.hf,$Vv32.hf)", 
+tc_d8287c14, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b111; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111111; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vmpy_qf32_mix_hf : HInst< +(outs HvxWR:$Vdd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vdd32.qf32 = vmpy($Vu32.qf16,$Vv32.hf)", +tc_d8287c14, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b000; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vmpy_qf32_qf16 : HInst< +(outs HvxWR:$Vdd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vdd32.qf32 = vmpy($Vu32.qf16,$Vv32.qf16)", +tc_d8287c14, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b110; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111111; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vmpy_qf32_sf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.qf32 = vmpy($Vu32.sf,$Vv32.sf)", +tc_d8287c14, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b001; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111111; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vmpy_sf_hf : HInst< +(outs HvxWR:$Vdd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vdd32.sf = vmpy($Vu32.hf,$Vv32.hf)", +tc_d8287c14, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b010; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vmpy_sf_hf_acc : HInst< +(outs HvxWR:$Vxx32), +(ins HvxWR:$Vxx32in, HvxVR:$Vu32, HvxVR:$Vv32), +"$Vxx32.sf += vmpy($Vu32.hf,$Vv32.hf)", +tc_08a4f1b6, TypeCVI_VX_DV>, Enc_3fc427, 
Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b001; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100010; +let hasNewValue = 1; +let opNewValue = 0; +let isAccumulator = 1; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Vxx32 = $Vxx32in"; +} +def V6_vmpy_sf_sf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.sf = vmpy($Vu32.sf,$Vv32.sf)", +tc_d8287c14, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b001; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vmpybus : HInst< (outs HvxWR:$Vdd32), (ins HvxVR:$Vu32, IntRegs:$Rt32), @@ -34397,7 +35257,7 @@ def V6_vmpyhsrs : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, IntRegs:$Rt32), "$Vd32.h = vmpy($Vu32.h,$Rt32.h):<<1:rnd:sat", -tc_0b04c6c7, TypeCVI_VX_DV>, Enc_b087ac, Requires<[UseHVXV60]> { +tc_dcca380f, TypeCVI_VX>, Enc_b087ac, Requires<[UseHVXV60]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001010; @@ -34422,7 +35282,7 @@ def V6_vmpyhss : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, IntRegs:$Rt32), "$Vd32.h = vmpy($Vu32.h,$Rt32.h):<<1:sat", -tc_0b04c6c7, TypeCVI_VX_DV>, Enc_b087ac, Requires<[UseHVXV60]> { +tc_dcca380f, TypeCVI_VX>, Enc_b087ac, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001010; @@ -34555,7 +35415,7 @@ def V6_vmpyhvsrs : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.h = vmpy($Vu32.h,$Vv32.h):<<1:rnd:sat", -tc_d8287c14, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV60]> { +tc_73efe966, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100001; @@ -35332,6 +36192,19 @@ let isPseudo = 1; let isCodeGenOnly = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_vmpyuhvs : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), 
+"$Vd32.uh = vmpy($Vu32.uh,$Vv32.uh):>>16", +tc_c127de3a, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV69]> { +let Inst{7-5} = 0b111; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vmux : HInst< (outs HvxVR:$Vd32), (ins HvxQR:$Qt4, HvxVR:$Vu32, HvxVR:$Vv32), @@ -36007,7 +36880,7 @@ def V6_vrmpybusv_acc : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxVR:$Vu32, HvxVR:$Vv32), "$Vx32.w += vrmpy($Vu32.ub,$Vv32.b)", -tc_08a4f1b6, TypeCVI_VX_DV>, Enc_a7341a, Requires<[UseHVXV60]> { +tc_37820f4c, TypeCVI_VX>, Enc_a7341a, Requires<[UseHVXV60]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100000; @@ -36061,7 +36934,7 @@ def V6_vrmpybv_acc : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxVR:$Vu32, HvxVR:$Vv32), "$Vx32.w += vrmpy($Vu32.b,$Vv32.b)", -tc_08a4f1b6, TypeCVI_VX_DV>, Enc_a7341a, Requires<[UseHVXV60]> { +tc_37820f4c, TypeCVI_VX>, Enc_a7341a, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100000; @@ -36277,7 +37150,7 @@ def V6_vrmpyubv_acc : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxVR:$Vu32, HvxVR:$Vv32), "$Vx32.uw += vrmpy($Vu32.ub,$Vv32.ub)", -tc_08a4f1b6, TypeCVI_VX_DV>, Enc_a7341a, Requires<[UseHVXV60]> { +tc_37820f4c, TypeCVI_VX>, Enc_a7341a, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100000; @@ -37412,6 +38285,123 @@ let isPseudo = 1; let isCodeGenOnly = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_vsub_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.qf16 = vsub($Vu32.hf,$Vv32.hf)", +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b110; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111011; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vsub_hf_hf : HInst< +(outs 
HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.hf = vsub($Vu32.hf,$Vv32.hf)", +tc_c127de3a, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b000; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111011; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vsub_qf16 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.qf16 = vsub($Vu32.qf16,$Vv32.qf16)", +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b101; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111011; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vsub_qf16_mix : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.qf16 = vsub($Vu32.qf16,$Vv32.hf)", +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b111; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111011; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vsub_qf32 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.qf32 = vsub($Vu32.qf32,$Vv32.qf32)", +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b011; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111101; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vsub_qf32_mix : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.qf32 = vsub($Vu32.qf32,$Vv32.sf)", +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b101; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111101; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vsub_sf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.qf32 = vsub($Vu32.sf,$Vv32.sf)", 
+tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b100; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111101; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vsub_sf_hf : HInst< +(outs HvxWR:$Vdd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vdd32.sf = vsub($Vu32.hf,$Vv32.hf)", +tc_d8287c14, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b101; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vsub_sf_sf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.sf = vsub($Vu32.sf,$Vv32.sf)", +tc_c127de3a, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b111; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vsubb : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), @@ -38647,7 +39637,7 @@ def V6_zLd_ai : HInst< (outs), (ins IntRegs:$Rt32, s4_0Imm:$Ii), "z = vmem($Rt32+#$Ii)", -tc_e699ae41, TypeCVI_ZW>, Enc_ff3442, Requires<[UseHVXV66,UseZReg]> { +tc_e699ae41, TypeCVI_ZW>, Enc_ff3442, Requires<[UseHVXV66,UseZReg]>, PostInc_BaseImm { let Inst{7-0} = 0b00000000; let Inst{12-11} = 0b00; let Inst{31-21} = 0b00101100000; @@ -38655,13 +39645,14 @@ let addrMode = BaseImmOffset; let isCVI = 1; let mayLoad = 1; let isRestrictNoSlot1Store = 1; +let CextOpcode = "V6_zLd"; let DecoderNamespace = "EXT_mmvec"; } def V6_zLd_pi : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii), "z = vmem($Rx32++#$Ii)", -tc_a0dbea28, TypeCVI_ZW>, Enc_6c9ee0, Requires<[UseHVXV66,UseZReg]> { +tc_a0dbea28, TypeCVI_ZW>, Enc_6c9ee0, Requires<[UseHVXV66,UseZReg]>, PostInc_BaseImm { let Inst{7-0} = 0b00000000; let Inst{13-11} = 0b000; let Inst{31-21} = 0b00101101000; @@ -38669,6 
+39660,7 @@ let addrMode = PostInc; let isCVI = 1; let mayLoad = 1; let isRestrictNoSlot1Store = 1; +let CextOpcode = "V6_zLd"; let DecoderNamespace = "EXT_mmvec"; let Constraints = "$Rx32 = $Rx32in"; } @@ -38782,6 +39774,17 @@ let Inst{13-0} = 0b00000000000000; let Inst{31-16} = 0b0110110000100000; let isSolo = 1; } +def Y2_crswap_old : HInst< +(outs IntRegs:$Rx32), +(ins IntRegs:$Rx32in), +"crswap($Rx32,sgp)", +PSEUDO, TypeMAPPING> { +let hasNewValue = 1; +let opNewValue = 0; +let isPseudo = 1; +let isCodeGenOnly = 1; +let Constraints = "$Rx32 = $Rx32in"; +} def Y2_dccleana : HInst< (outs), (ins IntRegs:$Rs32), @@ -38861,6 +39864,22 @@ let Inst{13-0} = 0b00000000000010; let Inst{31-16} = 0b0101011111000000; let isSolo = 1; } +def Y2_k1lock_map : HInst< +(outs), +(ins), +"k1lock", +PSEUDO, TypeMAPPING>, Requires<[HasV65]> { +let isPseudo = 1; +let isCodeGenOnly = 1; +} +def Y2_k1unlock_map : HInst< +(outs), +(ins), +"k1unlock", +PSEUDO, TypeMAPPING>, Requires<[HasV65]> { +let isPseudo = 1; +let isCodeGenOnly = 1; +} def Y2_syncht : HInst< (outs), (ins), @@ -39083,7 +40102,7 @@ def dup_A2_add : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = add($Rs32,$Rt32)", -tc_388f9897, TypeALU32_3op>, Requires<[HasV68]> { +tc_388f9897, TypeALU32_3op>, Requires<[HasV69]> { let hasNewValue = 1; let opNewValue = 0; let AsmVariantName = "NonParsable"; @@ -39093,7 +40112,7 @@ def dup_A2_addi : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, s32_0Imm:$Ii), "$Rd32 = add($Rs32,#$Ii)", -tc_388f9897, TypeALU32_ADDI>, Requires<[HasV68]> { +tc_388f9897, TypeALU32_ADDI>, Requires<[HasV69]> { let hasNewValue = 1; let opNewValue = 0; let AsmVariantName = "NonParsable"; @@ -39108,7 +40127,7 @@ def dup_A2_andir : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, s32_0Imm:$Ii), "$Rd32 = and($Rs32,#$Ii)", -tc_388f9897, TypeALU32_2op>, Requires<[HasV68]> { +tc_388f9897, TypeALU32_2op>, Requires<[HasV69]> { let hasNewValue = 1; let opNewValue = 0; let AsmVariantName = 
"NonParsable"; @@ -39123,7 +40142,7 @@ def dup_A2_combineii : HInst< (outs DoubleRegs:$Rdd32), (ins s32_0Imm:$Ii, s8_0Imm:$II), "$Rdd32 = combine(#$Ii,#$II)", -tc_388f9897, TypeALU32_2op>, Requires<[HasV68]> { +tc_388f9897, TypeALU32_2op>, Requires<[HasV69]> { let AsmVariantName = "NonParsable"; let isPseudo = 1; let isExtendable = 1; @@ -39136,7 +40155,7 @@ def dup_A2_sxtb : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = sxtb($Rs32)", -tc_9124c04f, TypeALU32_2op>, Requires<[HasV68]> { +tc_9124c04f, TypeALU32_2op>, Requires<[HasV69]> { let hasNewValue = 1; let opNewValue = 0; let AsmVariantName = "NonParsable"; @@ -39146,7 +40165,7 @@ def dup_A2_sxth : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = sxth($Rs32)", -tc_9124c04f, TypeALU32_2op>, Requires<[HasV68]> { +tc_9124c04f, TypeALU32_2op>, Requires<[HasV69]> { let hasNewValue = 1; let opNewValue = 0; let AsmVariantName = "NonParsable"; @@ -39156,7 +40175,7 @@ def dup_A2_tfr : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = $Rs32", -tc_9124c04f, TypeALU32_2op>, Requires<[HasV68]> { +tc_9124c04f, TypeALU32_2op>, Requires<[HasV69]> { let hasNewValue = 1; let opNewValue = 0; let AsmVariantName = "NonParsable"; @@ -39166,7 +40185,7 @@ def dup_A2_tfrsi : HInst< (outs IntRegs:$Rd32), (ins s32_0Imm:$Ii), "$Rd32 = #$Ii", -tc_9124c04f, TypeALU32_2op>, Requires<[HasV68]> { +tc_9124c04f, TypeALU32_2op>, Requires<[HasV69]> { let hasNewValue = 1; let opNewValue = 0; let AsmVariantName = "NonParsable"; @@ -39181,7 +40200,7 @@ def dup_A2_zxtb : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = zxtb($Rs32)", -PSEUDO, TypeMAPPING>, Requires<[HasV68]> { +PSEUDO, TypeMAPPING>, Requires<[HasV69]> { let hasNewValue = 1; let opNewValue = 0; let AsmVariantName = "NonParsable"; @@ -39191,7 +40210,7 @@ def dup_A2_zxth : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = zxth($Rs32)", -tc_9124c04f, TypeALU32_2op>, Requires<[HasV68]> { +tc_9124c04f, TypeALU32_2op>, Requires<[HasV69]> { let 
hasNewValue = 1; let opNewValue = 0; let AsmVariantName = "NonParsable"; @@ -39201,7 +40220,7 @@ def dup_A4_combineii : HInst< (outs DoubleRegs:$Rdd32), (ins s8_0Imm:$Ii, u32_0Imm:$II), "$Rdd32 = combine(#$Ii,#$II)", -tc_388f9897, TypeALU32_2op>, Requires<[HasV68]> { +tc_388f9897, TypeALU32_2op>, Requires<[HasV69]> { let AsmVariantName = "NonParsable"; let isPseudo = 1; let isExtendable = 1; @@ -39214,7 +40233,7 @@ def dup_A4_combineir : HInst< (outs DoubleRegs:$Rdd32), (ins s32_0Imm:$Ii, IntRegs:$Rs32), "$Rdd32 = combine(#$Ii,$Rs32)", -tc_388f9897, TypeALU32_2op>, Requires<[HasV68]> { +tc_388f9897, TypeALU32_2op>, Requires<[HasV69]> { let AsmVariantName = "NonParsable"; let isPseudo = 1; let isExtendable = 1; @@ -39227,7 +40246,7 @@ def dup_A4_combineri : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, s32_0Imm:$Ii), "$Rdd32 = combine($Rs32,#$Ii)", -tc_388f9897, TypeALU32_2op>, Requires<[HasV68]> { +tc_388f9897, TypeALU32_2op>, Requires<[HasV69]> { let AsmVariantName = "NonParsable"; let isPseudo = 1; let isExtendable = 1; @@ -39240,7 +40259,7 @@ def dup_C2_cmoveif : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, s32_0Imm:$Ii), "if (!$Pu4) $Rd32 = #$Ii", -tc_388f9897, TypeALU32_2op>, Requires<[HasV68]> { +tc_388f9897, TypeALU32_2op>, Requires<[HasV69]> { let isPredicated = 1; let isPredicatedFalse = 1; let hasNewValue = 1; @@ -39257,7 +40276,7 @@ def dup_C2_cmoveit : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, s32_0Imm:$Ii), "if ($Pu4) $Rd32 = #$Ii", -tc_388f9897, TypeALU32_2op>, Requires<[HasV68]> { +tc_388f9897, TypeALU32_2op>, Requires<[HasV69]> { let isPredicated = 1; let hasNewValue = 1; let opNewValue = 0; @@ -39273,7 +40292,7 @@ def dup_C2_cmovenewif : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, s32_0Imm:$Ii), "if (!$Pu4.new) $Rd32 = #$Ii", -tc_4ac61d92, TypeALU32_2op>, Requires<[HasV68]> { +tc_4ac61d92, TypeALU32_2op>, Requires<[HasV69]> { let isPredicated = 1; let isPredicatedFalse = 1; let hasNewValue = 1; @@ -39291,7 +40310,7 @@ def 
dup_C2_cmovenewit : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, s32_0Imm:$Ii), "if ($Pu4.new) $Rd32 = #$Ii", -tc_4ac61d92, TypeALU32_2op>, Requires<[HasV68]> { +tc_4ac61d92, TypeALU32_2op>, Requires<[HasV69]> { let isPredicated = 1; let hasNewValue = 1; let opNewValue = 0; @@ -39308,7 +40327,7 @@ def dup_C2_cmpeqi : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, s32_0Imm:$Ii), "$Pd4 = cmp.eq($Rs32,#$Ii)", -tc_388f9897, TypeALU32_2op>, Requires<[HasV68]> { +tc_388f9897, TypeALU32_2op>, Requires<[HasV69]> { let AsmVariantName = "NonParsable"; let isPseudo = 1; let isExtendable = 1; @@ -39321,7 +40340,7 @@ def dup_L2_deallocframe : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32), "$Rdd32 = deallocframe($Rs32):raw", -tc_aee6250c, TypeLD>, Requires<[HasV68]> { +tc_aee6250c, TypeLD>, Requires<[HasV69]> { let accessSize = DoubleWordAccess; let AsmVariantName = "NonParsable"; let mayLoad = 1; @@ -39333,7 +40352,7 @@ def dup_L2_loadrb_io : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, s32_0Imm:$Ii), "$Rd32 = memb($Rs32+#$Ii)", -tc_eed07714, TypeLD>, Requires<[HasV68]> { +tc_eed07714, TypeLD>, Requires<[HasV69]> { let hasNewValue = 1; let opNewValue = 0; let addrMode = BaseImmOffset; @@ -39351,7 +40370,7 @@ def dup_L2_loadrd_io : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, s29_3Imm:$Ii), "$Rdd32 = memd($Rs32+#$Ii)", -tc_eed07714, TypeLD>, Requires<[HasV68]> { +tc_eed07714, TypeLD>, Requires<[HasV69]> { let addrMode = BaseImmOffset; let accessSize = DoubleWordAccess; let AsmVariantName = "NonParsable"; @@ -39367,7 +40386,7 @@ def dup_L2_loadrh_io : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, s31_1Imm:$Ii), "$Rd32 = memh($Rs32+#$Ii)", -tc_eed07714, TypeLD>, Requires<[HasV68]> { +tc_eed07714, TypeLD>, Requires<[HasV69]> { let hasNewValue = 1; let opNewValue = 0; let addrMode = BaseImmOffset; @@ -39385,7 +40404,7 @@ def dup_L2_loadri_io : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, s30_2Imm:$Ii), "$Rd32 = memw($Rs32+#$Ii)", -tc_eed07714, 
TypeLD>, Requires<[HasV68]> { +tc_eed07714, TypeLD>, Requires<[HasV69]> { let hasNewValue = 1; let opNewValue = 0; let addrMode = BaseImmOffset; @@ -39403,7 +40422,7 @@ def dup_L2_loadrub_io : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, s32_0Imm:$Ii), "$Rd32 = memub($Rs32+#$Ii)", -tc_eed07714, TypeLD>, Requires<[HasV68]> { +tc_eed07714, TypeLD>, Requires<[HasV69]> { let hasNewValue = 1; let opNewValue = 0; let addrMode = BaseImmOffset; @@ -39421,7 +40440,7 @@ def dup_L2_loadruh_io : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, s31_1Imm:$Ii), "$Rd32 = memuh($Rs32+#$Ii)", -tc_eed07714, TypeLD>, Requires<[HasV68]> { +tc_eed07714, TypeLD>, Requires<[HasV69]> { let hasNewValue = 1; let opNewValue = 0; let addrMode = BaseImmOffset; @@ -39439,7 +40458,7 @@ def dup_S2_allocframe : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, u11_3Imm:$Ii), "allocframe($Rx32,#$Ii):raw", -tc_74a42bda, TypeST>, Requires<[HasV68]> { +tc_74a42bda, TypeST>, Requires<[HasV69]> { let hasNewValue = 1; let opNewValue = 0; let addrMode = BaseImmOffset; @@ -39455,7 +40474,7 @@ def dup_S2_storerb_io : HInst< (outs), (ins IntRegs:$Rs32, s32_0Imm:$Ii, IntRegs:$Rt32), "memb($Rs32+#$Ii) = $Rt32", -tc_a9edeffa, TypeST>, Requires<[HasV68]> { +tc_a9edeffa, TypeST>, Requires<[HasV69]> { let addrMode = BaseImmOffset; let accessSize = ByteAccess; let AsmVariantName = "NonParsable"; @@ -39471,7 +40490,7 @@ def dup_S2_storerd_io : HInst< (outs), (ins IntRegs:$Rs32, s29_3Imm:$Ii, DoubleRegs:$Rtt32), "memd($Rs32+#$Ii) = $Rtt32", -tc_a9edeffa, TypeST>, Requires<[HasV68]> { +tc_a9edeffa, TypeST>, Requires<[HasV69]> { let addrMode = BaseImmOffset; let accessSize = DoubleWordAccess; let AsmVariantName = "NonParsable"; @@ -39487,7 +40506,7 @@ def dup_S2_storerh_io : HInst< (outs), (ins IntRegs:$Rs32, s31_1Imm:$Ii, IntRegs:$Rt32), "memh($Rs32+#$Ii) = $Rt32", -tc_a9edeffa, TypeST>, Requires<[HasV68]> { +tc_a9edeffa, TypeST>, Requires<[HasV69]> { let addrMode = BaseImmOffset; let accessSize = HalfWordAccess; 
let AsmVariantName = "NonParsable"; @@ -39503,7 +40522,7 @@ def dup_S2_storeri_io : HInst< (outs), (ins IntRegs:$Rs32, s30_2Imm:$Ii, IntRegs:$Rt32), "memw($Rs32+#$Ii) = $Rt32", -tc_a9edeffa, TypeST>, Requires<[HasV68]> { +tc_a9edeffa, TypeST>, Requires<[HasV69]> { let addrMode = BaseImmOffset; let accessSize = WordAccess; let AsmVariantName = "NonParsable"; @@ -39519,7 +40538,7 @@ def dup_S4_storeirb_io : HInst< (outs), (ins IntRegs:$Rs32, u6_0Imm:$Ii, s32_0Imm:$II), "memb($Rs32+#$Ii) = #$II", -tc_838c4d7a, TypeV4LDST>, Requires<[HasV68]> { +tc_838c4d7a, TypeV4LDST>, Requires<[HasV69]> { let addrMode = BaseImmOffset; let accessSize = ByteAccess; let AsmVariantName = "NonParsable"; @@ -39535,7 +40554,7 @@ def dup_S4_storeiri_io : HInst< (outs), (ins IntRegs:$Rs32, u6_2Imm:$Ii, s32_0Imm:$II), "memw($Rs32+#$Ii) = #$II", -tc_838c4d7a, TypeV4LDST>, Requires<[HasV68]> { +tc_838c4d7a, TypeV4LDST>, Requires<[HasV69]> { let addrMode = BaseImmOffset; let accessSize = WordAccess; let AsmVariantName = "NonParsable"; diff --git a/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td b/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td index e5c78d122c9e..64bc5091d1d1 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td +++ b/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td @@ -1661,8 +1661,6 @@ def: Pat<(int_hexagon_Y2_dccleana IntRegs:$src1), (Y2_dccleana IntRegs:$src1)>, Requires<[HasV5]>; def: Pat<(int_hexagon_Y2_dccleaninva IntRegs:$src1), (Y2_dccleaninva IntRegs:$src1)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_Y2_dcfetch IntRegs:$src1), - (Y2_dcfetch IntRegs:$src1)>, Requires<[HasV5]>; def: Pat<(int_hexagon_Y2_dcinva IntRegs:$src1), (Y2_dcinva IntRegs:$src1)>, Requires<[HasV5]>; def: Pat<(int_hexagon_Y2_dczeroa IntRegs:$src1), @@ -3380,3 +3378,294 @@ def: Pat<(int_hexagon_V6_v6mpyvubs10_vxx HvxWR:$src1, HvxWR:$src2, HvxWR:$src3, (V6_v6mpyvubs10_vxx HvxWR:$src1, HvxWR:$src2, HvxWR:$src3, u2_0ImmPred_timm:$src4)>, Requires<[HasV68, UseHVX64B]>; def: 
Pat<(int_hexagon_V6_v6mpyvubs10_vxx_128B HvxWR:$src1, HvxWR:$src2, HvxWR:$src3, u2_0ImmPred_timm:$src4), (V6_v6mpyvubs10_vxx HvxWR:$src1, HvxWR:$src2, HvxWR:$src3, u2_0ImmPred_timm:$src4)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vabs_hf HvxVR:$src1), + (V6_vabs_hf HvxVR:$src1)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vabs_hf_128B HvxVR:$src1), + (V6_vabs_hf HvxVR:$src1)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vabs_sf HvxVR:$src1), + (V6_vabs_sf HvxVR:$src1)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vabs_sf_128B HvxVR:$src1), + (V6_vabs_sf HvxVR:$src1)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vadd_hf HvxVR:$src1, HvxVR:$src2), + (V6_vadd_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vadd_hf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vadd_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vadd_hf_hf HvxVR:$src1, HvxVR:$src2), + (V6_vadd_hf_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vadd_hf_hf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vadd_hf_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vadd_qf16 HvxVR:$src1, HvxVR:$src2), + (V6_vadd_qf16 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vadd_qf16_128B HvxVR:$src1, HvxVR:$src2), + (V6_vadd_qf16 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vadd_qf16_mix HvxVR:$src1, HvxVR:$src2), + (V6_vadd_qf16_mix HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vadd_qf16_mix_128B HvxVR:$src1, HvxVR:$src2), + (V6_vadd_qf16_mix HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vadd_qf32 HvxVR:$src1, HvxVR:$src2), + (V6_vadd_qf32 HvxVR:$src1, HvxVR:$src2)>, 
Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vadd_qf32_128B HvxVR:$src1, HvxVR:$src2), + (V6_vadd_qf32 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vadd_qf32_mix HvxVR:$src1, HvxVR:$src2), + (V6_vadd_qf32_mix HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vadd_qf32_mix_128B HvxVR:$src1, HvxVR:$src2), + (V6_vadd_qf32_mix HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vadd_sf HvxVR:$src1, HvxVR:$src2), + (V6_vadd_sf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vadd_sf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vadd_sf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vadd_sf_hf HvxVR:$src1, HvxVR:$src2), + (V6_vadd_sf_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vadd_sf_hf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vadd_sf_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vadd_sf_sf HvxVR:$src1, HvxVR:$src2), + (V6_vadd_sf_sf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vadd_sf_sf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vadd_sf_sf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vassign_fp HvxVR:$src1), + (V6_vassign_fp HvxVR:$src1)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vassign_fp_128B HvxVR:$src1), + (V6_vassign_fp HvxVR:$src1)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vconv_hf_qf16 HvxVR:$src1), + (V6_vconv_hf_qf16 HvxVR:$src1)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_hf_qf16_128B HvxVR:$src1), + (V6_vconv_hf_qf16 HvxVR:$src1)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_hf_qf32 HvxWR:$src1), + (V6_vconv_hf_qf32 HvxWR:$src1)>, 
Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_hf_qf32_128B HvxWR:$src1), + (V6_vconv_hf_qf32 HvxWR:$src1)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_sf_qf32 HvxVR:$src1), + (V6_vconv_sf_qf32 HvxVR:$src1)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_sf_qf32_128B HvxVR:$src1), + (V6_vconv_sf_qf32 HvxVR:$src1)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vcvt_b_hf HvxVR:$src1, HvxVR:$src2), + (V6_vcvt_b_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vcvt_b_hf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vcvt_b_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vcvt_h_hf HvxVR:$src1), + (V6_vcvt_h_hf HvxVR:$src1)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vcvt_h_hf_128B HvxVR:$src1), + (V6_vcvt_h_hf HvxVR:$src1)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vcvt_hf_b HvxVR:$src1), + (V6_vcvt_hf_b HvxVR:$src1)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vcvt_hf_b_128B HvxVR:$src1), + (V6_vcvt_hf_b HvxVR:$src1)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vcvt_hf_h HvxVR:$src1), + (V6_vcvt_hf_h HvxVR:$src1)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vcvt_hf_h_128B HvxVR:$src1), + (V6_vcvt_hf_h HvxVR:$src1)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vcvt_hf_sf HvxVR:$src1, HvxVR:$src2), + (V6_vcvt_hf_sf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vcvt_hf_sf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vcvt_hf_sf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vcvt_hf_ub HvxVR:$src1), + (V6_vcvt_hf_ub HvxVR:$src1)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vcvt_hf_ub_128B HvxVR:$src1), + (V6_vcvt_hf_ub HvxVR:$src1)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vcvt_hf_uh 
HvxVR:$src1), + (V6_vcvt_hf_uh HvxVR:$src1)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vcvt_hf_uh_128B HvxVR:$src1), + (V6_vcvt_hf_uh HvxVR:$src1)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vcvt_sf_hf HvxVR:$src1), + (V6_vcvt_sf_hf HvxVR:$src1)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vcvt_sf_hf_128B HvxVR:$src1), + (V6_vcvt_sf_hf HvxVR:$src1)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vcvt_ub_hf HvxVR:$src1, HvxVR:$src2), + (V6_vcvt_ub_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vcvt_ub_hf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vcvt_ub_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vcvt_uh_hf HvxVR:$src1), + (V6_vcvt_uh_hf HvxVR:$src1)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vcvt_uh_hf_128B HvxVR:$src1), + (V6_vcvt_uh_hf HvxVR:$src1)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vdmpy_sf_hf HvxVR:$src1, HvxVR:$src2), + (V6_vdmpy_sf_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vdmpy_sf_hf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vdmpy_sf_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vdmpy_sf_hf_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vdmpy_sf_hf_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vdmpy_sf_hf_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vdmpy_sf_hf_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vfmax_hf HvxVR:$src1, HvxVR:$src2), + (V6_vfmax_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vfmax_hf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vfmax_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vfmax_sf HvxVR:$src1, HvxVR:$src2), + (V6_vfmax_sf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, 
UseHVX64B]>; +def: Pat<(int_hexagon_V6_vfmax_sf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vfmax_sf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vfmin_hf HvxVR:$src1, HvxVR:$src2), + (V6_vfmin_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vfmin_hf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vfmin_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vfmin_sf HvxVR:$src1, HvxVR:$src2), + (V6_vfmin_sf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vfmin_sf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vfmin_sf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vfneg_hf HvxVR:$src1), + (V6_vfneg_hf HvxVR:$src1)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vfneg_hf_128B HvxVR:$src1), + (V6_vfneg_hf HvxVR:$src1)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vfneg_sf HvxVR:$src1), + (V6_vfneg_sf HvxVR:$src1)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vfneg_sf_128B HvxVR:$src1), + (V6_vfneg_sf HvxVR:$src1)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vgthf HvxVR:$src1, HvxVR:$src2), + (V6_vgthf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vgthf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vgthf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vgthf_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgthf_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vgthf_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgthf_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vgthf_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgthf_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; 
+def: Pat<(int_hexagon_V6_vgthf_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgthf_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vgthf_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgthf_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vgthf_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgthf_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vgtsf HvxVR:$src1, HvxVR:$src2), + (V6_vgtsf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vgtsf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vgtsf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vgtsf_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgtsf_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vgtsf_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgtsf_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vgtsf_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgtsf_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vgtsf_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgtsf_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vgtsf_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgtsf_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vgtsf_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgtsf_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmax_hf HvxVR:$src1, HvxVR:$src2), + (V6_vmax_hf 
HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmax_hf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmax_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmax_sf HvxVR:$src1, HvxVR:$src2), + (V6_vmax_sf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmax_sf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmax_sf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmin_hf HvxVR:$src1, HvxVR:$src2), + (V6_vmin_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmin_hf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmin_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmin_sf HvxVR:$src1, HvxVR:$src2), + (V6_vmin_sf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmin_sf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmin_sf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmpy_hf_hf HvxVR:$src1, HvxVR:$src2), + (V6_vmpy_hf_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpy_hf_hf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmpy_hf_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpy_hf_hf_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vmpy_hf_hf_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpy_hf_hf_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vmpy_hf_hf_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpy_qf16 HvxVR:$src1, HvxVR:$src2), + (V6_vmpy_qf16 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmpy_qf16_128B HvxVR:$src1, HvxVR:$src2), + 
(V6_vmpy_qf16 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmpy_qf16_hf HvxVR:$src1, HvxVR:$src2), + (V6_vmpy_qf16_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmpy_qf16_hf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmpy_qf16_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmpy_qf16_mix_hf HvxVR:$src1, HvxVR:$src2), + (V6_vmpy_qf16_mix_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmpy_qf16_mix_hf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmpy_qf16_mix_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmpy_qf32 HvxVR:$src1, HvxVR:$src2), + (V6_vmpy_qf32 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmpy_qf32_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmpy_qf32 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmpy_qf32_hf HvxVR:$src1, HvxVR:$src2), + (V6_vmpy_qf32_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmpy_qf32_hf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmpy_qf32_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmpy_qf32_mix_hf HvxVR:$src1, HvxVR:$src2), + (V6_vmpy_qf32_mix_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmpy_qf32_mix_hf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmpy_qf32_mix_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmpy_qf32_qf16 HvxVR:$src1, HvxVR:$src2), + (V6_vmpy_qf32_qf16 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmpy_qf32_qf16_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmpy_qf32_qf16 HvxVR:$src1, 
HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmpy_qf32_sf HvxVR:$src1, HvxVR:$src2), + (V6_vmpy_qf32_sf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmpy_qf32_sf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmpy_qf32_sf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmpy_sf_hf HvxVR:$src1, HvxVR:$src2), + (V6_vmpy_sf_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpy_sf_hf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmpy_sf_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpy_sf_hf_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vmpy_sf_hf_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpy_sf_hf_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vmpy_sf_hf_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpy_sf_sf HvxVR:$src1, HvxVR:$src2), + (V6_vmpy_sf_sf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpy_sf_sf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmpy_sf_sf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vsub_hf HvxVR:$src1, HvxVR:$src2), + (V6_vsub_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vsub_hf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vsub_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vsub_hf_hf HvxVR:$src1, HvxVR:$src2), + (V6_vsub_hf_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vsub_hf_hf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vsub_hf_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vsub_qf16 HvxVR:$src1, HvxVR:$src2), + (V6_vsub_qf16 HvxVR:$src1, 
HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vsub_qf16_128B HvxVR:$src1, HvxVR:$src2), + (V6_vsub_qf16 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vsub_qf16_mix HvxVR:$src1, HvxVR:$src2), + (V6_vsub_qf16_mix HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vsub_qf16_mix_128B HvxVR:$src1, HvxVR:$src2), + (V6_vsub_qf16_mix HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vsub_qf32 HvxVR:$src1, HvxVR:$src2), + (V6_vsub_qf32 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vsub_qf32_128B HvxVR:$src1, HvxVR:$src2), + (V6_vsub_qf32 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vsub_qf32_mix HvxVR:$src1, HvxVR:$src2), + (V6_vsub_qf32_mix HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vsub_qf32_mix_128B HvxVR:$src1, HvxVR:$src2), + (V6_vsub_qf32_mix HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vsub_sf HvxVR:$src1, HvxVR:$src2), + (V6_vsub_sf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vsub_sf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vsub_sf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vsub_sf_hf HvxVR:$src1, HvxVR:$src2), + (V6_vsub_sf_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vsub_sf_hf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vsub_sf_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vsub_sf_sf HvxVR:$src1, HvxVR:$src2), + (V6_vsub_sf_sf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vsub_sf_sf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vsub_sf_sf 
HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B]>; + +// V69 HVX Instructions. + +def: Pat<(int_hexagon_V6_vasrvuhubrndsat HvxWR:$src1, HvxVR:$src2), + (V6_vasrvuhubrndsat HvxWR:$src1, HvxVR:$src2)>, Requires<[HasV69, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vasrvuhubrndsat_128B HvxWR:$src1, HvxVR:$src2), + (V6_vasrvuhubrndsat HvxWR:$src1, HvxVR:$src2)>, Requires<[HasV69, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vasrvuhubsat HvxWR:$src1, HvxVR:$src2), + (V6_vasrvuhubsat HvxWR:$src1, HvxVR:$src2)>, Requires<[HasV69, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vasrvuhubsat_128B HvxWR:$src1, HvxVR:$src2), + (V6_vasrvuhubsat HvxWR:$src1, HvxVR:$src2)>, Requires<[HasV69, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vasrvwuhrndsat HvxWR:$src1, HvxVR:$src2), + (V6_vasrvwuhrndsat HvxWR:$src1, HvxVR:$src2)>, Requires<[HasV69, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vasrvwuhrndsat_128B HvxWR:$src1, HvxVR:$src2), + (V6_vasrvwuhrndsat HvxWR:$src1, HvxVR:$src2)>, Requires<[HasV69, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vasrvwuhsat HvxWR:$src1, HvxVR:$src2), + (V6_vasrvwuhsat HvxWR:$src1, HvxVR:$src2)>, Requires<[HasV69, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vasrvwuhsat_128B HvxWR:$src1, HvxVR:$src2), + (V6_vasrvwuhsat HvxWR:$src1, HvxVR:$src2)>, Requires<[HasV69, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyuhvs HvxVR:$src1, HvxVR:$src2), + (V6_vmpyuhvs HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV69, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyuhvs_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmpyuhvs HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV69, UseHVX128B]>; diff --git a/llvm/lib/Target/Hexagon/HexagonDepMappings.td b/llvm/lib/Target/Hexagon/HexagonDepMappings.td index 919cb996ad15..2f7b76b893a9 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepMappings.td +++ b/llvm/lib/Target/Hexagon/HexagonDepMappings.td @@ -174,7 +174,6 @@ def V6_ldcpnt0Alias : InstAlias<"if ($Pv4) $Vd32.cur = vmem($Rt32):nt", (V6_vL32 def V6_ldnp0Alias : InstAlias<"if (!$Pv4) $Vd32 = vmem($Rt32)", (V6_vL32b_npred_pi 
HvxVR:$Vd32, IntRegs:$Rt32, PredRegs:$Pv4, 0)>, Requires<[UseHVX]>; def V6_ldnpnt0Alias : InstAlias<"if (!$Pv4) $Vd32 = vmem($Rt32):nt", (V6_vL32b_nt_npred_pi HvxVR:$Vd32, IntRegs:$Rt32, PredRegs:$Pv4, 0)>, Requires<[UseHVX]>; def V6_ldnt0Alias : InstAlias<"$Vd32 = vmem($Rt32):nt", (V6_vL32b_nt_ai HvxVR:$Vd32, IntRegs:$Rt32, 0)>, Requires<[UseHVX]>; -def V6_ldntnt0Alias : InstAlias<"$Vd32 = vmem($Rt32):nt", (V6_vL32b_nt_ai HvxVR:$Vd32, IntRegs:$Rt32, 0)>; def V6_ldp0Alias : InstAlias<"if ($Pv4) $Vd32 = vmem($Rt32)", (V6_vL32b_pred_ai HvxVR:$Vd32, PredRegs:$Pv4, IntRegs:$Rt32, 0)>, Requires<[UseHVX]>; def V6_ldpnt0Alias : InstAlias<"if ($Pv4) $Vd32 = vmem($Rt32):nt", (V6_vL32b_nt_pred_ai HvxVR:$Vd32, PredRegs:$Pv4, IntRegs:$Rt32, 0)>, Requires<[UseHVX]>; def V6_ldtnp0Alias : InstAlias<"if (!$Pv4) $Vd32.tmp = vmem($Rt32)", (V6_vL32b_npred_ai HvxVR:$Vd32, PredRegs:$Pv4, IntRegs:$Rt32, 0)>, Requires<[UseHVX]>; diff --git a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp index 46c1fbc6eeb2..85230cac9d7c 100644 --- a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp +++ b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp @@ -1445,8 +1445,8 @@ bool HexagonGenInsert::removeDeadCode(MachineDomTreeNode *N) { MachineBasicBlock *B = N->getBlock(); std::vector<MachineInstr*> Instrs; - for (auto I = B->rbegin(), E = B->rend(); I != E; ++I) - Instrs.push_back(&*I); + for (MachineInstr &MI : llvm::reverse(*B)) + Instrs.push_back(&MI); for (MachineInstr *MI : Instrs) { unsigned Opc = MI->getOpcode(); diff --git a/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.cpp b/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.cpp index e45126bec6ef..44679d429de5 100644 --- a/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.cpp +++ b/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.cpp @@ -60,7 +60,7 @@ HexagonHazardRecognizer::getHazardType(SUnit *SU, int stalls) { RetVal = NoHazard; LLVM_DEBUG(dbgs() << "*** Try .new version? 
" << (RetVal == NoHazard) << "\n"); - MF->DeleteMachineInstr(NewMI); + MF->deleteMachineInstr(NewMI); } return RetVal; } @@ -129,7 +129,7 @@ void HexagonHazardRecognizer::EmitInstruction(SUnit *SU) { MI->getDebugLoc()); assert(Resources->canReserveResources(*NewMI)); Resources->reserveResources(*NewMI); - MF->DeleteMachineInstr(NewMI); + MF->deleteMachineInstr(NewMI); } else Resources->reserveResources(*MI); diff --git a/llvm/lib/Target/Hexagon/HexagonInstrFormats.td b/llvm/lib/Target/Hexagon/HexagonInstrFormats.td index 45adaf50774f..898ef51bd48f 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrFormats.td +++ b/llvm/lib/Target/Hexagon/HexagonInstrFormats.td @@ -146,9 +146,6 @@ class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern, bits<1> isFP = 0; let TSFlags {50} = isFP; // Floating-point. - bits<1> isSomeOK = 0; - let TSFlags {51} = isSomeOK; // Relax some grouping constraints. - bits<1> hasNewValue2 = 0; let TSFlags{52} = hasNewValue2; // Second New-value producer insn. 
bits<3> opNewValue2 = 0; @@ -160,8 +157,8 @@ class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern, bits<1> prefersSlot3 = 0; let TSFlags{57} = prefersSlot3; // Complex XU - bits<1> hasTmpDst = 0; - let TSFlags{60} = hasTmpDst; // v65 : 'fake" register VTMP is set + bits<1> hasHvxTmp = 0; + let TSFlags{60} = hasHvxTmp; // vector register vX.tmp false-write bit CVINew = 0; let TSFlags{62} = CVINew; diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp index b6984d40f78e..931b0c0e0090 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -40,6 +40,7 @@ #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DebugLoc.h" #include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCInstBuilder.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCInstrItineraries.h" #include "llvm/MC/MCRegisterInfo.h" @@ -4655,3 +4656,11 @@ short HexagonInstrInfo::changeAddrMode_rr_ur(short Opc) const { short HexagonInstrInfo::changeAddrMode_ur_rr(short Opc) const { return Opc >= 0 ? 
Hexagon::changeAddrMode_ur_rr(Opc) : Opc; } + +MCInst HexagonInstrInfo::getNop() const { + static const MCInst Nop = MCInstBuilder(Hexagon::A2_nop); + + return MCInstBuilder(Hexagon::BUNDLE) + .addImm(0) + .addInst(&Nop); +} diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h index eaaf9f7046c7..830f04d9eac3 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h @@ -524,6 +524,8 @@ public: short changeAddrMode_ur_rr(const MachineInstr &MI) const { return changeAddrMode_ur_rr(MI.getOpcode()); } + + MCInst getNop() const override; }; } // end namespace llvm diff --git a/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp b/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp index 987c4a5fa6c4..d5c34ac467c3 100644 --- a/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp +++ b/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp @@ -104,6 +104,19 @@ void llvm::HexagonLowerToMC(const MCInstrInfo &MCII, const MachineInstr *MI, HexagonMCInstrInfo::setOuterLoop(MCB); return; } + if (MI->getOpcode() == Hexagon::PATCHABLE_FUNCTION_ENTER) { + AP.EmitSled(*MI, HexagonAsmPrinter::SledKind::FUNCTION_ENTER); + return; + } + if (MI->getOpcode() == Hexagon::PATCHABLE_FUNCTION_EXIT) { + AP.EmitSled(*MI, HexagonAsmPrinter::SledKind::FUNCTION_EXIT); + return; + } + if (MI->getOpcode() == Hexagon::PATCHABLE_TAIL_CALL) { + AP.EmitSled(*MI, HexagonAsmPrinter::SledKind::TAIL_CALL); + return; + } + MCInst *MCI = AP.OutContext.createMCInst(); MCI->setOpcode(MI->getOpcode()); assert(MCI->getOpcode() == static_cast<unsigned>(MI->getOpcode()) && diff --git a/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp index 60d58f421bbb..53e82ac66b85 100644 --- a/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp +++ b/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp @@ -14,676 +14,44 @@ #include "HexagonMachineScheduler.h" #include "HexagonInstrInfo.h" #include 
"HexagonSubtarget.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/DFAPacketizer.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineLoopInfo.h" -#include "llvm/CodeGen/RegisterClassInfo.h" -#include "llvm/CodeGen/RegisterPressure.h" +#include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/ScheduleDAG.h" -#include "llvm/CodeGen/ScheduleHazardRecognizer.h" -#include "llvm/CodeGen/TargetInstrInfo.h" -#include "llvm/CodeGen/TargetOpcodes.h" -#include "llvm/CodeGen/TargetRegisterInfo.h" -#include "llvm/CodeGen/TargetSchedule.h" -#include "llvm/CodeGen/TargetSubtargetInfo.h" -#include "llvm/IR/Function.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include <algorithm> -#include <cassert> -#include <iomanip> -#include <limits> -#include <memory> -#include <sstream> +#include "llvm/CodeGen/VLIWMachineScheduler.h" using namespace llvm; #define DEBUG_TYPE "machine-scheduler" -static cl::opt<bool> IgnoreBBRegPressure("ignore-bb-reg-pressure", - cl::Hidden, cl::ZeroOrMore, cl::init(false)); - -static cl::opt<bool> UseNewerCandidate("use-newer-candidate", - cl::Hidden, cl::ZeroOrMore, cl::init(true)); - -static cl::opt<unsigned> SchedDebugVerboseLevel("misched-verbose-level", - cl::Hidden, cl::ZeroOrMore, cl::init(1)); - -// Check if the scheduler should penalize instructions that are available to -// early due to a zero-latency dependence. -static cl::opt<bool> CheckEarlyAvail("check-early-avail", cl::Hidden, - cl::ZeroOrMore, cl::init(true)); - -// This value is used to determine if a register class is a high pressure set. -// We compute the maximum number of registers needed and divided by the total -// available. Then, we compare the result to this value. 
-static cl::opt<float> RPThreshold("hexagon-reg-pressure", cl::Hidden, - cl::init(0.75f), cl::desc("High register pressure threhold.")); - /// Return true if there is a dependence between SUd and SUu. -static bool hasDependence(const SUnit *SUd, const SUnit *SUu, - const HexagonInstrInfo &QII) { - if (SUd->Succs.size() == 0) - return false; +bool HexagonVLIWResourceModel::hasDependence(const SUnit *SUd, + const SUnit *SUu) { + const auto *QII = static_cast<const HexagonInstrInfo *>(TII); // Enable .cur formation. - if (QII.mayBeCurLoad(*SUd->getInstr())) + if (QII->mayBeCurLoad(*SUd->getInstr())) return false; - if (QII.canExecuteInBundle(*SUd->getInstr(), *SUu->getInstr())) - return false; - - for (const auto &S : SUd->Succs) { - // Since we do not add pseudos to packets, might as well - // ignore order dependencies. - if (S.isCtrl()) - continue; - - if (S.getSUnit() == SUu && S.getLatency() > 0) - return true; - } - return false; -} - -/// Check if scheduling of this SU is possible -/// in the current packet. -/// It is _not_ precise (statefull), it is more like -/// another heuristic. Many corner cases are figured -/// empirically. -bool VLIWResourceModel::isResourceAvailable(SUnit *SU, bool IsTop) { - if (!SU || !SU->getInstr()) + if (QII->canExecuteInBundle(*SUd->getInstr(), *SUu->getInstr())) return false; - // First see if the pipeline could receive this instruction - // in the current cycle. 
- switch (SU->getInstr()->getOpcode()) { - default: - if (!ResourcesModel->canReserveResources(*SU->getInstr())) - return false; - break; - case TargetOpcode::EXTRACT_SUBREG: - case TargetOpcode::INSERT_SUBREG: - case TargetOpcode::SUBREG_TO_REG: - case TargetOpcode::REG_SEQUENCE: - case TargetOpcode::IMPLICIT_DEF: - case TargetOpcode::COPY: - case TargetOpcode::INLINEASM: - case TargetOpcode::INLINEASM_BR: - break; - } - - MachineBasicBlock *MBB = SU->getInstr()->getParent(); - auto &QST = MBB->getParent()->getSubtarget<HexagonSubtarget>(); - const auto &QII = *QST.getInstrInfo(); - - // Now see if there are no other dependencies to instructions already - // in the packet. - if (IsTop) { - for (unsigned i = 0, e = Packet.size(); i != e; ++i) - if (hasDependence(Packet[i], SU, QII)) - return false; - } else { - for (unsigned i = 0, e = Packet.size(); i != e; ++i) - if (hasDependence(SU, Packet[i], QII)) - return false; - } - return true; -} - -/// Keep track of available resources. -bool VLIWResourceModel::reserveResources(SUnit *SU, bool IsTop) { - bool startNewCycle = false; - // Artificially reset state. - if (!SU) { - ResourcesModel->clearResources(); - Packet.clear(); - TotalPackets++; - return false; - } - // If this SU does not fit in the packet or the packet is now full - // start a new one. 
- if (!isResourceAvailable(SU, IsTop) || - Packet.size() >= SchedModel->getIssueWidth()) { - ResourcesModel->clearResources(); - Packet.clear(); - TotalPackets++; - startNewCycle = true; - } - - switch (SU->getInstr()->getOpcode()) { - default: - ResourcesModel->reserveResources(*SU->getInstr()); - break; - case TargetOpcode::EXTRACT_SUBREG: - case TargetOpcode::INSERT_SUBREG: - case TargetOpcode::SUBREG_TO_REG: - case TargetOpcode::REG_SEQUENCE: - case TargetOpcode::IMPLICIT_DEF: - case TargetOpcode::KILL: - case TargetOpcode::CFI_INSTRUCTION: - case TargetOpcode::EH_LABEL: - case TargetOpcode::COPY: - case TargetOpcode::INLINEASM: - case TargetOpcode::INLINEASM_BR: - break; - } - Packet.push_back(SU); - -#ifndef NDEBUG - LLVM_DEBUG(dbgs() << "Packet[" << TotalPackets << "]:\n"); - for (unsigned i = 0, e = Packet.size(); i != e; ++i) { - LLVM_DEBUG(dbgs() << "\t[" << i << "] SU("); - LLVM_DEBUG(dbgs() << Packet[i]->NodeNum << ")\t"); - LLVM_DEBUG(Packet[i]->getInstr()->dump()); - } -#endif - - return startNewCycle; + return VLIWResourceModel::hasDependence(SUd, SUu); } -/// schedule - Called back from MachineScheduler::runOnMachineFunction -/// after setting up the current scheduling region. [RegionBegin, RegionEnd) -/// only includes instructions that have DAG nodes, not scheduling boundaries. -void VLIWMachineScheduler::schedule() { - LLVM_DEBUG(dbgs() << "********** MI Converging Scheduling VLIW " - << printMBBReference(*BB) << " " << BB->getName() - << " in_func " << BB->getParent()->getName() - << " at loop depth " << MLI->getLoopDepth(BB) << " \n"); - - buildDAGWithRegPressure(); - - Topo.InitDAGTopologicalSorting(); - - // Postprocess the DAG to add platform-specific artificial dependencies. - postprocessDAG(); - - SmallVector<SUnit*, 8> TopRoots, BotRoots; - findRootsAndBiasEdges(TopRoots, BotRoots); - - // Initialize the strategy before modifying the DAG. 
- SchedImpl->initialize(this); - - LLVM_DEBUG(unsigned maxH = 0; - for (unsigned su = 0, e = SUnits.size(); su != e; - ++su) if (SUnits[su].getHeight() > maxH) maxH = - SUnits[su].getHeight(); - dbgs() << "Max Height " << maxH << "\n";); - LLVM_DEBUG(unsigned maxD = 0; - for (unsigned su = 0, e = SUnits.size(); su != e; - ++su) if (SUnits[su].getDepth() > maxD) maxD = - SUnits[su].getDepth(); - dbgs() << "Max Depth " << maxD << "\n";); - LLVM_DEBUG(dump()); - - initQueues(TopRoots, BotRoots); - - bool IsTopNode = false; - while (true) { - LLVM_DEBUG( - dbgs() << "** VLIWMachineScheduler::schedule picking next node\n"); - SUnit *SU = SchedImpl->pickNode(IsTopNode); - if (!SU) break; - - if (!checkSchedLimit()) - break; - - scheduleMI(SU, IsTopNode); - - // Notify the scheduling strategy after updating the DAG. - SchedImpl->schedNode(SU, IsTopNode); - - updateQueues(SU, IsTopNode); - } - assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone."); - - placeDebugValues(); - - LLVM_DEBUG({ - dbgs() << "*** Final schedule for " - << printMBBReference(*begin()->getParent()) << " ***\n"; - dumpSchedule(); - dbgs() << '\n'; - }); +VLIWResourceModel *HexagonConvergingVLIWScheduler::createVLIWResourceModel( + const TargetSubtargetInfo &STI, const TargetSchedModel *SchedModel) const { + return new HexagonVLIWResourceModel(STI, SchedModel); } -void ConvergingVLIWScheduler::initialize(ScheduleDAGMI *dag) { - DAG = static_cast<VLIWMachineScheduler*>(dag); - SchedModel = DAG->getSchedModel(); - - Top.init(DAG, SchedModel); - Bot.init(DAG, SchedModel); - - // Initialize the HazardRecognizers. If itineraries don't exist, are empty, or - // are disabled, then these HazardRecs will be disabled. 
- const InstrItineraryData *Itin = DAG->getSchedModel()->getInstrItineraries(); - const TargetSubtargetInfo &STI = DAG->MF.getSubtarget(); - const TargetInstrInfo *TII = STI.getInstrInfo(); - delete Top.HazardRec; - delete Bot.HazardRec; - Top.HazardRec = TII->CreateTargetMIHazardRecognizer(Itin, DAG); - Bot.HazardRec = TII->CreateTargetMIHazardRecognizer(Itin, DAG); - - delete Top.ResourceModel; - delete Bot.ResourceModel; - Top.ResourceModel = new VLIWResourceModel(STI, DAG->getSchedModel()); - Bot.ResourceModel = new VLIWResourceModel(STI, DAG->getSchedModel()); - - const std::vector<unsigned> &MaxPressure = - DAG->getRegPressure().MaxSetPressure; - HighPressureSets.assign(MaxPressure.size(), 0); - for (unsigned i = 0, e = MaxPressure.size(); i < e; ++i) { - unsigned Limit = DAG->getRegClassInfo()->getRegPressureSetLimit(i); - HighPressureSets[i] = - ((float) MaxPressure[i] > ((float) Limit * RPThreshold)); - } - - assert((!ForceTopDown || !ForceBottomUp) && - "-misched-topdown incompatible with -misched-bottomup"); -} - -void ConvergingVLIWScheduler::releaseTopNode(SUnit *SU) { - for (const SDep &PI : SU->Preds) { - unsigned PredReadyCycle = PI.getSUnit()->TopReadyCycle; - unsigned MinLatency = PI.getLatency(); -#ifndef NDEBUG - Top.MaxMinLatency = std::max(MinLatency, Top.MaxMinLatency); -#endif - if (SU->TopReadyCycle < PredReadyCycle + MinLatency) - SU->TopReadyCycle = PredReadyCycle + MinLatency; - } - - if (!SU->isScheduled) - Top.releaseNode(SU, SU->TopReadyCycle); -} - -void ConvergingVLIWScheduler::releaseBottomNode(SUnit *SU) { - assert(SU->getInstr() && "Scheduled SUnit must have instr"); - - for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); - I != E; ++I) { - unsigned SuccReadyCycle = I->getSUnit()->BotReadyCycle; - unsigned MinLatency = I->getLatency(); -#ifndef NDEBUG - Bot.MaxMinLatency = std::max(MinLatency, Bot.MaxMinLatency); -#endif - if (SU->BotReadyCycle < SuccReadyCycle + MinLatency) - SU->BotReadyCycle = SuccReadyCycle 
+ MinLatency; - } - - if (!SU->isScheduled) - Bot.releaseNode(SU, SU->BotReadyCycle); -} - -/// Does this SU have a hazard within the current instruction group. -/// -/// The scheduler supports two modes of hazard recognition. The first is the -/// ScheduleHazardRecognizer API. It is a fully general hazard recognizer that -/// supports highly complicated in-order reservation tables -/// (ScoreboardHazardRecognizer) and arbitrary target-specific logic. -/// -/// The second is a streamlined mechanism that checks for hazards based on -/// simple counters that the scheduler itself maintains. It explicitly checks -/// for instruction dispatch limitations, including the number of micro-ops that -/// can dispatch per cycle. -/// -/// TODO: Also check whether the SU must start a new group. -bool ConvergingVLIWScheduler::VLIWSchedBoundary::checkHazard(SUnit *SU) { - if (HazardRec->isEnabled()) - return HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard; - - unsigned uops = SchedModel->getNumMicroOps(SU->getInstr()); - if (IssueCount + uops > SchedModel->getIssueWidth()) - return true; - - return false; -} - -void ConvergingVLIWScheduler::VLIWSchedBoundary::releaseNode(SUnit *SU, - unsigned ReadyCycle) { - if (ReadyCycle < MinReadyCycle) - MinReadyCycle = ReadyCycle; - - // Check for interlocks first. For the purpose of other heuristics, an - // instruction that cannot issue appears as if it's not in the ReadyQueue. - if (ReadyCycle > CurrCycle || checkHazard(SU)) - - Pending.push(SU); - else - Available.push(SU); -} - -/// Move the boundary of scheduled code by one cycle. -void ConvergingVLIWScheduler::VLIWSchedBoundary::bumpCycle() { - unsigned Width = SchedModel->getIssueWidth(); - IssueCount = (IssueCount <= Width) ? 
0 : IssueCount - Width; - - assert(MinReadyCycle < std::numeric_limits<unsigned>::max() && - "MinReadyCycle uninitialized"); - unsigned NextCycle = std::max(CurrCycle + 1, MinReadyCycle); - - if (!HazardRec->isEnabled()) { - // Bypass HazardRec virtual calls. - CurrCycle = NextCycle; - } else { - // Bypass getHazardType calls in case of long latency. - for (; CurrCycle != NextCycle; ++CurrCycle) { - if (isTop()) - HazardRec->AdvanceCycle(); - else - HazardRec->RecedeCycle(); - } - } - CheckPending = true; - - LLVM_DEBUG(dbgs() << "*** Next cycle " << Available.getName() << " cycle " - << CurrCycle << '\n'); -} - -/// Move the boundary of scheduled code by one SUnit. -void ConvergingVLIWScheduler::VLIWSchedBoundary::bumpNode(SUnit *SU) { - bool startNewCycle = false; - - // Update the reservation table. - if (HazardRec->isEnabled()) { - if (!isTop() && SU->isCall) { - // Calls are scheduled with their preceding instructions. For bottom-up - // scheduling, clear the pipeline state before emitting. - HazardRec->Reset(); - } - HazardRec->EmitInstruction(SU); - } - - // Update DFA model. - startNewCycle = ResourceModel->reserveResources(SU, isTop()); - - // Check the instruction group dispatch limit. - // TODO: Check if this SU must end a dispatch group. - IssueCount += SchedModel->getNumMicroOps(SU->getInstr()); - if (startNewCycle) { - LLVM_DEBUG(dbgs() << "*** Max instrs at cycle " << CurrCycle << '\n'); - bumpCycle(); - } - else - LLVM_DEBUG(dbgs() << "*** IssueCount " << IssueCount << " at cycle " - << CurrCycle << '\n'); -} - -/// Release pending ready nodes in to the available queue. This makes them -/// visible to heuristics. -void ConvergingVLIWScheduler::VLIWSchedBoundary::releasePending() { - // If the available queue is empty, it is safe to reset MinReadyCycle. - if (Available.empty()) - MinReadyCycle = std::numeric_limits<unsigned>::max(); - - // Check to see if any of the pending instructions are ready to issue. If - // so, add them to the available queue. 
- for (unsigned i = 0, e = Pending.size(); i != e; ++i) { - SUnit *SU = *(Pending.begin()+i); - unsigned ReadyCycle = isTop() ? SU->TopReadyCycle : SU->BotReadyCycle; - - if (ReadyCycle < MinReadyCycle) - MinReadyCycle = ReadyCycle; - - if (ReadyCycle > CurrCycle) - continue; - - if (checkHazard(SU)) - continue; - - Available.push(SU); - Pending.remove(Pending.begin()+i); - --i; --e; - } - CheckPending = false; -} - -/// Remove SU from the ready set for this boundary. -void ConvergingVLIWScheduler::VLIWSchedBoundary::removeReady(SUnit *SU) { - if (Available.isInQueue(SU)) - Available.remove(Available.find(SU)); - else { - assert(Pending.isInQueue(SU) && "bad ready count"); - Pending.remove(Pending.find(SU)); - } -} +int HexagonConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU, + SchedCandidate &Candidate, + RegPressureDelta &Delta, + bool verbose) { + int ResCount = + ConvergingVLIWScheduler::SchedulingCost(Q, SU, Candidate, Delta, verbose); -/// If this queue only has one ready candidate, return it. As a side effect, -/// advance the cycle until at least one node is ready. If multiple instructions -/// are ready, return NULL. 
-SUnit *ConvergingVLIWScheduler::VLIWSchedBoundary::pickOnlyChoice() { - if (CheckPending) - releasePending(); - - auto AdvanceCycle = [this]() { - if (Available.empty()) - return true; - if (Available.size() == 1 && Pending.size() > 0) - return !ResourceModel->isResourceAvailable(*Available.begin(), isTop()) || - getWeakLeft(*Available.begin(), isTop()) != 0; - return false; - }; - for (unsigned i = 0; AdvanceCycle(); ++i) { - assert(i <= (HazardRec->getMaxLookAhead() + MaxMinLatency) && - "permanent hazard"); (void)i; - ResourceModel->reserveResources(nullptr, isTop()); - bumpCycle(); - releasePending(); - } - if (Available.size() == 1) - return *Available.begin(); - return nullptr; -} - -#ifndef NDEBUG -void ConvergingVLIWScheduler::traceCandidate(const char *Label, - const ReadyQueue &Q, SUnit *SU, int Cost, PressureChange P) { - dbgs() << Label << " " << Q.getName() << " "; - if (P.isValid()) - dbgs() << DAG->TRI->getRegPressureSetName(P.getPSet()) << ":" - << P.getUnitInc() << " "; - else - dbgs() << " "; - dbgs() << "cost(" << Cost << ")\t"; - DAG->dumpNode(*SU); -} - -// Very detailed queue dump, to be used with higher verbosity levels. 
-void ConvergingVLIWScheduler::readyQueueVerboseDump( - const RegPressureTracker &RPTracker, SchedCandidate &Candidate, - ReadyQueue &Q) { - RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker); - - dbgs() << ">>> " << Q.getName() << "\n"; - for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) { - RegPressureDelta RPDelta; - TempTracker.getMaxPressureDelta((*I)->getInstr(), RPDelta, - DAG->getRegionCriticalPSets(), - DAG->getRegPressure().MaxSetPressure); - std::stringstream dbgstr; - dbgstr << "SU(" << std::setw(3) << (*I)->NodeNum << ")"; - dbgs() << dbgstr.str(); - SchedulingCost(Q, *I, Candidate, RPDelta, true); - dbgs() << "\t"; - (*I)->getInstr()->dump(); - } - dbgs() << "\n"; -} -#endif - -/// isSingleUnscheduledPred - If SU2 is the only unscheduled predecessor -/// of SU, return true (we may have duplicates) -static inline bool isSingleUnscheduledPred(SUnit *SU, SUnit *SU2) { - if (SU->NumPredsLeft == 0) - return false; - - for (auto &Pred : SU->Preds) { - // We found an available, but not scheduled, predecessor. - if (!Pred.getSUnit()->isScheduled && (Pred.getSUnit() != SU2)) - return false; - } - - return true; -} - -/// isSingleUnscheduledSucc - If SU2 is the only unscheduled successor -/// of SU, return true (we may have duplicates) -static inline bool isSingleUnscheduledSucc(SUnit *SU, SUnit *SU2) { - if (SU->NumSuccsLeft == 0) - return false; - - for (auto &Succ : SU->Succs) { - // We found an available, but not scheduled, successor. - if (!Succ.getSUnit()->isScheduled && (Succ.getSUnit() != SU2)) - return false; - } - return true; -} - -/// Check if the instruction changes the register pressure of a register in the -/// high pressure set. The function returns a negative value if the pressure -/// decreases and a positive value is the pressure increases. If the instruction -/// doesn't use a high pressure register or doesn't change the register -/// pressure, then return 0. 
-int ConvergingVLIWScheduler::pressureChange(const SUnit *SU, bool isBotUp) { - PressureDiff &PD = DAG->getPressureDiff(SU); - for (auto &P : PD) { - if (!P.isValid()) - continue; - // The pressure differences are computed bottom-up, so the comparision for - // an increase is positive in the bottom direction, but negative in the - // top-down direction. - if (HighPressureSets[P.getPSet()]) - return (isBotUp ? P.getUnitInc() : -P.getUnitInc()); - } - return 0; -} - -// Constants used to denote relative importance of -// heuristic components for cost computation. -static const unsigned PriorityOne = 200; -static const unsigned PriorityTwo = 50; -static const unsigned PriorityThree = 75; -static const unsigned ScaleTwo = 10; - -/// Single point to compute overall scheduling cost. -/// TODO: More heuristics will be used soon. -int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU, - SchedCandidate &Candidate, - RegPressureDelta &Delta, - bool verbose) { - // Initial trivial priority. - int ResCount = 1; - - // Do not waste time on a node that is already scheduled. if (!SU || SU->isScheduled) return ResCount; - LLVM_DEBUG(if (verbose) dbgs() - << ((Q.getID() == TopQID) ? "(top|" : "(bot|")); - // Forced priority is high. - if (SU->isScheduleHigh) { - ResCount += PriorityOne; - LLVM_DEBUG(dbgs() << "H|"); - } - - unsigned IsAvailableAmt = 0; - // Critical path first. - if (Q.getID() == TopQID) { - if (Top.isLatencyBound(SU)) { - LLVM_DEBUG(if (verbose) dbgs() << "LB|"); - ResCount += (SU->getHeight() * ScaleTwo); - } - - LLVM_DEBUG(if (verbose) { - std::stringstream dbgstr; - dbgstr << "h" << std::setw(3) << SU->getHeight() << "|"; - dbgs() << dbgstr.str(); - }); - - // If resources are available for it, multiply the - // chance of scheduling. 
- if (Top.ResourceModel->isResourceAvailable(SU, true)) { - IsAvailableAmt = (PriorityTwo + PriorityThree); - ResCount += IsAvailableAmt; - LLVM_DEBUG(if (verbose) dbgs() << "A|"); - } else - LLVM_DEBUG(if (verbose) dbgs() << " |"); - } else { - if (Bot.isLatencyBound(SU)) { - LLVM_DEBUG(if (verbose) dbgs() << "LB|"); - ResCount += (SU->getDepth() * ScaleTwo); - } - - LLVM_DEBUG(if (verbose) { - std::stringstream dbgstr; - dbgstr << "d" << std::setw(3) << SU->getDepth() << "|"; - dbgs() << dbgstr.str(); - }); - - // If resources are available for it, multiply the - // chance of scheduling. - if (Bot.ResourceModel->isResourceAvailable(SU, false)) { - IsAvailableAmt = (PriorityTwo + PriorityThree); - ResCount += IsAvailableAmt; - LLVM_DEBUG(if (verbose) dbgs() << "A|"); - } else - LLVM_DEBUG(if (verbose) dbgs() << " |"); - } - - unsigned NumNodesBlocking = 0; - if (Q.getID() == TopQID) { - // How many SUs does it block from scheduling? - // Look at all of the successors of this node. - // Count the number of nodes that - // this node is the sole unscheduled node for. - if (Top.isLatencyBound(SU)) - for (const SDep &SI : SU->Succs) - if (isSingleUnscheduledPred(SI.getSUnit(), SU)) - ++NumNodesBlocking; - } else { - // How many unscheduled predecessors block this node? - if (Bot.isLatencyBound(SU)) - for (const SDep &PI : SU->Preds) - if (isSingleUnscheduledSucc(PI.getSUnit(), SU)) - ++NumNodesBlocking; - } - ResCount += (NumNodesBlocking * ScaleTwo); - - LLVM_DEBUG(if (verbose) { - std::stringstream dbgstr; - dbgstr << "blk " << std::setw(2) << NumNodesBlocking << ")|"; - dbgs() << dbgstr.str(); - }); - - // Factor in reg pressure as a heuristic. - if (!IgnoreBBRegPressure) { - // Decrease priority by the amount that register pressure exceeds the limit. - ResCount -= (Delta.Excess.getUnitInc()*PriorityOne); - // Decrease priority if register pressure exceeds the limit. 
- ResCount -= (Delta.CriticalMax.getUnitInc()*PriorityOne); - // Decrease priority slightly if register pressure would increase over the - // current maximum. - ResCount -= (Delta.CurrentMax.getUnitInc()*PriorityTwo); - // If there are register pressure issues, then we remove the value added for - // the instruction being available. The rationale is that we really don't - // want to schedule an instruction that causes a spill. - if (IsAvailableAmt && pressureChange(SU, Q.getID() != TopQID) > 0 && - (Delta.Excess.getUnitInc() || Delta.CriticalMax.getUnitInc() || - Delta.CurrentMax.getUnitInc())) - ResCount -= IsAvailableAmt; - LLVM_DEBUG(if (verbose) { - dbgs() << "RP " << Delta.Excess.getUnitInc() << "/" - << Delta.CriticalMax.getUnitInc() << "/" - << Delta.CurrentMax.getUnitInc() << ")|"; - }); - } - - // Give a little extra priority to a .cur instruction if there is a resource - // available for it. auto &QST = DAG->MF.getSubtarget<HexagonSubtarget>(); auto &QII = *QST.getInstrInfo(); if (SU->isInstr() && QII.mayBeCurLoad(*SU->getInstr())) { @@ -698,303 +66,5 @@ int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU, } } - // Give preference to a zero latency instruction if the dependent - // instruction is in the current packet. 
- if (Q.getID() == TopQID && getWeakLeft(SU, true) == 0) { - for (const SDep &PI : SU->Preds) { - if (!PI.getSUnit()->getInstr()->isPseudo() && PI.isAssignedRegDep() && - PI.getLatency() == 0 && - Top.ResourceModel->isInPacket(PI.getSUnit())) { - ResCount += PriorityThree; - LLVM_DEBUG(if (verbose) dbgs() << "Z|"); - } - } - } else if (Q.getID() == BotQID && getWeakLeft(SU, false) == 0) { - for (const SDep &SI : SU->Succs) { - if (!SI.getSUnit()->getInstr()->isPseudo() && SI.isAssignedRegDep() && - SI.getLatency() == 0 && - Bot.ResourceModel->isInPacket(SI.getSUnit())) { - ResCount += PriorityThree; - LLVM_DEBUG(if (verbose) dbgs() << "Z|"); - } - } - } - - // If the instruction has a non-zero latency dependence with an instruction in - // the current packet, then it should not be scheduled yet. The case occurs - // when the dependent instruction is scheduled in a new packet, so the - // scheduler updates the current cycle and pending instructions become - // available. - if (CheckEarlyAvail) { - if (Q.getID() == TopQID) { - for (const auto &PI : SU->Preds) { - if (PI.getLatency() > 0 && - Top.ResourceModel->isInPacket(PI.getSUnit())) { - ResCount -= PriorityOne; - LLVM_DEBUG(if (verbose) dbgs() << "D|"); - } - } - } else { - for (const auto &SI : SU->Succs) { - if (SI.getLatency() > 0 && - Bot.ResourceModel->isInPacket(SI.getSUnit())) { - ResCount -= PriorityOne; - LLVM_DEBUG(if (verbose) dbgs() << "D|"); - } - } - } - } - - LLVM_DEBUG(if (verbose) { - std::stringstream dbgstr; - dbgstr << "Total " << std::setw(4) << ResCount << ")"; - dbgs() << dbgstr.str(); - }); - return ResCount; } - -/// Pick the best candidate from the top queue. -/// -/// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during -/// DAG building. To adjust for the current scheduling location we need to -/// maintain the number of vreg uses remaining to be top-scheduled. 
-ConvergingVLIWScheduler::CandResult ConvergingVLIWScheduler:: -pickNodeFromQueue(VLIWSchedBoundary &Zone, const RegPressureTracker &RPTracker, - SchedCandidate &Candidate) { - ReadyQueue &Q = Zone.Available; - LLVM_DEBUG(if (SchedDebugVerboseLevel > 1) - readyQueueVerboseDump(RPTracker, Candidate, Q); - else Q.dump();); - - // getMaxPressureDelta temporarily modifies the tracker. - RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker); - - // BestSU remains NULL if no top candidates beat the best existing candidate. - CandResult FoundCandidate = NoCand; - for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) { - RegPressureDelta RPDelta; - TempTracker.getMaxPressureDelta((*I)->getInstr(), RPDelta, - DAG->getRegionCriticalPSets(), - DAG->getRegPressure().MaxSetPressure); - - int CurrentCost = SchedulingCost(Q, *I, Candidate, RPDelta, false); - - // Initialize the candidate if needed. - if (!Candidate.SU) { - LLVM_DEBUG(traceCandidate("DCAND", Q, *I, CurrentCost)); - Candidate.SU = *I; - Candidate.RPDelta = RPDelta; - Candidate.SCost = CurrentCost; - FoundCandidate = NodeOrder; - continue; - } - - // Choose node order for negative cost candidates. There is no good - // candidate in this case. - if (CurrentCost < 0 && Candidate.SCost < 0) { - if ((Q.getID() == TopQID && (*I)->NodeNum < Candidate.SU->NodeNum) - || (Q.getID() == BotQID && (*I)->NodeNum > Candidate.SU->NodeNum)) { - LLVM_DEBUG(traceCandidate("NCAND", Q, *I, CurrentCost)); - Candidate.SU = *I; - Candidate.RPDelta = RPDelta; - Candidate.SCost = CurrentCost; - FoundCandidate = NodeOrder; - } - continue; - } - - // Best cost. - if (CurrentCost > Candidate.SCost) { - LLVM_DEBUG(traceCandidate("CCAND", Q, *I, CurrentCost)); - Candidate.SU = *I; - Candidate.RPDelta = RPDelta; - Candidate.SCost = CurrentCost; - FoundCandidate = BestCost; - continue; - } - - // Choose an instruction that does not depend on an artificial edge. 
- unsigned CurrWeak = getWeakLeft(*I, (Q.getID() == TopQID)); - unsigned CandWeak = getWeakLeft(Candidate.SU, (Q.getID() == TopQID)); - if (CurrWeak != CandWeak) { - if (CurrWeak < CandWeak) { - LLVM_DEBUG(traceCandidate("WCAND", Q, *I, CurrentCost)); - Candidate.SU = *I; - Candidate.RPDelta = RPDelta; - Candidate.SCost = CurrentCost; - FoundCandidate = Weak; - } - continue; - } - - if (CurrentCost == Candidate.SCost && Zone.isLatencyBound(*I)) { - unsigned CurrSize, CandSize; - if (Q.getID() == TopQID) { - CurrSize = (*I)->Succs.size(); - CandSize = Candidate.SU->Succs.size(); - } else { - CurrSize = (*I)->Preds.size(); - CandSize = Candidate.SU->Preds.size(); - } - if (CurrSize > CandSize) { - LLVM_DEBUG(traceCandidate("SPCAND", Q, *I, CurrentCost)); - Candidate.SU = *I; - Candidate.RPDelta = RPDelta; - Candidate.SCost = CurrentCost; - FoundCandidate = BestCost; - } - // Keep the old candidate if it's a better candidate. That is, don't use - // the subsequent tie breaker. - if (CurrSize != CandSize) - continue; - } - - // Tie breaker. - // To avoid scheduling indeterminism, we need a tie breaker - // for the case when cost is identical for two nodes. - if (UseNewerCandidate && CurrentCost == Candidate.SCost) { - if ((Q.getID() == TopQID && (*I)->NodeNum < Candidate.SU->NodeNum) - || (Q.getID() == BotQID && (*I)->NodeNum > Candidate.SU->NodeNum)) { - LLVM_DEBUG(traceCandidate("TCAND", Q, *I, CurrentCost)); - Candidate.SU = *I; - Candidate.RPDelta = RPDelta; - Candidate.SCost = CurrentCost; - FoundCandidate = NodeOrder; - continue; - } - } - - // Fall through to original instruction order. - // Only consider node order if Candidate was chosen from this Q. - if (FoundCandidate == NoCand) - continue; - } - return FoundCandidate; -} - -/// Pick the best candidate node from either the top or bottom queue. -SUnit *ConvergingVLIWScheduler::pickNodeBidrectional(bool &IsTopNode) { - // Schedule as far as possible in the direction of no choice. 
This is most - // efficient, but also provides the best heuristics for CriticalPSets. - if (SUnit *SU = Bot.pickOnlyChoice()) { - LLVM_DEBUG(dbgs() << "Picked only Bottom\n"); - IsTopNode = false; - return SU; - } - if (SUnit *SU = Top.pickOnlyChoice()) { - LLVM_DEBUG(dbgs() << "Picked only Top\n"); - IsTopNode = true; - return SU; - } - SchedCandidate BotCand; - // Prefer bottom scheduling when heuristics are silent. - CandResult BotResult = pickNodeFromQueue(Bot, - DAG->getBotRPTracker(), BotCand); - assert(BotResult != NoCand && "failed to find the first candidate"); - - // If either Q has a single candidate that provides the least increase in - // Excess pressure, we can immediately schedule from that Q. - // - // RegionCriticalPSets summarizes the pressure within the scheduled region and - // affects picking from either Q. If scheduling in one direction must - // increase pressure for one of the excess PSets, then schedule in that - // direction first to provide more freedom in the other direction. - if (BotResult == SingleExcess || BotResult == SingleCritical) { - LLVM_DEBUG(dbgs() << "Prefered Bottom Node\n"); - IsTopNode = false; - return BotCand.SU; - } - // Check if the top Q has a better candidate. - SchedCandidate TopCand; - CandResult TopResult = pickNodeFromQueue(Top, - DAG->getTopRPTracker(), TopCand); - assert(TopResult != NoCand && "failed to find the first candidate"); - - if (TopResult == SingleExcess || TopResult == SingleCritical) { - LLVM_DEBUG(dbgs() << "Prefered Top Node\n"); - IsTopNode = true; - return TopCand.SU; - } - // If either Q has a single candidate that minimizes pressure above the - // original region's pressure pick it. 
- if (BotResult == SingleMax) { - LLVM_DEBUG(dbgs() << "Prefered Bottom Node SingleMax\n"); - IsTopNode = false; - return BotCand.SU; - } - if (TopResult == SingleMax) { - LLVM_DEBUG(dbgs() << "Prefered Top Node SingleMax\n"); - IsTopNode = true; - return TopCand.SU; - } - if (TopCand.SCost > BotCand.SCost) { - LLVM_DEBUG(dbgs() << "Prefered Top Node Cost\n"); - IsTopNode = true; - return TopCand.SU; - } - // Otherwise prefer the bottom candidate in node order. - LLVM_DEBUG(dbgs() << "Prefered Bottom in Node order\n"); - IsTopNode = false; - return BotCand.SU; -} - -/// Pick the best node to balance the schedule. Implements MachineSchedStrategy. -SUnit *ConvergingVLIWScheduler::pickNode(bool &IsTopNode) { - if (DAG->top() == DAG->bottom()) { - assert(Top.Available.empty() && Top.Pending.empty() && - Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage"); - return nullptr; - } - SUnit *SU; - if (ForceTopDown) { - SU = Top.pickOnlyChoice(); - if (!SU) { - SchedCandidate TopCand; - CandResult TopResult = - pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand); - assert(TopResult != NoCand && "failed to find the first candidate"); - (void)TopResult; - SU = TopCand.SU; - } - IsTopNode = true; - } else if (ForceBottomUp) { - SU = Bot.pickOnlyChoice(); - if (!SU) { - SchedCandidate BotCand; - CandResult BotResult = - pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand); - assert(BotResult != NoCand && "failed to find the first candidate"); - (void)BotResult; - SU = BotCand.SU; - } - IsTopNode = false; - } else { - SU = pickNodeBidrectional(IsTopNode); - } - if (SU->isTopReady()) - Top.removeReady(SU); - if (SU->isBottomReady()) - Bot.removeReady(SU); - - LLVM_DEBUG(dbgs() << "*** " << (IsTopNode ? "Top" : "Bottom") - << " Scheduling instruction in cycle " - << (IsTopNode ? Top.CurrCycle : Bot.CurrCycle) << " (" - << reportPackets() << ")\n"; - DAG->dumpNode(*SU)); - return SU; -} - -/// Update the scheduler's state after scheduling a node. 
This is the same node -/// that was just returned by pickNode(). However, VLIWMachineScheduler needs -/// to update it's state based on the current cycle before MachineSchedStrategy -/// does. -void ConvergingVLIWScheduler::schedNode(SUnit *SU, bool IsTopNode) { - if (IsTopNode) { - Top.bumpNode(SU); - SU->TopReadyCycle = Top.CurrCycle; - } else { - Bot.bumpNode(SU); - SU->BotReadyCycle = Bot.CurrCycle; - } -} diff --git a/llvm/lib/Target/Hexagon/HexagonMachineScheduler.h b/llvm/lib/Target/Hexagon/HexagonMachineScheduler.h index fb0a7abd339b..3d8f557dc787 100644 --- a/llvm/lib/Target/Hexagon/HexagonMachineScheduler.h +++ b/llvm/lib/Target/Hexagon/HexagonMachineScheduler.h @@ -13,261 +13,28 @@ #ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONMACHINESCHEDULER_H #define LLVM_LIB_TARGET_HEXAGON_HEXAGONMACHINESCHEDULER_H -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/Twine.h" -#include "llvm/CodeGen/DFAPacketizer.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/RegisterPressure.h" -#include "llvm/CodeGen/ScheduleHazardRecognizer.h" -#include "llvm/CodeGen/TargetInstrInfo.h" -#include "llvm/CodeGen/TargetSchedule.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" -#include <algorithm> -#include <cassert> -#include <limits> -#include <memory> -#include <vector> +#include "llvm/CodeGen/VLIWMachineScheduler.h" namespace llvm { class SUnit; -class VLIWResourceModel { - /// ResourcesModel - Represents VLIW state. - /// Not limited to VLIW targets per se, but assumes - /// definition of DFA by a target. - DFAPacketizer *ResourcesModel; - - const TargetSchedModel *SchedModel; - - /// Local packet/bundle model. Purely - /// internal to the MI schedulre at the time. - std::vector<SUnit *> Packet; - - /// Total packets created. 
- unsigned TotalPackets = 0; - +class HexagonVLIWResourceModel : public VLIWResourceModel { public: - VLIWResourceModel(const TargetSubtargetInfo &STI, const TargetSchedModel *SM) - : SchedModel(SM) { - ResourcesModel = STI.getInstrInfo()->CreateTargetScheduleState(STI); - - // This hard requirement could be relaxed, - // but for now do not let it proceed. - assert(ResourcesModel && "Unimplemented CreateTargetScheduleState."); - - Packet.resize(SchedModel->getIssueWidth()); - Packet.clear(); - ResourcesModel->clearResources(); - } - - ~VLIWResourceModel() { - delete ResourcesModel; - } - - void resetPacketState() { - Packet.clear(); - } - - void resetDFA() { - ResourcesModel->clearResources(); - } - - void reset() { - Packet.clear(); - ResourcesModel->clearResources(); - } - - bool isResourceAvailable(SUnit *SU, bool IsTop); - bool reserveResources(SUnit *SU, bool IsTop); - unsigned getTotalPackets() const { return TotalPackets; } - bool isInPacket(SUnit *SU) const { return is_contained(Packet, SU); } + using VLIWResourceModel::VLIWResourceModel; + bool hasDependence(const SUnit *SUd, const SUnit *SUu) override; }; -/// Extend the standard ScheduleDAGMI to provide more context and override the -/// top-level schedule() driver. -class VLIWMachineScheduler : public ScheduleDAGMILive { -public: - VLIWMachineScheduler(MachineSchedContext *C, - std::unique_ptr<MachineSchedStrategy> S) - : ScheduleDAGMILive(C, std::move(S)) {} - - /// Schedule - This is called back from ScheduleDAGInstrs::Run() when it's - /// time to do some work. - void schedule() override; - - RegisterClassInfo *getRegClassInfo() { return RegClassInfo; } - int getBBSize() { return BB->size(); } -}; - -//===----------------------------------------------------------------------===// -// ConvergingVLIWScheduler - Implementation of the standard -// MachineSchedStrategy. 
-//===----------------------------------------------------------------------===// - -/// ConvergingVLIWScheduler shrinks the unscheduled zone using heuristics -/// to balance the schedule. -class ConvergingVLIWScheduler : public MachineSchedStrategy { - /// Store the state used by ConvergingVLIWScheduler heuristics, required - /// for the lifetime of one invocation of pickNode(). - struct SchedCandidate { - // The best SUnit candidate. - SUnit *SU = nullptr; - - // Register pressure values for the best candidate. - RegPressureDelta RPDelta; - - // Best scheduling cost. - int SCost = 0; - - SchedCandidate() = default; - }; - /// Represent the type of SchedCandidate found within a single queue. - enum CandResult { - NoCand, NodeOrder, SingleExcess, SingleCritical, SingleMax, MultiPressure, - BestCost, Weak}; - - /// Each Scheduling boundary is associated with ready queues. It tracks the - /// current cycle in whichever direction at has moved, and maintains the state - /// of "hazards" and other interlocks at the current cycle. - struct VLIWSchedBoundary { - VLIWMachineScheduler *DAG = nullptr; - const TargetSchedModel *SchedModel = nullptr; - - ReadyQueue Available; - ReadyQueue Pending; - bool CheckPending = false; - - ScheduleHazardRecognizer *HazardRec = nullptr; - VLIWResourceModel *ResourceModel = nullptr; - - unsigned CurrCycle = 0; - unsigned IssueCount = 0; - unsigned CriticalPathLength = 0; - - /// MinReadyCycle - Cycle of the soonest available instruction. - unsigned MinReadyCycle = std::numeric_limits<unsigned>::max(); - - // Remember the greatest min operand latency. - unsigned MaxMinLatency = 0; - - /// Pending queues extend the ready queues with the same ID and the - /// PendingFlag set. 
- VLIWSchedBoundary(unsigned ID, const Twine &Name) - : Available(ID, Name+".A"), - Pending(ID << ConvergingVLIWScheduler::LogMaxQID, Name+".P") {} - - ~VLIWSchedBoundary() { - delete ResourceModel; - delete HazardRec; - } - - void init(VLIWMachineScheduler *dag, const TargetSchedModel *smodel) { - DAG = dag; - SchedModel = smodel; - CurrCycle = 0; - IssueCount = 0; - // Initialize the critical path length limit, which used by the scheduling - // cost model to determine the value for scheduling an instruction. We use - // a slightly different heuristic for small and large functions. For small - // functions, it's important to use the height/depth of the instruction. - // For large functions, prioritizing by height or depth increases spills. - CriticalPathLength = DAG->getBBSize() / SchedModel->getIssueWidth(); - if (DAG->getBBSize() < 50) - // We divide by two as a cheap and simple heuristic to reduce the - // critcal path length, which increases the priority of using the graph - // height/depth in the scheduler's cost computation. - CriticalPathLength >>= 1; - else { - // For large basic blocks, we prefer a larger critical path length to - // decrease the priority of using the graph height/depth. - unsigned MaxPath = 0; - for (auto &SU : DAG->SUnits) - MaxPath = std::max(MaxPath, isTop() ? SU.getHeight() : SU.getDepth()); - CriticalPathLength = std::max(CriticalPathLength, MaxPath) + 1; - } - } - - bool isTop() const { - return Available.getID() == ConvergingVLIWScheduler::TopQID; - } - - bool checkHazard(SUnit *SU); - - void releaseNode(SUnit *SU, unsigned ReadyCycle); - - void bumpCycle(); - - void bumpNode(SUnit *SU); - - void releasePending(); - - void removeReady(SUnit *SU); - - SUnit *pickOnlyChoice(); - - bool isLatencyBound(SUnit *SU) { - if (CurrCycle >= CriticalPathLength) - return true; - unsigned PathLength = isTop() ? 
SU->getHeight() : SU->getDepth(); - return CriticalPathLength - CurrCycle <= PathLength; - } - }; - - VLIWMachineScheduler *DAG = nullptr; - const TargetSchedModel *SchedModel = nullptr; - - // State of the top and bottom scheduled instruction boundaries. - VLIWSchedBoundary Top; - VLIWSchedBoundary Bot; - - /// List of pressure sets that have a high pressure level in the region. - std::vector<bool> HighPressureSets; - -public: - /// SUnit::NodeQueueId: 0 (none), 1 (top), 2 (bot), 3 (both) - enum { - TopQID = 1, - BotQID = 2, - LogMaxQID = 2 - }; - - ConvergingVLIWScheduler() : Top(TopQID, "TopQ"), Bot(BotQID, "BotQ") {} - - void initialize(ScheduleDAGMI *dag) override; - - SUnit *pickNode(bool &IsTopNode) override; - - void schedNode(SUnit *SU, bool IsTopNode) override; - - void releaseTopNode(SUnit *SU) override; - - void releaseBottomNode(SUnit *SU) override; - - unsigned reportPackets() { - return Top.ResourceModel->getTotalPackets() + - Bot.ResourceModel->getTotalPackets(); - } - +class HexagonConvergingVLIWScheduler : public ConvergingVLIWScheduler { protected: - SUnit *pickNodeBidrectional(bool &IsTopNode); - - int pressureChange(const SUnit *SU, bool isBotUp); - - int SchedulingCost(ReadyQueue &Q, - SUnit *SU, SchedCandidate &Candidate, - RegPressureDelta &Delta, bool verbose); - - CandResult pickNodeFromQueue(VLIWSchedBoundary &Zone, - const RegPressureTracker &RPTracker, - SchedCandidate &Candidate); -#ifndef NDEBUG - void traceCandidate(const char *Label, const ReadyQueue &Q, SUnit *SU, - int Cost, PressureChange P = PressureChange()); - - void readyQueueVerboseDump(const RegPressureTracker &RPTracker, - SchedCandidate &Candidate, ReadyQueue &Q); -#endif + VLIWResourceModel * + createVLIWResourceModel(const TargetSubtargetInfo &STI, + const TargetSchedModel *SchedModel) const override; + int SchedulingCost(ReadyQueue &Q, SUnit *SU, SchedCandidate &Candidate, + RegPressureDelta &Delta, bool verbose) override; }; } // end namespace llvm diff --git 
a/llvm/lib/Target/Hexagon/HexagonPseudo.td b/llvm/lib/Target/Hexagon/HexagonPseudo.td index 11f8af7c41a0..afd63d6d4aa7 100644 --- a/llvm/lib/Target/Hexagon/HexagonPseudo.td +++ b/llvm/lib/Target/Hexagon/HexagonPseudo.td @@ -572,3 +572,14 @@ defm PS_storerd : NewCircularStore<DoubleRegs, WordAccess>; // __builtin_trap. let hasSideEffects = 1, isPseudo = 1, isCodeGenOnly = 1, isSolo = 1 in def PS_crash: InstHexagon<(outs), (ins), "", [], "", PSEUDO, TypePSEUDO>; + +// This is actual trap1 instruction from before v65. It's here since it is +// no longer included in DepInstrInfo.td. +def PS_trap1 : HInst<(outs), (ins u8_0Imm:$Ii), "trap1(#$Ii)", tc_53c851ab, + TypeJ>, Enc_a51a9a, Requires<[HasPreV65]> { + let Inst{1-0} = 0b00; + let Inst{7-5} = 0b000; + let Inst{13-13} = 0b0; + let Inst{31-16} = 0b0101010010000000; +} + diff --git a/llvm/lib/Target/Hexagon/HexagonSchedule.td b/llvm/lib/Target/Hexagon/HexagonSchedule.td index 88d775f16a7f..931578c9e78d 100644 --- a/llvm/lib/Target/Hexagon/HexagonSchedule.td +++ b/llvm/lib/Target/Hexagon/HexagonSchedule.td @@ -69,3 +69,4 @@ include "HexagonScheduleV66.td" include "HexagonScheduleV67.td" include "HexagonScheduleV67T.td" include "HexagonScheduleV68.td" +include "HexagonScheduleV69.td" diff --git a/llvm/lib/Target/Hexagon/HexagonScheduleV69.td b/llvm/lib/Target/Hexagon/HexagonScheduleV69.td new file mode 100644 index 000000000000..ddd246866e20 --- /dev/null +++ b/llvm/lib/Target/Hexagon/HexagonScheduleV69.td @@ -0,0 +1,40 @@ +//=-HexagonScheduleV69.td - HexagonV69 Scheduling Definitions *- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +// +// ScalarItin and HVXItin contain some old itineraries +// still used by a handful of instructions. Hopefully, we will be able +// to get rid of them soon. 
+def HexagonV69ItinList : DepScalarItinV69, ScalarItin, + DepHVXItinV69, HVXItin, PseudoItin { + list<InstrItinData> ItinList = + !listconcat(DepScalarItinV69_list, ScalarItin_list, + DepHVXItinV69_list, HVXItin_list, PseudoItin_list); +} + +def HexagonItinerariesV69 : + ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP, + CVI_ST, CVI_XLANE, CVI_SHIFT, CVI_MPY0, CVI_MPY1, + CVI_LD, CVI_XLSHF, CVI_MPY01, CVI_ALL, + CVI_ALL_NOMEM, CVI_ZW], + [Hex_FWD, HVX_FWD], + HexagonV69ItinList.ItinList>; + +def HexagonModelV69 : SchedMachineModel { + // Max issue per cycle == bundle width. + let IssueWidth = 4; + let Itineraries = HexagonItinerariesV69; + let LoadLatency = 1; + let CompleteModel = 0; +} + +//===----------------------------------------------------------------------===// +// Hexagon V69 Resource Definitions - +//===----------------------------------------------------------------------===// + diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp index ecb2f88d8096..08bb4580b585 100644 --- a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp +++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp @@ -75,6 +75,10 @@ static cl::opt<bool> EnableCheckBankConflict("hexagon-check-bank-conflict", cl::Hidden, cl::ZeroOrMore, cl::init(true), cl::desc("Enable checking for cache bank conflicts")); +static cl::opt<bool> EnableV68FloatCodeGen( + "force-hvx-float", cl::Hidden, cl::ZeroOrMore, cl::init(false), + cl::desc("Enable the code-generation for vector float instructions on v68.")); + HexagonSubtarget::HexagonSubtarget(const Triple &TT, StringRef CPU, StringRef FS, const TargetMachine &TM) : HexagonGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), @@ -103,13 +107,71 @@ HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { UseAudioOps = false; UseLongCalls = false; - UseBSBScheduling = hasV60Ops() && EnableBSBSched; + SubtargetFeatures Features(FS); + + // Turn on QFloat if the HVX version is v68+. 
+ // The function ParseSubtargetFeatures will set feature bits and initialize + // subtarget's variables all in one, so there isn't a good way to preprocess + // the feature string, other than by tinkering with it directly. + auto IsQFloatFS = [](StringRef F) { + return F == "+hvx-qfloat" || F == "-hvx-qfloat"; + }; + if (!llvm::count_if(Features.getFeatures(), IsQFloatFS)) { + auto getHvxVersion = [&Features](StringRef FS) -> StringRef { + for (StringRef F : llvm::reverse(Features.getFeatures())) { + if (F.startswith("+hvxv")) + return F; + } + for (StringRef F : llvm::reverse(Features.getFeatures())) { + if (F == "-hvx") + return StringRef(); + if (F.startswith("+hvx") || F == "-hvx") + return F.take_front(4); // Return "+hvx" or "-hvx". + } + return StringRef(); + }; + + bool AddQFloat = false; + StringRef HvxVer = getHvxVersion(FS); + if (HvxVer.startswith("+hvxv")) { + int Ver = 0; + if (!HvxVer.drop_front(5).consumeInteger(10, Ver) && Ver >= 68) + AddQFloat = true; + } else if (HvxVer == "+hvx") { + if (hasV68Ops()) + AddQFloat = true; + } - ParseSubtargetFeatures(CPUString, /*TuneCPU*/ CPUString, FS); + if (AddQFloat) + Features.AddFeature("+hvx-qfloat"); + } + + std::string FeatureString = Features.getString(); + ParseSubtargetFeatures(CPUString, /*TuneCPU*/ CPUString, FeatureString); + + // Enable float code generation only if the flag(s) are set and + // the feature is enabled. v68 is guarded by additional flags. 
+ bool GreaterThanV68 = false; + if (useHVXV69Ops()) + GreaterThanV68 = true; + + // Support for deprecated qfloat/ieee codegen flags + if (!GreaterThanV68) { + if (EnableV68FloatCodeGen) + UseHVXFloatingPoint = true; + } else { + UseHVXFloatingPoint = true; + } + + if (UseHVXQFloatOps && UseHVXIEEEFPOps && UseHVXFloatingPoint) + LLVM_DEBUG( + dbgs() << "Behavior is undefined for simultaneous qfloat and ieee hvx codegen..."); if (OverrideLongCalls.getPosition()) UseLongCalls = OverrideLongCalls; + UseBSBScheduling = hasV60Ops() && EnableBSBSched; + if (isTinyCore()) { // Tiny core has a single thread, so back-to-back scheduling is enabled by // default. @@ -117,10 +179,10 @@ HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { UseBSBScheduling = false; } - FeatureBitset Features = getFeatureBits(); + FeatureBitset FeatureBits = getFeatureBits(); if (HexagonDisableDuplex) - setFeatureBits(Features.reset(Hexagon::FeatureDuplex)); - setFeatureBits(Hexagon_MC::completeHVXFeatures(Features)); + setFeatureBits(FeatureBits.reset(Hexagon::FeatureDuplex)); + setFeatureBits(Hexagon_MC::completeHVXFeatures(FeatureBits)); return *this; } diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/llvm/lib/Target/Hexagon/HexagonSubtarget.h index a4f2e159bf4b..e4f375440be1 100644 --- a/llvm/lib/Target/Hexagon/HexagonSubtarget.h +++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.h @@ -56,6 +56,10 @@ class HexagonSubtarget : public HexagonGenSubtargetInfo { bool UseSmallData = false; bool UseUnsafeMath = false; bool UseZRegOps = false; + bool UseHVXIEEEFPOps = false; + bool UseHVXQFloatOps = false; + bool UseHVXFloatingPoint = false; + bool UseCabac = false; bool HasPreV65 = false; bool HasMemNoShuf = false; @@ -138,6 +142,8 @@ public: /// subtarget options. Definition of function is auto generated by tblgen. 
void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); + bool isXRaySupported() const override { return true; } + bool hasV5Ops() const { return getHexagonArchVersion() >= Hexagon::ArchEnum::V5; } @@ -186,6 +192,12 @@ public: bool hasV68OpsOnly() const { return getHexagonArchVersion() == Hexagon::ArchEnum::V68; } + bool hasV69Ops() const { + return getHexagonArchVersion() >= Hexagon::ArchEnum::V69; + } + bool hasV69OpsOnly() const { + return getHexagonArchVersion() == Hexagon::ArchEnum::V69; + } bool useAudioOps() const { return UseAudioOps; } bool useCompound() const { return UseCompound; } @@ -197,10 +209,16 @@ public: bool useSmallData() const { return UseSmallData; } bool useUnsafeMath() const { return UseUnsafeMath; } bool useZRegOps() const { return UseZRegOps; } + bool useCabac() const { return UseCabac; } bool isTinyCore() const { return HexagonProcFamily == TinyCore; } bool isTinyCoreWithDuplex() const { return isTinyCore() && EnableDuplex; } + bool useHVXIEEEFPOps() const { return UseHVXIEEEFPOps && useHVXOps(); } + bool useHVXQFloatOps() const { + return UseHVXQFloatOps && HexagonHVXVersion >= Hexagon::ArchEnum::V68; + } + bool useHVXFloatingPoint() const { return UseHVXFloatingPoint; } bool useHVXOps() const { return HexagonHVXVersion > Hexagon::ArchEnum::NoArch; } @@ -222,6 +240,9 @@ public: bool useHVXV68Ops() const { return HexagonHVXVersion >= Hexagon::ArchEnum::V68; } + bool useHVXV69Ops() const { + return HexagonHVXVersion >= Hexagon::ArchEnum::V69; + } bool useHVX128BOps() const { return useHVXOps() && UseHVX128BOps; } bool useHVX64BOps() const { return useHVXOps() && UseHVX64BOps; } @@ -281,7 +302,11 @@ public: } ArrayRef<MVT> getHVXElementTypes() const { - static MVT Types[] = { MVT::i8, MVT::i16, MVT::i32 }; + static MVT Types[] = {MVT::i8, MVT::i16, MVT::i32}; + static MVT TypesV68[] = {MVT::i8, MVT::i16, MVT::i32, MVT::f16, MVT::f32}; + + if (useHVXV68Ops() && useHVXFloatingPoint()) + return makeArrayRef(TypesV68); 
return makeArrayRef(Types); } diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp index 66de698182d7..fcf829b522cc 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -21,6 +21,7 @@ #include "TargetInfo/HexagonTargetInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/CodeGen/VLIWMachineScheduler.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" #include "llvm/MC/TargetRegistry.h" @@ -120,8 +121,8 @@ extern "C" int HexagonTargetMachineModule; int HexagonTargetMachineModule = 0; static ScheduleDAGInstrs *createVLIWMachineSched(MachineSchedContext *C) { - ScheduleDAGMILive *DAG = - new VLIWMachineScheduler(C, std::make_unique<ConvergingVLIWScheduler>()); + ScheduleDAGMILive *DAG = new VLIWMachineScheduler( + C, std::make_unique<HexagonConvergingVLIWScheduler>()); DAG->addMutation(std::make_unique<HexagonSubtarget::UsrOverflowMutation>()); DAG->addMutation(std::make_unique<HexagonSubtarget::HVXMemLatencyMutation>()); DAG->addMutation(std::make_unique<HexagonSubtarget::CallMutation>()); diff --git a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp index 1d325553f45a..85ec0cdcd8f0 100644 --- a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp +++ b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp @@ -294,7 +294,7 @@ bool HexagonPacketizerList::tryAllocateResourcesForConstExt(bool Reserve) { bool Avail = ResourceTracker->canReserveResources(*ExtMI); if (Reserve && Avail) ResourceTracker->reserveResources(*ExtMI); - MF.DeleteMachineInstr(ExtMI); + MF.deleteMachineInstr(ExtMI); return Avail; } @@ -890,7 +890,7 @@ bool HexagonPacketizerList::canPromoteToDotNew(const MachineInstr &MI, const MCInstrDesc &D = HII->get(NewOpcode); MachineInstr *NewMI = MF.CreateMachineInstr(D, DebugLoc()); bool ResourcesAvailable = 
ResourceTracker->canReserveResources(*NewMI); - MF.DeleteMachineInstr(NewMI); + MF.deleteMachineInstr(NewMI); if (!ResourcesAvailable) return false; @@ -1082,6 +1082,11 @@ bool HexagonPacketizerList::isSoloInstruction(const MachineInstr &MI) { if (HII->isSolo(MI)) return true; + if (MI.getOpcode() == Hexagon::PATCHABLE_FUNCTION_ENTER || + MI.getOpcode() == Hexagon::PATCHABLE_FUNCTION_EXIT || + MI.getOpcode() == Hexagon::PATCHABLE_TAIL_CALL) + return true; + if (MI.getOpcode() == Hexagon::A2_nop) return true; diff --git a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp index ea2798a3b44e..21386a91c7b3 100644 --- a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp @@ -536,7 +536,7 @@ auto AlignVectors::createAddressGroups() -> bool { erase_if(AddrGroups, [](auto &G) { return G.second.size() == 1; }); // Remove groups that don't use HVX types. erase_if(AddrGroups, [&](auto &G) { - return !llvm::any_of( + return llvm::none_of( G.second, [&](auto &I) { return HVC.HST.isTypeForHVX(I.ValTy); }); }); diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h index 4125566bc58a..c9a1781a4543 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h @@ -154,9 +154,8 @@ namespace HexagonII { PrefersSlot3Pos = 57, PrefersSlot3Mask = 0x1, - // v65 - HasTmpDstPos = 60, - HasTmpDstMask = 0x1, + HasHvxTmpPos = 60, + HasHvxTmpMask = 0x1, CVINewPos = 62, CVINewMask = 0x1, diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp index fee1acdbbe8a..96c2965296ca 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp @@ -98,6 +98,10 @@ void HexagonMCChecker::init(MCInst const &MCI) { for 
(unsigned i = 0; i < MCID.getNumImplicitUses(); ++i) initReg(MCI, MCID.getImplicitUses()[i], PredReg, isTrue); + const bool IgnoreTmpDst = (HexagonMCInstrInfo::hasTmpDst(MCII, MCI) || + HexagonMCInstrInfo::hasHvxTmp(MCII, MCI)) && + STI.getFeatureBits()[Hexagon::ArchV69]; + // Get implicit register definitions. if (const MCPhysReg *ImpDef = MCID.getImplicitDefs()) for (; *ImpDef; ++ImpDef) { @@ -123,7 +127,7 @@ void HexagonMCChecker::init(MCInst const &MCI) { HexagonMCInstrInfo::isPredicateLate(MCII, MCI)) // Include implicit late predicates. LatePreds.insert(R); - else + else if (!IgnoreTmpDst) Defs[R].insert(PredSense(PredReg, isTrue)); } @@ -178,7 +182,7 @@ void HexagonMCChecker::init(MCInst const &MCI) { // vshuff(Vx, Vy, Rx) <- Vx(0) and Vy(1) are both source and // destination registers with this instruction. same for vdeal(Vx,Vy,Rx) Uses.insert(*SRI); - else + else if (!IgnoreTmpDst) Defs[*SRI].insert(PredSense(PredReg, isTrue)); } } @@ -227,9 +231,11 @@ bool HexagonMCChecker::check(bool FullCheck) { bool chkAXOK = checkAXOK(); bool chkCofMax1 = checkCOFMax1(); bool chkHWLoop = checkHWLoop(); + bool chkValidTmpDst = FullCheck ? 
checkValidTmpDst() : true; bool chkLegalVecRegPair = checkLegalVecRegPair(); bool chk = chkP && chkNV && chkR && chkRRO && chkS && chkSh && chkSl && - chkAXOK && chkCofMax1 && chkHWLoop && chkLegalVecRegPair; + chkAXOK && chkCofMax1 && chkHWLoop && chkValidTmpDst && + chkLegalVecRegPair; return chk; } @@ -676,6 +682,32 @@ bool HexagonMCChecker::checkShuffle() { return MCSDX.check(); } +bool HexagonMCChecker::checkValidTmpDst() { + if (!STI.getFeatureBits()[Hexagon::ArchV69]) { + return true; + } + auto HasTmp = [&](MCInst const &I) { + return HexagonMCInstrInfo::hasTmpDst(MCII, I) || + HexagonMCInstrInfo::hasHvxTmp(MCII, I); + }; + unsigned HasTmpCount = + llvm::count_if(HexagonMCInstrInfo::bundleInstructions(MCII, MCB), HasTmp); + + if (HasTmpCount > 1) { + reportError( + MCB.getLoc(), + "this packet has more than one HVX vtmp/.tmp destination instruction"); + + for (auto const &I : HexagonMCInstrInfo::bundleInstructions(MCII, MCB)) + if (HasTmp(I)) + reportNote(I.getLoc(), + "this is an HVX vtmp/.tmp destination instruction"); + + return false; + } + return true; +} + void HexagonMCChecker::compoundRegisterMap(unsigned &Register) { switch (Register) { default: diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h index 00afdb664ba5..dbd3d8ae45e6 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h @@ -99,6 +99,7 @@ class HexagonMCChecker { bool checkHWLoop(); bool checkCOFMax1(); bool checkLegalVecRegPair(); + bool checkValidTmpDst(); static void compoundRegisterMap(unsigned &); diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp index fa12fe1da448..68ccb20f4f15 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp @@ -939,10 +939,24 @@ bool 
HexagonMCInstrInfo::prefersSlot3(MCInstrInfo const &MCII, return (F >> HexagonII::PrefersSlot3Pos) & HexagonII::PrefersSlot3Mask; } -/// return true if instruction has hasTmpDst attribute. bool HexagonMCInstrInfo::hasTmpDst(MCInstrInfo const &MCII, MCInst const &MCI) { + switch (MCI.getOpcode()) { + default: + return false; + case Hexagon::V6_vgathermh: + case Hexagon::V6_vgathermhq: + case Hexagon::V6_vgathermhw: + case Hexagon::V6_vgathermhwq: + case Hexagon::V6_vgathermw: + case Hexagon::V6_vgathermwq: + return true; + } + return false; +} + +bool HexagonMCInstrInfo::hasHvxTmp(MCInstrInfo const &MCII, MCInst const &MCI) { const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags; - return (F >> HexagonII::HasTmpDstPos) & HexagonII::HasTmpDstMask; + return (F >> HexagonII::HasHvxTmpPos) & HexagonII::HasHvxTmpMask; } bool HexagonMCInstrInfo::requiresSlot(MCSubtargetInfo const &STI, diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h index 7b3c079880f8..5c56db14798f 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h @@ -41,7 +41,8 @@ public: namespace Hexagon { -class PacketIterator { +class PacketIterator : public std::iterator<std::forward_iterator_tag, + PacketIterator> { MCInstrInfo const &MCII; MCInst::const_iterator BundleCurrent; MCInst::const_iterator BundleEnd; @@ -188,6 +189,7 @@ bool hasImmExt(MCInst const &MCI); bool hasNewValue(MCInstrInfo const &MCII, MCInst const &MCI); bool hasNewValue2(MCInstrInfo const &MCII, MCInst const &MCI); bool hasTmpDst(MCInstrInfo const &MCII, MCInst const &MCI); +bool hasHvxTmp(MCInstrInfo const &MCII, MCInst const &MCI); unsigned iClassOfDuplexPair(unsigned Ga, unsigned Gb); int64_t minConstant(MCInst const &MCI, size_t Index); diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp 
b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp index d832a756cb92..dfdddb50657c 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp @@ -80,6 +80,8 @@ cl::opt<bool> MV67T("mv67t", cl::Hidden, cl::desc("Build for Hexagon V67T"), cl::init(false)); cl::opt<bool> MV68("mv68", cl::Hidden, cl::desc("Build for Hexagon V68"), cl::init(false)); +cl::opt<bool> MV69("mv69", cl::Hidden, cl::desc("Build for Hexagon V69"), + cl::init(false)); cl::opt<Hexagon::ArchEnum> EnableHVX("mhvx", @@ -91,6 +93,7 @@ cl::opt<Hexagon::ArchEnum> clEnumValN(Hexagon::ArchEnum::V66, "v66", "Build for HVX v66"), clEnumValN(Hexagon::ArchEnum::V67, "v67", "Build for HVX v67"), clEnumValN(Hexagon::ArchEnum::V68, "v68", "Build for HVX v68"), + clEnumValN(Hexagon::ArchEnum::V69, "v69", "Build for HVX v69"), // Sentinel for no value specified. clEnumValN(Hexagon::ArchEnum::Generic, "", "")), // Sentinel for flag not present. 
@@ -101,6 +104,11 @@ static cl::opt<bool> DisableHVX("mno-hvx", cl::Hidden, cl::desc("Disable Hexagon Vector eXtensions")); +static cl::opt<bool> + EnableHvxIeeeFp("mhvx-ieee-fp", cl::Hidden, + cl::desc("Enable HVX IEEE floating point extensions")); +static cl::opt<bool> EnableHexagonCabac + ("mcabac", cl::desc("tbd"), cl::init(false)); static StringRef DefaultArch = "hexagonv60"; @@ -123,6 +131,8 @@ static StringRef HexagonGetArchVariant() { return "hexagonv67t"; if (MV68) return "hexagonv68"; + if (MV69) + return "hexagonv69"; return ""; } @@ -371,6 +381,9 @@ std::string selectHexagonFS(StringRef CPU, StringRef FS) { case Hexagon::ArchEnum::V68: Result.push_back("+hvxv68"); break; + case Hexagon::ArchEnum::V69: + Result.push_back("+hvxv69"); + break; case Hexagon::ArchEnum::Generic:{ Result.push_back(StringSwitch<StringRef>(CPU) .Case("hexagonv60", "+hvxv60") @@ -379,13 +392,19 @@ std::string selectHexagonFS(StringRef CPU, StringRef FS) { .Case("hexagonv66", "+hvxv66") .Case("hexagonv67", "+hvxv67") .Case("hexagonv67t", "+hvxv67") - .Case("hexagonv68", "+hvxv68")); + .Case("hexagonv68", "+hvxv68") + .Case("hexagonv69", "+hvxv69")); break; } case Hexagon::ArchEnum::NoArch: // Sentinel if -mhvx isn't specified break; } + if (EnableHvxIeeeFp) + Result.push_back("+hvx-ieee-fp"); + if (EnableHexagonCabac) + Result.push_back("+cabac"); + return join(Result.begin(), Result.end(), ","); } } @@ -422,8 +441,8 @@ FeatureBitset Hexagon_MC::completeHVXFeatures(const FeatureBitset &S) { // turns on hvxvNN, corresponding to the existing ArchVNN. 
FeatureBitset FB = S; unsigned CpuArch = ArchV5; - for (unsigned F : {ArchV68, ArchV67, ArchV66, ArchV65, ArchV62, ArchV60, - ArchV55, ArchV5}) { + for (unsigned F : {ArchV69, ArchV68, ArchV67, ArchV66, ArchV65, ArchV62, + ArchV60, ArchV55, ArchV5}) { if (!FB.test(F)) continue; CpuArch = F; @@ -438,7 +457,8 @@ FeatureBitset Hexagon_MC::completeHVXFeatures(const FeatureBitset &S) { } bool HasHvxVer = false; for (unsigned F : {ExtensionHVXV60, ExtensionHVXV62, ExtensionHVXV65, - ExtensionHVXV66, ExtensionHVXV67, ExtensionHVXV68}) { + ExtensionHVXV66, ExtensionHVXV67, ExtensionHVXV68, + ExtensionHVXV69}) { if (!FB.test(F)) continue; HasHvxVer = true; @@ -451,6 +471,9 @@ FeatureBitset Hexagon_MC::completeHVXFeatures(const FeatureBitset &S) { // HasHvxVer is false, and UseHvx is true. switch (CpuArch) { + case ArchV69: + FB.set(ExtensionHVXV69); + LLVM_FALLTHROUGH; case ArchV68: FB.set(ExtensionHVXV68); LLVM_FALLTHROUGH; @@ -538,6 +561,7 @@ unsigned Hexagon_MC::GetELFFlags(const MCSubtargetInfo &STI) { {"hexagonv67", ELF::EF_HEXAGON_MACH_V67}, {"hexagonv67t", ELF::EF_HEXAGON_MACH_V67T}, {"hexagonv68", ELF::EF_HEXAGON_MACH_V68}, + {"hexagonv69", ELF::EF_HEXAGON_MACH_V69}, }; auto F = ElfFlags.find(STI.getCPU()); diff --git a/llvm/lib/Target/M68k/M68kInstrControl.td b/llvm/lib/Target/M68k/M68kInstrControl.td index 708474726861..9f87833ab0e2 100644 --- a/llvm/lib/Target/M68k/M68kInstrControl.td +++ b/llvm/lib/Target/M68k/M68kInstrControl.td @@ -118,13 +118,13 @@ def SET#"p8"#cc : MxSccM<cc, MxType8.POp, MxType8.PPat, MxEncEAp_0, MxExtI16_0>; /// 0 1 0 0 1 1 1 0 1 1 | MODE | REG ///------------------------------+---------+--------- let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in -class MxJMP<MxOperand LOCOp, ComplexPattern LOCPat, MxEncEA EA, MxEncExt EXT> +class MxJMP<MxOperand LOCOp, MxEncEA EA, MxEncExt EXT> : MxInst<(outs), (ins LOCOp:$dst), "jmp\t$dst", [(brind iPTR:$dst)], MxEncoding<EA.Reg, EA.DA, EA.Mode, MxBead2Bits<0b11>, 
MxBead4Bits<0b1110>, MxBead4Bits<0b0100>, EXT.Imm, EXT.B8, EXT.Scale, EXT.WL, EXT.DAReg>>; -def JMP32j : MxJMP<MxARI32, MxCP_ARI, MxEncEAj_0, MxExtEmpty>; +def JMP32j : MxJMP<MxARI32, MxEncEAj_0, MxExtEmpty>; // FIXME Support 16 bit indirect jump. @@ -147,17 +147,17 @@ def JMP32j : MxJMP<MxARI32, MxCP_ARI, MxEncEAj_0, MxExtEmpty>; /// 32-BIT DISPLACEMENT IF 8-BIT DISPLACEMENT = $FF /// -------------------------------------------------- let isBranch = 1, isTerminator = 1, Uses = [CCR] in -class MxBcc<string cc, Operand TARGET, MxType TYPE, MxEncoding ENC = MxEncEmpty> +class MxBcc<string cc, Operand TARGET, MxEncoding ENC = MxEncEmpty> : MxInst<(outs), (ins TARGET:$dst), "b"#cc#"\t$dst", [], ENC>; foreach cc = [ "cc", "ls", "lt", "eq", "mi", "ne", "ge", "cs", "pl", "gt", "hi", "vc", "le", "vs"] in { def B#cc#"8" - : MxBcc<cc, MxBrTarget8, MxType8, + : MxBcc<cc, MxBrTarget8, MxEncoding<MxBead8Disp<0>, !cast<MxBead4Bits>("MxCC"#cc), MxBead4Bits<0x6>>>; def B#cc#"16" - : MxBcc<cc, MxBrTarget16, MxType16, + : MxBcc<cc, MxBrTarget16, MxEncoding<MxBead4Bits<0x0>, MxBead4Bits<0x0>, !cast<MxBead4Bits>("MxCC"#cc), MxBead4Bits<0x6>, MxBead16Imm<0>>>; @@ -179,13 +179,13 @@ def : Pat<(MxBrCond bb:$target, !cast<PatLeaf>("MxCOND"#cc), CCR), /// 32-BIT DISPLACEMENT IF 8-BIT DISPLACEMENT = $FF /// ------------------------------------------------- let isBranch = 1, isTerminator = 1, isBarrier=1 in -class MxBra<Operand TARGET, MxType TYPE, MxEncoding ENC = MxEncEmpty> +class MxBra<Operand TARGET, MxEncoding ENC = MxEncEmpty> : MxInst<(outs), (ins TARGET:$dst), "bra\t$dst", [], ENC>; -def BRA8 : MxBra<MxBrTarget8, MxType8, +def BRA8 : MxBra<MxBrTarget8, MxEncoding<MxBead8Disp<0>, MxBead4Bits<0x0>, MxBead4Bits<0x6>>>; -def BRA16 : MxBra<MxBrTarget16, MxType16, +def BRA16 : MxBra<MxBrTarget16, MxEncoding<MxBead4Bits<0x0>, MxBead4Bits<0x0>, MxBead4Bits<0x0>, MxBead4Bits<0x6>, MxBead16Imm<0>>>; diff --git a/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp 
b/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp index 2a77a150f9aa..4ef9a567d453 100644 --- a/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp +++ b/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp @@ -189,8 +189,8 @@ bool MSP430FrameLowering::spillCalleeSavedRegisters( MSP430MachineFunctionInfo *MFI = MF.getInfo<MSP430MachineFunctionInfo>(); MFI->setCalleeSavedFrameSize(CSI.size() * 2); - for (unsigned i = CSI.size(); i != 0; --i) { - unsigned Reg = CSI[i-1].getReg(); + for (const CalleeSavedInfo &I : llvm::reverse(CSI)) { + unsigned Reg = I.getReg(); // Add the callee-saved register as live-in. It's killed at the spill. MBB.addLiveIn(Reg); BuildMI(MBB, MI, DL, TII.get(MSP430::PUSH16r)) diff --git a/llvm/lib/Target/Mips/Mips16HardFloat.cpp b/llvm/lib/Target/Mips/Mips16HardFloat.cpp index 203e05dde7ad..419f0ac1a8a7 100644 --- a/llvm/lib/Target/Mips/Mips16HardFloat.cpp +++ b/llvm/lib/Target/Mips/Mips16HardFloat.cpp @@ -479,14 +479,12 @@ static void createFPFnStub(Function *F, Module *M, FPParamVariant PV, // remove the use-soft-float attribute static void removeUseSoftFloat(Function &F) { - AttrBuilder B; LLVM_DEBUG(errs() << "removing -use-soft-float\n"); - B.addAttribute("use-soft-float", "false"); - F.removeFnAttrs(B); + F.removeFnAttr("use-soft-float"); if (F.hasFnAttribute("use-soft-float")) { LLVM_DEBUG(errs() << "still has -use-soft-float\n"); } - F.addFnAttrs(B); + F.addFnAttr("use-soft-float", "false"); } // This pass only makes sense when the underlying chip has floating point but diff --git a/llvm/lib/Target/Mips/MipsBranchExpansion.cpp b/llvm/lib/Target/Mips/MipsBranchExpansion.cpp index aa8e298fa759..4e9a23d077da 100644 --- a/llvm/lib/Target/Mips/MipsBranchExpansion.cpp +++ b/llvm/lib/Target/Mips/MipsBranchExpansion.cpp @@ -36,7 +36,7 @@ /// /// Regarding compact branch hazard prevention: /// -/// Hazards handled: forbidden slots for MIPSR6. +/// Hazards handled: forbidden slots for MIPSR6, FPU slots for MIPS3 and below. 
/// /// A forbidden slot hazard occurs when a compact branch instruction is executed /// and the adjacent instruction in memory is a control transfer instruction @@ -160,7 +160,10 @@ private: bool buildProperJumpMI(MachineBasicBlock *MBB, MachineBasicBlock::iterator Pos, DebugLoc DL); void expandToLongBranch(MBBInfo &Info); + template <typename Pred, typename Safe> + bool handleSlot(Pred Predicate, Safe SafeInSlot); bool handleForbiddenSlot(); + bool handleFPUDelaySlot(); bool handlePossibleLongBranch(); const MipsSubtarget *STI; @@ -738,30 +741,27 @@ static void emitGPDisp(MachineFunction &F, const MipsInstrInfo *TII) { MBB.removeLiveIn(Mips::V0); } -bool MipsBranchExpansion::handleForbiddenSlot() { - // Forbidden slot hazards are only defined for MIPSR6 but not microMIPSR6. - if (!STI->hasMips32r6() || STI->inMicroMipsMode()) - return false; - +template <typename Pred, typename Safe> +bool MipsBranchExpansion::handleSlot(Pred Predicate, Safe SafeInSlot) { bool Changed = false; for (MachineFunction::iterator FI = MFp->begin(); FI != MFp->end(); ++FI) { for (Iter I = FI->begin(); I != FI->end(); ++I) { - // Forbidden slot hazard handling. Use lookahead over state. - if (!TII->HasForbiddenSlot(*I)) + // Delay slot hazard handling. Use lookahead over state. 
+ if (!Predicate(*I)) continue; - Iter Inst; + Iter IInSlot; bool LastInstInFunction = std::next(I) == FI->end() && std::next(FI) == MFp->end(); if (!LastInstInFunction) { std::pair<Iter, bool> Res = getNextMachineInstr(std::next(I), &*FI); LastInstInFunction |= Res.second; - Inst = Res.first; + IInSlot = Res.first; } - if (LastInstInFunction || !TII->SafeInForbiddenSlot(*Inst)) { + if (LastInstInFunction || !SafeInSlot(*IInSlot, *I)) { MachineBasicBlock::instr_iterator Iit = I->getIterator(); if (std::next(Iit) == FI->end() || @@ -778,6 +778,29 @@ bool MipsBranchExpansion::handleForbiddenSlot() { return Changed; } +bool MipsBranchExpansion::handleForbiddenSlot() { + // Forbidden slot hazards are only defined for MIPSR6 but not microMIPSR6. + if (!STI->hasMips32r6() || STI->inMicroMipsMode()) + return false; + + return handleSlot( + [this](auto &I) -> bool { return TII->HasForbiddenSlot(I); }, + [this](auto &IInSlot, auto &I) -> bool { + return TII->SafeInForbiddenSlot(IInSlot); + }); +} + +bool MipsBranchExpansion::handleFPUDelaySlot() { + // FPU delay slots are only defined for MIPS3 and below. 
+ if (STI->hasMips32() || STI->hasMips4()) + return false; + + return handleSlot([this](auto &I) -> bool { return TII->HasFPUDelaySlot(I); }, + [this](auto &IInSlot, auto &I) -> bool { + return TII->SafeInFPUDelaySlot(IInSlot, I); + }); +} + bool MipsBranchExpansion::handlePossibleLongBranch() { if (STI->inMips16Mode() || !STI->enableLongBranchPass()) return false; @@ -857,13 +880,16 @@ bool MipsBranchExpansion::runOnMachineFunction(MachineFunction &MF) { // Run these two at least once bool longBranchChanged = handlePossibleLongBranch(); bool forbiddenSlotChanged = handleForbiddenSlot(); + bool fpuDelaySlotChanged = handleFPUDelaySlot(); - bool Changed = longBranchChanged || forbiddenSlotChanged; + bool Changed = + longBranchChanged || forbiddenSlotChanged || fpuDelaySlotChanged; // Then run them alternatively while there are changes while (forbiddenSlotChanged) { longBranchChanged = handlePossibleLongBranch(); - if (!longBranchChanged) + fpuDelaySlotChanged = handleFPUDelaySlot(); + if (!longBranchChanged && !fpuDelaySlotChanged) break; forbiddenSlotChanged = handleForbiddenSlot(); } diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp index 4f364ef6afc7..9377e83524e1 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.cpp +++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -4121,7 +4121,7 @@ MipsTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, case 'd': // Address register. Same as 'r' unless generating MIPS16 code. case 'y': // Same as 'r'. Exists for compatibility. 
case 'r': - if (VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8) { + if (VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8 || VT == MVT::i1) { if (Subtarget.inMips16Mode()) return std::make_pair(0U, &Mips::CPU16RegsRegClass); return std::make_pair(0U, &Mips::GPR32RegClass); diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.cpp b/llvm/lib/Target/Mips/MipsInstrInfo.cpp index 94828a976695..2bf8562895d7 100644 --- a/llvm/lib/Target/Mips/MipsInstrInfo.cpp +++ b/llvm/lib/Target/Mips/MipsInstrInfo.cpp @@ -568,11 +568,60 @@ bool MipsInstrInfo::SafeInForbiddenSlot(const MachineInstr &MI) const { return (MI.getDesc().TSFlags & MipsII::IsCTI) == 0; } +bool MipsInstrInfo::SafeInFPUDelaySlot(const MachineInstr &MIInSlot, + const MachineInstr &FPUMI) const { + if (MIInSlot.isInlineAsm()) + return false; + + if (HasFPUDelaySlot(MIInSlot)) + return false; + + switch (MIInSlot.getOpcode()) { + case Mips::BC1F: + case Mips::BC1FL: + case Mips::BC1T: + case Mips::BC1TL: + return false; + } + + for (const MachineOperand &Op : FPUMI.defs()) { + if (!Op.isReg()) + continue; + + bool Reads, Writes; + std::tie(Reads, Writes) = MIInSlot.readsWritesVirtualRegister(Op.getReg()); + + if (Reads || Writes) + return false; + } + + return true; +} + /// Predicate for distingushing instructions that have forbidden slots. bool MipsInstrInfo::HasForbiddenSlot(const MachineInstr &MI) const { return (MI.getDesc().TSFlags & MipsII::HasForbiddenSlot) != 0; } +/// Predicate for distingushing instructions that have FPU delay slots. +bool MipsInstrInfo::HasFPUDelaySlot(const MachineInstr &MI) const { + switch (MI.getOpcode()) { + case Mips::MTC1: + case Mips::MFC1: + case Mips::MTC1_D64: + case Mips::MFC1_D64: + case Mips::DMTC1: + case Mips::DMFC1: + case Mips::FCMP_S32: + case Mips::FCMP_D32: + case Mips::FCMP_D64: + return true; + + default: + return false; + } +} + /// Return the number of bytes of code the specified instruction may be. 
unsigned MipsInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { switch (MI.getOpcode()) { diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.h b/llvm/lib/Target/Mips/MipsInstrInfo.h index c96ed202df30..46c1b73d512f 100644 --- a/llvm/lib/Target/Mips/MipsInstrInfo.h +++ b/llvm/lib/Target/Mips/MipsInstrInfo.h @@ -92,9 +92,16 @@ public: /// Predicate to determine if an instruction can go in a forbidden slot. bool SafeInForbiddenSlot(const MachineInstr &MI) const; + /// Predicate to determine if an instruction can go in an FPU delay slot. + bool SafeInFPUDelaySlot(const MachineInstr &MIInSlot, + const MachineInstr &FPUMI) const; + /// Predicate to determine if an instruction has a forbidden slot. bool HasForbiddenSlot(const MachineInstr &MI) const; + /// Predicate to determine if an instruction has an FPU delay slot. + bool HasFPUDelaySlot(const MachineInstr &MI) const; + /// Insert nop instruction when hazard condition is found void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override; diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index c35e67d6726f..16add48d4602 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -1098,10 +1098,10 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, O << " .attribute(.managed)"; } - if (GVar->getAlignment() == 0) - O << " .align " << (int)DL.getPrefTypeAlignment(ETy); + if (MaybeAlign A = GVar->getAlign()) + O << " .align " << A->value(); else - O << " .align " << GVar->getAlignment(); + O << " .align " << (int)DL.getPrefTypeAlignment(ETy); if (ETy->isFloatingPointTy() || ETy->isPointerTy() || (ETy->isIntegerTy() && ETy->getScalarSizeInBits() <= 64)) { @@ -1290,10 +1290,10 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar, O << "."; emitPTXAddressSpace(GVar->getType()->getAddressSpace(), O); - if (GVar->getAlignment() == 0) - O << " .align " << 
(int)DL.getPrefTypeAlignment(ETy); + if (MaybeAlign A = GVar->getAlign()) + O << " .align " << A->value(); else - O << " .align " << GVar->getAlignment(); + O << " .align " << (int)DL.getPrefTypeAlignment(ETy); // Special case for i128 if (ETy->isIntegerTy(128)) { diff --git a/llvm/lib/Target/NVPTX/NVPTXPeephole.cpp b/llvm/lib/Target/NVPTX/NVPTXPeephole.cpp index 1f3b4c9440d8..bf3c87df2e08 100644 --- a/llvm/lib/Target/NVPTX/NVPTXPeephole.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXPeephole.cpp @@ -126,9 +126,9 @@ static void CombineCVTAToLocal(MachineInstr &Root) { // Check if MRI has only one non dbg use, which is Root if (MRI.hasOneNonDBGUse(Prev.getOperand(0).getReg())) { - Prev.eraseFromParentAndMarkDBGValuesForRemoval(); + Prev.eraseFromParent(); } - Root.eraseFromParentAndMarkDBGValuesForRemoval(); + Root.eraseFromParent(); } bool NVPTXPeephole::runOnMachineFunction(MachineFunction &MF) { @@ -157,7 +157,7 @@ bool NVPTXPeephole::runOnMachineFunction(MachineFunction &MF) { const auto &MRI = MF.getRegInfo(); if (MRI.use_empty(NRI->getFrameRegister(MF))) { if (auto MI = MRI.getUniqueVRegDef(NRI->getFrameRegister(MF))) { - MI->eraseFromParentAndMarkDBGValuesForRemoval(); + MI->eraseFromParent(); } } diff --git a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp index 9e181d4052d6..ded922329ebf 100644 --- a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp +++ b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp @@ -1576,6 +1576,16 @@ bool PPCAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, std::swap(Operands[2], Operands[1]); } + // Handle base mnemonic for atomic loads where the EH bit is zero. 
+ if (Name == "lqarx" || Name == "ldarx" || Name == "lwarx" || + Name == "lharx" || Name == "lbarx") { + if (Operands.size() != 5) + return false; + PPCOperand &EHOp = (PPCOperand &)*Operands[4]; + if (EHOp.isU1Imm() && EHOp.getImm() == 0) + Operands.pop_back(); + } + return false; } @@ -1745,7 +1755,7 @@ unsigned PPCAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, } PPCOperand &Op = static_cast<PPCOperand &>(AsmOp); - if (Op.isImm() && Op.getImm() == ImmVal) + if (Op.isU3Imm() && Op.getImm() == ImmVal) return Match_Success; return Match_InvalidOperand; diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp index 22b948a83c34..d6e02d0d0862 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp @@ -28,6 +28,7 @@ #include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCELFStreamer.h" #include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInstrAnalysis.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCRegisterInfo.h" @@ -368,6 +369,31 @@ static MCInstPrinter *createPPCMCInstPrinter(const Triple &T, return new PPCInstPrinter(MAI, MII, MRI, T); } +namespace { + +class PPCMCInstrAnalysis : public MCInstrAnalysis { +public: + explicit PPCMCInstrAnalysis(const MCInstrInfo *Info) + : MCInstrAnalysis(Info) {} + + bool evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size, + uint64_t &Target) const override { + unsigned NumOps = Inst.getNumOperands(); + if (NumOps == 0 || + Info->get(Inst.getOpcode()).OpInfo[NumOps - 1].OperandType != + MCOI::OPERAND_PCREL) + return false; + Target = Addr + Inst.getOperand(NumOps - 1).getImm() * Size; + return true; + } +}; + +} // end anonymous namespace + +static MCInstrAnalysis *createPPCMCInstrAnalysis(const MCInstrInfo *Info) { + return new PPCMCInstrAnalysis(Info); +} + extern "C" LLVM_EXTERNAL_VISIBILITY void 
LLVMInitializePowerPCTargetMC() { for (Target *T : {&getThePPC32Target(), &getThePPC32LETarget(), &getThePPC64Target(), &getThePPC64LETarget()}) { @@ -383,6 +409,9 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCTargetMC() { // Register the MC subtarget info. TargetRegistry::RegisterMCSubtargetInfo(*T, createPPCMCSubtargetInfo); + // Register the MC instruction analyzer. + TargetRegistry::RegisterMCInstrAnalysis(*T, createPPCMCInstrAnalysis); + // Register the MC Code Emitter TargetRegistry::RegisterMCCodeEmitter(*T, createPPCMCCodeEmitter); diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td index 422bd11dca52..bbd5f5fd1941 100644 --- a/llvm/lib/Target/PowerPC/PPC.td +++ b/llvm/lib/Target/PowerPC/PPC.td @@ -219,6 +219,10 @@ def FeatureZeroMoveFusion: SubtargetFeature<"fuse-zeromove", "HasZeroMoveFusion", "true", "Target supports move to SPR with branch fusion", [FeatureFusion]>; +def FeatureBack2BackFusion: + SubtargetFeature<"fuse-back2back", "HasBack2BackFusion", "true", + "Target supports general back to back fusion", + [FeatureFusion]>; def FeatureUnalignedFloats : SubtargetFeature<"allow-unaligned-fp-access", "AllowsUnalignedFPAccess", "true", "CPU does not trap on unaligned FP access">; diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 16e3b2b85c2e..f26c15667a0b 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -347,7 +347,6 @@ bool PPCAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, // At the moment, all inline asm memory operands are a single register. // In any case, the output of this routine should always be just one // assembler operand. 
- bool PPCAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &O) { diff --git a/llvm/lib/Target/PowerPC/PPCBack2BackFusion.def b/llvm/lib/Target/PowerPC/PPCBack2BackFusion.def new file mode 100644 index 000000000000..38ed5f2e78e3 --- /dev/null +++ b/llvm/lib/Target/PowerPC/PPCBack2BackFusion.def @@ -0,0 +1,1042 @@ +// Automatically generated file, do not edit! +// +// This file defines instruction list for general back2back fusion. +//===----------------------------------------------------------------------===// +FUSION_FEATURE(GeneralBack2Back, hasBack2BackFusion, -1, + FUSION_OP_SET(ADD4, + ADD4O, + ADD4TLS, + ADD4_rec, + ADD8, + ADD8O, + ADD8TLS, + ADD8TLS_, + ADD8_rec, + ADDE, + ADDE8, + ADDE8O, + ADDEO, + ADDEX, + ADDEX8, + ADDI, + ADDI8, + ADDIC, + ADDIC8, + ADDIS, + ADDIS8, + ADDISdtprelHA32, + ADDIStocHA, + ADDIStocHA8, + ADDIdtprelL32, + ADDItlsldLADDR32, + ADDItocL, + ADDME, + ADDME8, + ADDME8O, + ADDMEO, + ADDZE, + ADDZE8, + ADDZE8O, + ADDZEO, + AND, + AND8, + AND8_rec, + ANDC, + ANDC8, + ANDC8_rec, + ANDC_rec, + ANDI8_rec, + ANDIS8_rec, + ANDIS_rec, + ANDI_rec, + AND_rec, + CMPB, + CMPB8, + CNTLZD, + CNTLZD_rec, + CNTLZW, + CNTLZW8, + CNTLZW8_rec, + CNTLZW_rec, + CNTTZD, + CNTTZD_rec, + CNTTZW, + CNTTZW8, + CNTTZW8_rec, + CNTTZW_rec, + EQV, + EQV8, + EQV8_rec, + EQV_rec, + EXTSB, + EXTSB8, + EXTSB8_32_64, + EXTSB8_rec, + EXTSB_rec, + EXTSH, + EXTSH8, + EXTSH8_32_64, + EXTSH8_rec, + EXTSH_rec, + EXTSW, + EXTSWSLI, + EXTSWSLI_32_64, + EXTSWSLI_32_64_rec, + EXTSWSLI_rec, + EXTSW_32, + EXTSW_32_64, + EXTSW_32_64_rec, + EXTSW_rec, + FABSD, + FABSS, + FCPSGND, + FCPSGNS, + FMR, + FNABSD, + FNABSS, + FNEGD, + FNEGS, + ISEL, + ISEL8, + LI, + LI8, + LIS, + LIS8, + MFCTR, + MFCTR8, + MFLR, + MFLR8, + MFOCRF, + MFOCRF8, + MFVRD, + MFVRWZ, + MFVSRD, + MFVSRWZ, + MTVRD, + MTVRWA, + MTVRWZ, + MTVSRBM, + MTVSRBMI, + MTVSRD, + MTVSRDM, + MTVSRHM, + MTVSRQM, + MTVSRWA, + MTVSRWM, + MTVSRWZ, + NAND, + NAND8, + 
NAND8_rec, + NAND_rec, + NEG, + NEG8, + NEG8O, + NEG8_rec, + NEGO, + NEG_rec, + NOP, + NOP_GT_PWR6, + NOP_GT_PWR7, + NOR, + NOR8, + NOR8_rec, + NOR_rec, + OR, + OR8, + OR8_rec, + ORC, + ORC8, + ORC8_rec, + ORC_rec, + ORI, + ORI8, + ORIS, + ORIS8, + OR_rec, + POPCNTB, + POPCNTB8, + POPCNTD, + POPCNTW, + RLDCL, + RLDCL_rec, + RLDCR, + RLDCR_rec, + RLDIC, + RLDICL, + RLDICL_32, + RLDICL_32_64, + RLDICL_32_rec, + RLDICL_rec, + RLDICR, + RLDICR_32, + RLDICR_rec, + RLDIC_rec, + RLDIMI, + RLDIMI_rec, + RLWIMI, + RLWIMI8, + RLWIMI8_rec, + RLWIMI_rec, + RLWINM, + RLWINM8, + RLWINM8_rec, + RLWINM_rec, + RLWNM, + RLWNM8, + RLWNM8_rec, + RLWNM_rec, + SETB, + SETB8, + SETBC, + SETBC8, + SETBCR, + SETBCR8, + SETNBC, + SETNBC8, + SETNBCR, + SETNBCR8, + SLD, + SLD_rec, + SLW, + SLW8, + SLW8_rec, + SLW_rec, + SRAD, + SRADI, + SRADI_32, + SRAW, + SRAWI, + SRD, + SRD_rec, + SRW, + SRW8, + SRW8_rec, + SRW_rec, + SUBF, + SUBF8, + SUBF8O, + SUBF8_rec, + SUBFE, + SUBFE8, + SUBFE8O, + SUBFEO, + SUBFIC, + SUBFIC8, + SUBFME, + SUBFME8, + SUBFME8O, + SUBFMEO, + SUBFO, + SUBFZE, + SUBFZE8, + SUBFZE8O, + SUBFZEO, + SUBF_rec, + VABSDUB, + VABSDUH, + VABSDUW, + VADDCUW, + VADDSBS, + VADDSHS, + VADDSWS, + VADDUBM, + VADDUBS, + VADDUDM, + VADDUHM, + VADDUHS, + VADDUWM, + VADDUWS, + VAND, + VANDC, + VAVGSB, + VAVGSH, + VAVGSW, + VAVGUB, + VAVGUH, + VAVGUW, + VCLZB, + VCLZD, + VCLZH, + VCLZW, + VCMPBFP, + VCMPBFP_rec, + VCMPEQFP, + VCMPEQFP_rec, + VCMPEQUB, + VCMPEQUB_rec, + VCMPEQUD, + VCMPEQUD_rec, + VCMPEQUH, + VCMPEQUH_rec, + VCMPEQUQ, + VCMPEQUQ_rec, + VCMPEQUW, + VCMPEQUW_rec, + VCMPGEFP, + VCMPGEFP_rec, + VCMPGTFP, + VCMPGTFP_rec, + VCMPGTSB, + VCMPGTSB_rec, + VCMPGTSD, + VCMPGTSD_rec, + VCMPGTSH, + VCMPGTSH_rec, + VCMPGTSQ, + VCMPGTSQ_rec, + VCMPGTSW, + VCMPGTSW_rec, + VCMPGTUB, + VCMPGTUB_rec, + VCMPGTUD, + VCMPGTUD_rec, + VCMPGTUH, + VCMPGTUH_rec, + VCMPGTUQ, + VCMPGTUQ_rec, + VCMPGTUW, + VCMPGTUW_rec, + VCMPNEB, + VCMPNEB_rec, + VCMPNEH, + VCMPNEH_rec, + VCMPNEW, + VCMPNEW_rec, + 
VCMPNEZB, + VCMPNEZB_rec, + VCMPNEZH, + VCMPNEZH_rec, + VCMPNEZW, + VCMPNEZW_rec, + VCNTMBB, + VCNTMBD, + VCNTMBH, + VCNTMBW, + VCTZB, + VCTZD, + VCTZH, + VCTZW, + VEQV, + VEXPANDBM, + VEXPANDDM, + VEXPANDHM, + VEXPANDQM, + VEXPANDWM, + VEXTRACTBM, + VEXTRACTDM, + VEXTRACTHM, + VEXTRACTQM, + VEXTRACTWM, + VEXTSB2D, + VEXTSB2Ds, + VEXTSB2W, + VEXTSB2Ws, + VEXTSD2Q, + VEXTSH2D, + VEXTSH2Ds, + VEXTSH2W, + VEXTSH2Ws, + VEXTSW2D, + VEXTSW2Ds, + VMAXFP, + VMAXSB, + VMAXSD, + VMAXSH, + VMAXSW, + VMAXUB, + VMAXUD, + VMAXUH, + VMAXUW, + VMINFP, + VMINSB, + VMINSD, + VMINSH, + VMINSW, + VMINUB, + VMINUD, + VMINUH, + VMINUW, + VMRGEW, + VMRGOW, + VNAND, + VNEGD, + VNEGW, + VNOR, + VOR, + VORC, + VPOPCNTB, + VPOPCNTD, + VPOPCNTH, + VPOPCNTW, + VPRTYBD, + VPRTYBW, + VRLB, + VRLD, + VRLDMI, + VRLDNM, + VRLH, + VRLW, + VRLWMI, + VRLWNM, + VSEL, + VSHASIGMAD, + VSHASIGMAW, + VSLB, + VSLD, + VSLH, + VSLW, + VSRAB, + VSRAD, + VSRAH, + VSRAW, + VSRB, + VSRD, + VSRH, + VSRW, + VSUBCUW, + VSUBSBS, + VSUBSHS, + VSUBSWS, + VSUBUBM, + VSUBUBS, + VSUBUDM, + VSUBUHM, + VSUBUHS, + VSUBUWM, + VSUBUWS, + VXOR, + V_SET0, + V_SET0B, + V_SET0H, + XOR, + XOR8, + XOR8_rec, + XORI, + XORI8, + XORIS, + XORIS8, + XOR_rec, + XSABSDP, + XSABSQP, + XSCMPEQDP, + XSCMPGEDP, + XSCMPGTDP, + XSCPSGNDP, + XSCPSGNQP, + XSCVHPDP, + XSCVSPDPN, + XSIEXPDP, + XSIEXPQP, + XSMAXCDP, + XSMAXDP, + XSMAXJDP, + XSMINCDP, + XSMINDP, + XSMINJDP, + XSNABSDP, + XSNABSQP, + XSNEGDP, + XSNEGQP, + XSXEXPDP, + XSXEXPQP, + XSXSIGDP, + XVABSDP, + XVABSSP, + XVCMPEQDP, + XVCMPEQDP_rec, + XVCMPEQSP, + XVCMPEQSP_rec, + XVCMPGEDP, + XVCMPGEDP_rec, + XVCMPGESP, + XVCMPGESP_rec, + XVCMPGTDP, + XVCMPGTDP_rec, + XVCMPGTSP, + XVCMPGTSP_rec, + XVCPSGNDP, + XVCPSGNSP, + XVCVHPSP, + XVIEXPDP, + XVIEXPSP, + XVMAXDP, + XVMAXSP, + XVMINDP, + XVMINSP, + XVNABSDP, + XVNABSSP, + XVNEGDP, + XVNEGSP, + XVTSTDCDP, + XVTSTDCSP, + XVXEXPDP, + XVXEXPSP, + XVXSIGDP, + XVXSIGSP, + XXLAND, + XXLANDC, + XXLEQV, + XXLEQVOnes, + XXLNAND, + XXLNOR, + XXLOR, + 
XXLORC, + XXLORf, + XXLXOR, + XXLXORdpz, + XXLXORspz, + XXLXORz, + XXSEL), + FUSION_OP_SET(ADD4, + ADD4O, + ADD4TLS, + ADD4_rec, + ADD8, + ADD8O, + ADD8TLS, + ADD8TLS_, + ADD8_rec, + ADDE, + ADDE8, + ADDE8O, + ADDEO, + ADDEX, + ADDEX8, + ADDI, + ADDI8, + ADDIC, + ADDIC8, + ADDIS, + ADDIS8, + ADDISdtprelHA32, + ADDIStocHA, + ADDIStocHA8, + ADDIdtprelL32, + ADDItlsldLADDR32, + ADDItocL, + ADDME, + ADDME8, + ADDME8O, + ADDMEO, + ADDZE, + ADDZE8, + ADDZE8O, + ADDZEO, + AND, + AND8, + AND8_rec, + ANDC, + ANDC8, + ANDC8_rec, + ANDC_rec, + ANDI8_rec, + ANDIS8_rec, + ANDIS_rec, + ANDI_rec, + AND_rec, + CMPB, + CMPB8, + CMPD, + CMPDI, + CMPEQB, + CMPLD, + CMPLDI, + CMPLW, + CMPLWI, + CMPRB, + CMPRB8, + CMPW, + CMPWI, + CNTLZD, + CNTLZD_rec, + CNTLZW, + CNTLZW8, + CNTLZW8_rec, + CNTLZW_rec, + CNTTZD, + CNTTZD_rec, + CNTTZW, + CNTTZW8, + CNTTZW8_rec, + CNTTZW_rec, + CR6SET, + CR6UNSET, + CRAND, + CRANDC, + CREQV, + CRNAND, + CRNOR, + CROR, + CRORC, + CRSET, + CRUNSET, + CRXOR, + DSS, + DSSALL, + DST, + DST64, + DSTST, + DSTST64, + DSTSTT, + DSTSTT64, + DSTT, + DSTT64, + EQV, + EQV8, + EQV8_rec, + EQV_rec, + EXTSB, + EXTSB8, + EXTSB8_32_64, + EXTSB8_rec, + EXTSB_rec, + EXTSH, + EXTSH8, + EXTSH8_32_64, + EXTSH8_rec, + EXTSH_rec, + EXTSW, + EXTSWSLI, + EXTSWSLI_32_64, + EXTSWSLI_32_64_rec, + EXTSWSLI_rec, + EXTSW_32, + EXTSW_32_64, + EXTSW_32_64_rec, + EXTSW_rec, + FABSD, + FABSS, + FCMPOD, + FCMPOS, + FCMPUD, + FCMPUS, + FCPSGND, + FCPSGNS, + FMR, + FNABSD, + FNABSS, + FNEGD, + FNEGS, + FTDIV, + FTSQRT, + ISEL, + ISEL8, + LI, + LI8, + LIS, + LIS8, + MCRF, + MCRXRX, + MFCTR, + MFCTR8, + MFLR, + MFLR8, + MFOCRF, + MFOCRF8, + MFVRD, + MFVRWZ, + MFVSRD, + MFVSRWZ, + MTCTR, + MTCTR8, + MTCTR8loop, + MTCTRloop, + MTLR, + MTLR8, + MTOCRF, + MTOCRF8, + MTVRD, + MTVRWA, + MTVRWZ, + MTVSRBM, + MTVSRBMI, + MTVSRD, + MTVSRDM, + MTVSRHM, + MTVSRQM, + MTVSRWA, + MTVSRWM, + MTVSRWZ, + NAND, + NAND8, + NAND8_rec, + NAND_rec, + NEG, + NEG8, + NEG8O, + NEG8_rec, + NEGO, + NEG_rec, + NOP, + 
NOP_GT_PWR6, + NOP_GT_PWR7, + NOR, + NOR8, + NOR8_rec, + NOR_rec, + OR, + OR8, + OR8_rec, + ORC, + ORC8, + ORC8_rec, + ORC_rec, + ORI, + ORI8, + ORIS, + ORIS8, + OR_rec, + POPCNTB, + POPCNTB8, + POPCNTD, + POPCNTW, + RLDCL, + RLDCL_rec, + RLDCR, + RLDCR_rec, + RLDIC, + RLDICL, + RLDICL_32, + RLDICL_32_64, + RLDICL_32_rec, + RLDICL_rec, + RLDICR, + RLDICR_32, + RLDICR_rec, + RLDIC_rec, + RLDIMI, + RLDIMI_rec, + RLWIMI, + RLWIMI8, + RLWIMI8_rec, + RLWIMI_rec, + RLWINM, + RLWINM8, + RLWINM8_rec, + RLWINM_rec, + RLWNM, + RLWNM8, + RLWNM8_rec, + RLWNM_rec, + SETB, + SETB8, + SETBC, + SETBC8, + SETBCR, + SETBCR8, + SETNBC, + SETNBC8, + SETNBCR, + SETNBCR8, + SLD, + SLD_rec, + SLW, + SLW8, + SLW8_rec, + SLW_rec, + SRAD, + SRADI, + SRADI_32, + SRAW, + SRAWI, + SRD, + SRD_rec, + SRW, + SRW8, + SRW8_rec, + SRW_rec, + SUBF, + SUBF8, + SUBF8O, + SUBF8_rec, + SUBFE, + SUBFE8, + SUBFE8O, + SUBFEO, + SUBFIC, + SUBFIC8, + SUBFME, + SUBFME8, + SUBFME8O, + SUBFMEO, + SUBFO, + SUBFZE, + SUBFZE8, + SUBFZE8O, + SUBFZEO, + SUBF_rec, + TD, + TDI, + TRAP, + TW, + TWI, + VABSDUB, + VABSDUH, + VABSDUW, + VADDCUW, + VADDSBS, + VADDSHS, + VADDSWS, + VADDUBM, + VADDUBS, + VADDUDM, + VADDUHM, + VADDUHS, + VADDUWM, + VADDUWS, + VAND, + VANDC, + VAVGSB, + VAVGSH, + VAVGSW, + VAVGUB, + VAVGUH, + VAVGUW, + VCLZB, + VCLZD, + VCLZH, + VCLZW, + VCMPBFP, + VCMPBFP_rec, + VCMPEQFP, + VCMPEQFP_rec, + VCMPEQUB, + VCMPEQUB_rec, + VCMPEQUD, + VCMPEQUD_rec, + VCMPEQUH, + VCMPEQUH_rec, + VCMPEQUQ, + VCMPEQUQ_rec, + VCMPEQUW, + VCMPEQUW_rec, + VCMPGEFP, + VCMPGEFP_rec, + VCMPGTFP, + VCMPGTFP_rec, + VCMPGTSB, + VCMPGTSB_rec, + VCMPGTSD, + VCMPGTSD_rec, + VCMPGTSH, + VCMPGTSH_rec, + VCMPGTSQ, + VCMPGTSQ_rec, + VCMPGTSW, + VCMPGTSW_rec, + VCMPGTUB, + VCMPGTUB_rec, + VCMPGTUD, + VCMPGTUD_rec, + VCMPGTUH, + VCMPGTUH_rec, + VCMPGTUQ, + VCMPGTUQ_rec, + VCMPGTUW, + VCMPGTUW_rec, + VCMPNEB, + VCMPNEB_rec, + VCMPNEH, + VCMPNEH_rec, + VCMPNEW, + VCMPNEW_rec, + VCMPNEZB, + VCMPNEZB_rec, + VCMPNEZH, + VCMPNEZH_rec, + 
VCMPNEZW, + VCMPNEZW_rec, + VCMPSQ, + VCMPUQ, + VCNTMBB, + VCNTMBD, + VCNTMBH, + VCNTMBW, + VCTZB, + VCTZD, + VCTZH, + VCTZW, + VEQV, + VEXPANDBM, + VEXPANDDM, + VEXPANDHM, + VEXPANDQM, + VEXPANDWM, + VEXTRACTBM, + VEXTRACTDM, + VEXTRACTHM, + VEXTRACTQM, + VEXTRACTWM, + VEXTSB2D, + VEXTSB2Ds, + VEXTSB2W, + VEXTSB2Ws, + VEXTSD2Q, + VEXTSH2D, + VEXTSH2Ds, + VEXTSH2W, + VEXTSH2Ws, + VEXTSW2D, + VEXTSW2Ds, + VMAXFP, + VMAXSB, + VMAXSD, + VMAXSH, + VMAXSW, + VMAXUB, + VMAXUD, + VMAXUH, + VMAXUW, + VMINFP, + VMINSB, + VMINSD, + VMINSH, + VMINSW, + VMINUB, + VMINUD, + VMINUH, + VMINUW, + VMRGEW, + VMRGOW, + VNAND, + VNEGD, + VNEGW, + VNOR, + VOR, + VORC, + VPOPCNTB, + VPOPCNTD, + VPOPCNTH, + VPOPCNTW, + VPRTYBD, + VPRTYBW, + VRLB, + VRLD, + VRLDMI, + VRLDNM, + VRLH, + VRLW, + VRLWMI, + VRLWNM, + VSEL, + VSHASIGMAD, + VSHASIGMAW, + VSLB, + VSLD, + VSLH, + VSLW, + VSRAB, + VSRAD, + VSRAH, + VSRAW, + VSRB, + VSRD, + VSRH, + VSRW, + VSUBCUW, + VSUBSBS, + VSUBSHS, + VSUBSWS, + VSUBUBM, + VSUBUBS, + VSUBUDM, + VSUBUHM, + VSUBUHS, + VSUBUWM, + VSUBUWS, + VXOR, + V_SET0, + V_SET0B, + V_SET0H, + WAIT, + XOR, + XOR8, + XOR8_rec, + XORI, + XORI8, + XORIS, + XORIS8, + XOR_rec, + XSABSDP, + XSABSQP, + XSCMPEQDP, + XSCMPEXPDP, + XSCMPGEDP, + XSCMPGTDP, + XSCMPODP, + XSCMPUDP, + XSCPSGNDP, + XSCPSGNQP, + XSCVHPDP, + XSCVSPDPN, + XSIEXPDP, + XSIEXPQP, + XSMAXCDP, + XSMAXDP, + XSMAXJDP, + XSMINCDP, + XSMINDP, + XSMINJDP, + XSNABSDP, + XSNABSQP, + XSNEGDP, + XSNEGQP, + XSTDIVDP, + XSTSQRTDP, + XSTSTDCDP, + XSTSTDCSP, + XSXEXPDP, + XSXEXPQP, + XSXSIGDP, + XVABSDP, + XVABSSP, + XVCMPEQDP, + XVCMPEQDP_rec, + XVCMPEQSP, + XVCMPEQSP_rec, + XVCMPGEDP, + XVCMPGEDP_rec, + XVCMPGESP, + XVCMPGESP_rec, + XVCMPGTDP, + XVCMPGTDP_rec, + XVCMPGTSP, + XVCMPGTSP_rec, + XVCPSGNDP, + XVCPSGNSP, + XVCVHPSP, + XVIEXPDP, + XVIEXPSP, + XVMAXDP, + XVMAXSP, + XVMINDP, + XVMINSP, + XVNABSDP, + XVNABSSP, + XVNEGDP, + XVNEGSP, + XVTDIVDP, + XVTDIVSP, + XVTLSBB, + XVTSQRTDP, + XVTSQRTSP, + XVTSTDCDP, + XVTSTDCSP, + 
XVXEXPDP, + XVXEXPSP, + XVXSIGDP, + XVXSIGSP, + XXLAND, + XXLANDC, + XXLEQV, + XXLEQVOnes, + XXLNAND, + XXLNOR, + XXLOR, + XXLORC, + XXLORf, + XXLXOR, + XXLXORdpz, + XXLXORspz, + XXLXORz, + XXSEL))
\ No newline at end of file diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index a2664bcff4ab..ba74af5ef5f7 100644 --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -4464,9 +4464,10 @@ bool PPCDAGToDAGISel::trySETCC(SDNode *N) { bool PPCDAGToDAGISel::isOffsetMultipleOf(SDNode *N, unsigned Val) const { LoadSDNode *LDN = dyn_cast<LoadSDNode>(N); StoreSDNode *STN = dyn_cast<StoreSDNode>(N); + MemIntrinsicSDNode *MIN = dyn_cast<MemIntrinsicSDNode>(N); SDValue AddrOp; - if (LDN) - AddrOp = LDN->getOperand(1); + if (LDN || (MIN && MIN->getOpcode() == PPCISD::LD_SPLAT)) + AddrOp = N->getOperand(1); else if (STN) AddrOp = STN->getOperand(2); @@ -5973,6 +5974,15 @@ void PPCDAGToDAGISel::Select(SDNode *N) { if (Type != MVT::v16i8 && Type != MVT::v8i16) break; + // If the alignment for the load is 16 or bigger, we don't need the + // permutated mask to get the required value. The value must be the 0 + // element in big endian target or 7/15 in little endian target in the + // result vsx register of lvx instruction. + // Select the instruction in the .td file. + if (cast<MemIntrinsicSDNode>(N)->getAlign() >= Align(16) && + isOffsetMultipleOf(N, 16)) + break; + SDValue ZeroReg = CurDAG->getRegister(Subtarget->isPPC64() ? PPC::ZERO8 : PPC::ZERO, Subtarget->isPPC64() ? MVT::i64 : MVT::i32); diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index ec7e30d7e362..8d6edf07bc53 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -3500,15 +3500,16 @@ SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { if (LHS.getValueType() == MVT::v2i64) { // Equality can be handled by casting to the legal type for Altivec // comparisons, everything else needs to be expanded. 
- if (CC == ISD::SETEQ || CC == ISD::SETNE) { - return DAG.getNode( - ISD::BITCAST, dl, MVT::v2i64, - DAG.getSetCC(dl, MVT::v4i32, - DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, LHS), - DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, RHS), CC)); - } - - return SDValue(); + if (CC != ISD::SETEQ && CC != ISD::SETNE) + return SDValue(); + SDValue SetCC32 = DAG.getSetCC( + dl, MVT::v4i32, DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, LHS), + DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, RHS), CC); + int ShuffV[] = {1, 0, 3, 2}; + SDValue Shuff = + DAG.getVectorShuffle(MVT::v4i32, dl, SetCC32, SetCC32, ShuffV); + return DAG.getBitcast( + MVT::v2i64, DAG.getNode(ISD::AND, dl, MVT::v4i32, Shuff, SetCC32)); } // We handle most of these in the usual way. @@ -6206,20 +6207,13 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( ArgOffset += PtrByteSize; continue; } - // Copy entire object into memory. There are cases where gcc-generated - // code assumes it is there, even if it could be put entirely into - // registers. (This is not what the doc says.) - - // FIXME: The above statement is likely due to a misunderstanding of the - // documents. All arguments must be copied into the parameter area BY - // THE CALLEE in the event that the callee takes the address of any - // formal argument. That has not yet been implemented. However, it is - // reasonable to use the stack area as a staging area for the register - // load. - - // Skip this for small aggregates, as we will use the same slot for a - // right-justified copy, below. - if (Size >= 8) + // Copy the object to parameter save area if it can not be entirely passed + // by registers. + // FIXME: we only need to copy the parts which need to be passed in + // parameter save area. For the parts passed by registers, we don't need + // to copy them to the stack although we need to allocate space for them + // in parameter save area. 
+ if ((NumGPRs - GPR_idx) * PtrByteSize < Size) Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff, CallSeqStart, Flags, DAG, dl); @@ -17548,14 +17542,14 @@ unsigned PPCTargetLowering::computeMOFlags(const SDNode *Parent, SDValue N, if (Subtarget.isISA3_1() && ((ParentOp == ISD::INTRINSIC_W_CHAIN) || (ParentOp == ISD::INTRINSIC_VOID))) { unsigned ID = cast<ConstantSDNode>(Parent->getOperand(1))->getZExtValue(); - assert( - ((ID == Intrinsic::ppc_vsx_lxvp) || (ID == Intrinsic::ppc_vsx_stxvp)) && - "Only the paired load and store (lxvp/stxvp) intrinsics are valid."); - SDValue IntrinOp = (ID == Intrinsic::ppc_vsx_lxvp) ? Parent->getOperand(2) - : Parent->getOperand(3); - computeFlagsForAddressComputation(IntrinOp, FlagSet, DAG); - FlagSet |= PPC::MOF_Vector; - return FlagSet; + if ((ID == Intrinsic::ppc_vsx_lxvp) || (ID == Intrinsic::ppc_vsx_stxvp)) { + SDValue IntrinOp = (ID == Intrinsic::ppc_vsx_lxvp) + ? Parent->getOperand(2) + : Parent->getOperand(3); + computeFlagsForAddressComputation(IntrinOp, FlagSet, DAG); + FlagSet |= PPC::MOF_Vector; + return FlagSet; + } } // Mark this as something we don't want to handle here if it is atomic diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h index 2cfd53de3290..c16e146da247 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h @@ -393,7 +393,9 @@ public: MachineInstr &NewMI1, MachineInstr &NewMI2) const override; - void setSpecialOperandAttr(MachineInstr &MI, uint16_t Flags) const override; + // PowerPC specific version of setSpecialOperandAttr that copies Flags to MI + // and clears nuw, nsw, and exact flags. 
+ void setSpecialOperandAttr(MachineInstr &MI, uint16_t Flags) const; bool isCoalescableExtInstr(const MachineInstr &MI, Register &SrcReg, Register &DstReg, diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index d83ecc699b19..2340be5b5915 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -4780,6 +4780,7 @@ class PPCAsmPseudo<string asm, dag iops> def : InstAlias<"sc", (SC 0)>; def : InstAlias<"sync", (SYNC 0)>, Requires<[HasSYNC]>; +def : InstAlias<"hwsync", (SYNC 0), 0>, Requires<[HasSYNC]>; def : InstAlias<"msync", (SYNC 0), 0>, Requires<[HasSYNC]>; def : InstAlias<"lwsync", (SYNC 1)>, Requires<[HasSYNC]>; def : InstAlias<"ptesync", (SYNC 2)>, Requires<[HasSYNC]>; diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td index d92a10c5b208..110f7d79fbc5 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -158,6 +158,11 @@ def HasP9Vector : Predicate<"Subtarget->hasP9Vector()">; def NoP9Altivec : Predicate<"!Subtarget->hasP9Altivec()">; def NoP10Vector: Predicate<"!Subtarget->hasP10Vector()">; +def PPCldsplatAlign16 : PatFrag<(ops node:$ptr), (PPCldsplat node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getAlign() >= Align(16) && + isOffsetMultipleOf(N, 16); +}]>; + //--------------------- VSX-specific instruction formats ---------------------// // By default, all VSX instructions are to be selected over their Altivec // counter parts and they do not have unmodeled sideeffects. @@ -3180,6 +3185,12 @@ defm : ScalToVecWPermute< v2f64, (f64 (load ForceXForm:$src)), (XXPERMDIs (XFLOADf64 ForceXForm:$src), 2), (SUBREG_TO_REG (i64 1), (XFLOADf64 ForceXForm:$src), sub_64)>; + +// Splat loads. 
+def : Pat<(v8i16 (PPCldsplatAlign16 ForceXForm:$A)), + (v8i16 (VSPLTH 7, (LVX ForceXForm:$A)))>; +def : Pat<(v16i8 (PPCldsplatAlign16 ForceXForm:$A)), + (v16i8 (VSPLTB 15, (LVX ForceXForm:$A)))>; } // HasVSX, NoP9Vector, IsLittleEndian let Predicates = [HasVSX, NoP9Vector, IsBigEndian] in { @@ -3187,6 +3198,12 @@ let Predicates = [HasVSX, NoP9Vector, IsBigEndian] in { (LXVD2X ForceXForm:$src)>; def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, ForceXForm:$dst), (STXVD2X $rS, ForceXForm:$dst)>; + + // Splat loads. + def : Pat<(v8i16 (PPCldsplatAlign16 ForceXForm:$A)), + (v8i16 (VSPLTH 0, (LVX ForceXForm:$A)))>; + def : Pat<(v16i8 (PPCldsplatAlign16 ForceXForm:$A)), + (v16i8 (VSPLTB 0, (LVX ForceXForm:$A)))>; } // HasVSX, NoP9Vector, IsBigEndian // Any VSX subtarget that only has loads and stores that load in big endian diff --git a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp index 7f63827afbd6..0c7be96a0595 100644 --- a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp +++ b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp @@ -413,9 +413,9 @@ bool PPCLoopInstrFormPrep::runOnFunction(Function &F) { bool MadeChange = false; - for (auto I = LI->begin(), IE = LI->end(); I != IE; ++I) - for (auto L = df_begin(*I), LE = df_end(*I); L != LE; ++L) - MadeChange |= runOnLoop(*L); + for (Loop *I : *LI) + for (Loop *L : depth_first(I)) + MadeChange |= runOnLoop(L); return MadeChange; } diff --git a/llvm/lib/Target/PowerPC/PPCMacroFusion.def b/llvm/lib/Target/PowerPC/PPCMacroFusion.def index e4954b722fd0..6b8ad22639c8 100644 --- a/llvm/lib/Target/PowerPC/PPCMacroFusion.def +++ b/llvm/lib/Target/PowerPC/PPCMacroFusion.def @@ -153,5 +153,7 @@ FUSION_FEATURE(ZeroMoveLR, hasZeroMoveFusion, -1, FUSION_OP_SET(MTLR8, MTLR, MTSPR8, MTSPR), FUSION_OP_SET(BCLR, BCLRn, gBCLR, BCLRL, BCLRLn, gBCLRL)) +#include "PPCBack2BackFusion.def" + #undef FUSION_FEATURE #undef FUSION_OP_SET diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp 
b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp index 1258a1281597..f11b4e14073e 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp @@ -135,6 +135,7 @@ void PPCSubtarget::initializeEnvironment() { HasCompareFusion = false; HasWideImmFusion = false; HasZeroMoveFusion = false; + HasBack2BackFusion = false; IsISA2_06 = false; IsISA2_07 = false; IsISA3_0 = false; diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h index d52833cb1465..1300b62b623a 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.h +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h @@ -155,6 +155,7 @@ protected: bool HasCompareFusion; bool HasWideImmFusion; bool HasZeroMoveFusion; + bool HasBack2BackFusion; bool IsISA2_06; bool IsISA2_07; bool IsISA3_0; @@ -348,6 +349,7 @@ public: bool hasWideImmFusion() const { return HasWideImmFusion; } bool hasSha3Fusion() const { return HasSha3Fusion; } bool hasZeroMoveFusion() const { return HasZeroMoveFusion; } + bool hasBack2BackFusion() const { return HasBack2BackFusion; } bool needsSwapsForVSXMemOps() const { return hasVSX() && isLittleEndian() && !hasP9Vector(); } diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index 5d6f58a77a39..ed28731b8ef2 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -328,10 +328,6 @@ static bool isMMAType(Type *Ty) { InstructionCost PPCTTIImpl::getUserCost(const User *U, ArrayRef<const Value *> Operands, TTI::TargetCostKind CostKind) { - // Set the max cost if an MMA type is present (v256i1, v512i1). - if (isMMAType(U->getType())) - return InstructionCost::getMax(); - // We already implement getCastInstrCost and getMemoryOpCost where we perform // the vector adjustment there. 
if (isa<CastInst>(U) || isa<LoadInst>(U) || isa<StoreInst>(U)) @@ -1276,23 +1272,21 @@ PPCTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, return BaseT::getIntrinsicInstrCost(ICA, CostKind); } -bool PPCTTIImpl::areFunctionArgsABICompatible( - const Function *Caller, const Function *Callee, - SmallPtrSetImpl<Argument *> &Args) const { +bool PPCTTIImpl::areTypesABICompatible(const Function *Caller, + const Function *Callee, + const ArrayRef<Type *> &Types) const { // We need to ensure that argument promotion does not // attempt to promote pointers to MMA types (__vector_pair // and __vector_quad) since these types explicitly cannot be // passed as arguments. Both of these types are larger than // the 128-bit Altivec vectors and have a scalar size of 1 bit. - if (!BaseT::areFunctionArgsABICompatible(Caller, Callee, Args)) + if (!BaseT::areTypesABICompatible(Caller, Callee, Types)) return false; - return llvm::none_of(Args, [](Argument *A) { - auto *EltTy = cast<PointerType>(A->getType())->getElementType(); - if (EltTy->isSized()) - return (EltTy->isIntOrIntVectorTy(1) && - EltTy->getPrimitiveSizeInBits() > 128); + return llvm::none_of(Types, [](Type *Ty) { + if (Ty->isSized()) + return Ty->isIntOrIntVectorTy(1) && Ty->getPrimitiveSizeInBits() > 128; return false; }); } @@ -1388,3 +1382,86 @@ bool PPCTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, return false; } + +bool PPCTTIImpl::hasActiveVectorLength(unsigned Opcode, Type *DataType, + Align Alignment) const { + // Only load and stores instructions can have variable vector length on Power. + if (Opcode != Instruction::Load && Opcode != Instruction::Store) + return false; + // Loads/stores with length instructions use bits 0-7 of the GPR operand and + // therefore cannot be used in 32-bit mode. 
+ if ((!ST->hasP9Vector() && !ST->hasP10Vector()) || !ST->isPPC64()) + return false; + if (isa<FixedVectorType>(DataType)) { + unsigned VecWidth = DataType->getPrimitiveSizeInBits(); + return VecWidth == 128; + } + Type *ScalarTy = DataType->getScalarType(); + + if (ScalarTy->isPointerTy()) + return true; + + if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) + return true; + + if (!ScalarTy->isIntegerTy()) + return false; + + unsigned IntWidth = ScalarTy->getIntegerBitWidth(); + return IntWidth == 8 || IntWidth == 16 || IntWidth == 32 || IntWidth == 64; +} + +InstructionCost PPCTTIImpl::getVPMemoryOpCost(unsigned Opcode, Type *Src, + Align Alignment, + unsigned AddressSpace, + TTI::TargetCostKind CostKind, + const Instruction *I) { + InstructionCost Cost = BaseT::getVPMemoryOpCost(Opcode, Src, Alignment, + AddressSpace, CostKind, I); + if (TLI->getValueType(DL, Src, true) == MVT::Other) + return Cost; + // TODO: Handle other cost kinds. + if (CostKind != TTI::TCK_RecipThroughput) + return Cost; + + assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && + "Invalid Opcode"); + + auto *SrcVTy = dyn_cast<FixedVectorType>(Src); + assert(SrcVTy && "Expected a vector type for VP memory operations"); + + if (hasActiveVectorLength(Opcode, Src, Alignment)) { + std::pair<InstructionCost, MVT> LT = + TLI->getTypeLegalizationCost(DL, SrcVTy); + + InstructionCost CostFactor = + vectorCostAdjustmentFactor(Opcode, Src, nullptr); + if (!CostFactor.isValid()) + return InstructionCost::getMax(); + + InstructionCost Cost = LT.first * CostFactor; + assert(Cost.isValid() && "Expected valid cost"); + + // On P9 but not on P10, if the op is misaligned then it will cause a + // pipeline flush. Otherwise the VSX masked memops cost the same as unmasked + // ones. 
+ const Align DesiredAlignment(16); + if (Alignment >= DesiredAlignment || ST->getCPUDirective() != PPC::DIR_PWR9) + return Cost; + + // Since alignment may be under estimated, we try to compute the probability + // that the actual address is aligned to the desired boundary. For example + // an 8-byte aligned load is assumed to be actually 16-byte aligned half the + // time, while a 4-byte aligned load has a 25% chance of being 16-byte + // aligned. + float AlignmentProb = ((float)Alignment.value()) / DesiredAlignment.value(); + float MisalignmentProb = 1.0 - AlignmentProb; + return (MisalignmentProb * P9PipelineFlushEstimate) + + (AlignmentProb * *Cost.getValue()); + } + + // Usually we should not get to this point, but the following is an attempt to + // model the cost of legalization. Currently we can only lower intrinsics with + // evl but no mask, on Power 9/10. Otherwise, we must scalarize. + return getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind); +} diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h index 7aeb0c59d503..0af6f2a308d9 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -134,9 +134,19 @@ public: bool UseMaskForCond = false, bool UseMaskForGaps = false); InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind); - bool areFunctionArgsABICompatible(const Function *Caller, - const Function *Callee, - SmallPtrSetImpl<Argument *> &Args) const; + bool areTypesABICompatible(const Function *Caller, const Function *Callee, + const ArrayRef<Type *> &Types) const; + bool hasActiveVectorLength(unsigned Opcode, Type *DataType, + Align Alignment) const; + InstructionCost getVPMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, + unsigned AddressSpace, + TTI::TargetCostKind CostKind, + const Instruction *I = nullptr); + +private: + // The following constant is 
used for estimating costs on power9. + static const InstructionCost::CostType P9PipelineFlushEstimate = 80; + /// @} }; diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index f00813f1301a..75592dd4c6f5 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -169,6 +169,7 @@ class RISCVAsmParser : public MCTargetAsmParser { OperandMatchResultTy parseJALOffset(OperandVector &Operands); OperandMatchResultTy parseVTypeI(OperandVector &Operands); OperandMatchResultTy parseMaskReg(OperandVector &Operands); + OperandMatchResultTy parseInsnDirectiveOpcode(OperandVector &Operands); bool parseOperand(OperandVector &Operands, StringRef Mnemonic); @@ -827,6 +828,7 @@ public: Op->SysReg.Length = Str.size(); Op->SysReg.Encoding = Encoding; Op->StartLoc = S; + Op->EndLoc = S; Op->IsRV64 = IsRV64; return Op; } @@ -836,6 +838,7 @@ public: auto Op = std::make_unique<RISCVOperand>(KindTy::VType); Op->VType.Val = VTypeI; Op->StartLoc = S; + Op->EndLoc = S; Op->IsRV64 = IsRV64; return Op; } @@ -1291,7 +1294,7 @@ OperandMatchResultTy RISCVAsmParser::parseRegister(OperandVector &Operands, if (HadParens) Operands.push_back(RISCVOperand::createToken("(", FirstS, isRV64())); SMLoc S = getLoc(); - SMLoc E = SMLoc::getFromPointer(S.getPointer() - 1); + SMLoc E = SMLoc::getFromPointer(S.getPointer() + Name.size()); getLexer().Lex(); Operands.push_back(RISCVOperand::createReg(RegNo, S, E, isRV64())); } @@ -1305,6 +1308,67 @@ OperandMatchResultTy RISCVAsmParser::parseRegister(OperandVector &Operands, } OperandMatchResultTy +RISCVAsmParser::parseInsnDirectiveOpcode(OperandVector &Operands) { + SMLoc S = getLoc(); + SMLoc E; + const MCExpr *Res; + + switch (getLexer().getKind()) { + default: + return MatchOperand_NoMatch; + case AsmToken::LParen: + case AsmToken::Minus: + case AsmToken::Plus: + case AsmToken::Exclaim: + case AsmToken::Tilde: + case 
AsmToken::Integer: + case AsmToken::String: { + if (getParser().parseExpression(Res, E)) + return MatchOperand_ParseFail; + + auto *CE = dyn_cast<MCConstantExpr>(Res); + if (CE) { + int64_t Imm = CE->getValue(); + if (isUInt<7>(Imm)) { + Operands.push_back(RISCVOperand::createImm(Res, S, E, isRV64())); + return MatchOperand_Success; + } + } + + Twine Msg = "immediate must be an integer in the range"; + Error(S, Msg + " [" + Twine(0) + ", " + Twine((1 << 7) - 1) + "]"); + return MatchOperand_ParseFail; + } + case AsmToken::Identifier: { + StringRef Identifier; + if (getParser().parseIdentifier(Identifier)) + return MatchOperand_ParseFail; + + auto Opcode = RISCVInsnOpcode::lookupRISCVOpcodeByName(Identifier); + if (Opcode) { + Res = MCConstantExpr::create(Opcode->Value, getContext()); + E = SMLoc::getFromPointer(S.getPointer() + Identifier.size()); + Operands.push_back(RISCVOperand::createImm(Res, S, E, isRV64())); + return MatchOperand_Success; + } + + Twine Msg = "operand must be a valid opcode name or an " + "integer in the range"; + Error(S, Msg + " [" + Twine(0) + ", " + Twine((1 << 7) - 1) + "]"); + return MatchOperand_ParseFail; + } + case AsmToken::Percent: { + // Discard operand with modifier. 
+ Twine Msg = "immediate must be an integer in the range"; + Error(S, Msg + " [" + Twine(0) + ", " + Twine((1 << 7) - 1) + "]"); + return MatchOperand_ParseFail; + } + } + + return MatchOperand_NoMatch; +} + +OperandMatchResultTy RISCVAsmParser::parseCSRSystemRegister(OperandVector &Operands) { SMLoc S = getLoc(); const MCExpr *Res; @@ -1381,7 +1445,7 @@ RISCVAsmParser::parseCSRSystemRegister(OperandVector &Operands) { OperandMatchResultTy RISCVAsmParser::parseImmediate(OperandVector &Operands) { SMLoc S = getLoc(); - SMLoc E = SMLoc::getFromPointer(S.getPointer() - 1); + SMLoc E; const MCExpr *Res; switch (getLexer().getKind()) { @@ -1396,7 +1460,7 @@ OperandMatchResultTy RISCVAsmParser::parseImmediate(OperandVector &Operands) { case AsmToken::Integer: case AsmToken::String: case AsmToken::Identifier: - if (getParser().parseExpression(Res)) + if (getParser().parseExpression(Res, E)) return MatchOperand_ParseFail; break; case AsmToken::Percent: @@ -1410,7 +1474,7 @@ OperandMatchResultTy RISCVAsmParser::parseImmediate(OperandVector &Operands) { OperandMatchResultTy RISCVAsmParser::parseOperandWithModifier(OperandVector &Operands) { SMLoc S = getLoc(); - SMLoc E = SMLoc::getFromPointer(S.getPointer() - 1); + SMLoc E; if (getLexer().getKind() != AsmToken::Percent) { Error(getLoc(), "expected '%' for operand modifier"); @@ -1449,7 +1513,6 @@ RISCVAsmParser::parseOperandWithModifier(OperandVector &Operands) { OperandMatchResultTy RISCVAsmParser::parseBareSymbol(OperandVector &Operands) { SMLoc S = getLoc(); - SMLoc E = SMLoc::getFromPointer(S.getPointer() - 1); const MCExpr *Res; if (getLexer().getKind() != AsmToken::Identifier) @@ -1461,6 +1524,8 @@ OperandMatchResultTy RISCVAsmParser::parseBareSymbol(OperandVector &Operands) { if (getParser().parseIdentifier(Identifier)) return MatchOperand_ParseFail; + SMLoc E = SMLoc::getFromPointer(S.getPointer() + Identifier.size()); + if (Identifier.consume_back("@plt")) { Error(getLoc(), "'@plt' operand not valid for 
instruction"); return MatchOperand_ParseFail; @@ -1492,7 +1557,7 @@ OperandMatchResultTy RISCVAsmParser::parseBareSymbol(OperandVector &Operands) { } const MCExpr *Expr; - if (getParser().parseExpression(Expr)) + if (getParser().parseExpression(Expr, E)) return MatchOperand_ParseFail; Res = MCBinaryExpr::create(Opcode, Res, Expr, getContext()); Operands.push_back(RISCVOperand::createImm(Res, S, E, isRV64())); @@ -1501,7 +1566,6 @@ OperandMatchResultTy RISCVAsmParser::parseBareSymbol(OperandVector &Operands) { OperandMatchResultTy RISCVAsmParser::parseCallSymbol(OperandVector &Operands) { SMLoc S = getLoc(); - SMLoc E = SMLoc::getFromPointer(S.getPointer() - 1); const MCExpr *Res; if (getLexer().getKind() != AsmToken::Identifier) @@ -1515,6 +1579,8 @@ OperandMatchResultTy RISCVAsmParser::parseCallSymbol(OperandVector &Operands) { if (getParser().parseIdentifier(Identifier)) return MatchOperand_ParseFail; + SMLoc E = SMLoc::getFromPointer(S.getPointer() + Identifier.size()); + RISCVMCExpr::VariantKind Kind = RISCVMCExpr::VK_RISCV_CALL; if (Identifier.consume_back("@plt")) Kind = RISCVMCExpr::VK_RISCV_CALL_PLT; @@ -1529,10 +1595,10 @@ OperandMatchResultTy RISCVAsmParser::parseCallSymbol(OperandVector &Operands) { OperandMatchResultTy RISCVAsmParser::parsePseudoJumpSymbol(OperandVector &Operands) { SMLoc S = getLoc(); - SMLoc E = SMLoc::getFromPointer(S.getPointer() - 1); + SMLoc E; const MCExpr *Res; - if (getParser().parseExpression(Res)) + if (getParser().parseExpression(Res, E)) return MatchOperand_ParseFail; if (Res->getKind() != MCExpr::ExprKind::SymbolRef || @@ -1662,7 +1728,7 @@ OperandMatchResultTy RISCVAsmParser::parseMaskReg(OperandVector &Operands) { if (RegNo != RISCV::V0) return MatchOperand_NoMatch; SMLoc S = getLoc(); - SMLoc E = SMLoc::getFromPointer(S.getPointer() - 1); + SMLoc E = SMLoc::getFromPointer(S.getPointer() + Name.size()); getLexer().Lex(); Operands.push_back(RISCVOperand::createReg(RegNo, S, E, isRV64())); } @@ -2062,7 +2128,11 @@ bool 
RISCVAsmParser::parseDirectiveAttribute() { "unexpected token in '.attribute' directive")) return true; - if (Tag == RISCVAttrs::ARCH) { + if (IsIntegerValue) + getTargetStreamer().emitAttribute(Tag, IntegerValue); + else if (Tag != RISCVAttrs::ARCH) + getTargetStreamer().emitTextAttribute(Tag, StringValue); + else { StringRef Arch = StringValue; for (auto Feature : RISCVFeatureKV) if (llvm::RISCVISAInfo::isSupportedExtensionFeature(Feature.Key)) @@ -2070,7 +2140,7 @@ bool RISCVAsmParser::parseDirectiveAttribute() { auto ParseResult = llvm::RISCVISAInfo::parseArchString( StringValue, /*EnableExperimentalExtension=*/true, - /*ExperimentalExtensionVersionCheck=*/false); + /*ExperimentalExtensionVersionCheck=*/true); if (!ParseResult) { std::string Buffer; raw_string_ostream OutputErrMsg(Buffer); @@ -2093,35 +2163,9 @@ bool RISCVAsmParser::parseDirectiveAttribute() { setFeatureBits(RISCV::Feature64Bit, "64bit"); else return Error(ValueExprLoc, "bad arch string " + Arch); - } - - if (IsIntegerValue) - getTargetStreamer().emitAttribute(Tag, IntegerValue); - else { - if (Tag != RISCVAttrs::ARCH) { - getTargetStreamer().emitTextAttribute(Tag, StringValue); - } else { - std::vector<std::string> FeatureVector; - RISCVFeatures::toFeatureVector(FeatureVector, getSTI().getFeatureBits()); - // Parse that by RISCVISAInfo-> - unsigned XLen = getFeatureBits(RISCV::Feature64Bit) ? 64 : 32; - auto ParseResult = llvm::RISCVISAInfo::parseFeatures(XLen, FeatureVector); - if (!ParseResult) { - std::string Buffer; - raw_string_ostream OutputErrMsg(Buffer); - handleAllErrors(ParseResult.takeError(), - [&](llvm::StringError &ErrMsg) { - OutputErrMsg << ErrMsg.getMessage(); - }); - - return Error(ValueExprLoc, OutputErrMsg.str()); - } - auto &ISAInfo = *ParseResult; - - // Then emit the arch string. - getTargetStreamer().emitTextAttribute(Tag, ISAInfo->toString()); - } + // Then emit the arch string. 
+ getTargetStreamer().emitTextAttribute(Tag, ISAInfo->toString()); } return false; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp index 0aba18b20f0d..144e761f002d 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp @@ -27,6 +27,11 @@ namespace RISCVSysReg { #include "RISCVGenSearchableTables.inc" } // namespace RISCVSysReg +namespace RISCVInsnOpcode { +#define GET_RISCVOpcodesList_IMPL +#include "RISCVGenSearchableTables.inc" +} // namespace RISCVInsnOpcode + namespace RISCVABI { ABI computeTargetABI(const Triple &TT, FeatureBitset FeatureBits, StringRef ABIName) { diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index d8f4403c824f..9cfd36745f46 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -299,6 +299,16 @@ struct SysReg { #include "RISCVGenSearchableTables.inc" } // end namespace RISCVSysReg +namespace RISCVInsnOpcode { +struct RISCVOpcode { + const char *Name; + unsigned Value; +}; + +#define GET_RISCVOpcodesList_DECL +#include "RISCVGenSearchableTables.inc" +} // end namespace RISCVInsnOpcode + namespace RISCVABI { enum ABI { diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp index f1c3810f4ee5..89a7d54f60f8 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp @@ -171,9 +171,9 @@ void RISCVInstPrinter::printVTypeI(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { unsigned Imm = MI->getOperand(OpNo).getImm(); // Print the raw immediate for reserved values: vlmul[2:0]=4, vsew[2:0]=0b1xx, - // or non-zero bits 8/9/10. + // or non-zero in bits 8 and above. 
if (RISCVVType::getVLMUL(Imm) == RISCVII::VLMUL::LMUL_RESERVED || - RISCVVType::getSEW(Imm) > 64 || (Imm & 0x700) != 0) { + RISCVVType::getSEW(Imm) > 64 || (Imm >> 8) != 0) { O << Imm; return; } diff --git a/llvm/lib/Target/RISCV/RISCV.td b/llvm/lib/Target/RISCV/RISCV.td index 772a4f8ecd53..6aa915c01929 100644 --- a/llvm/lib/Target/RISCV/RISCV.td +++ b/llvm/lib/Target/RISCV/RISCV.td @@ -168,14 +168,6 @@ def HasStdExtZvlsseg : Predicate<"Subtarget->hasStdExtZvlsseg()">, AssemblerPredicate<(all_of FeatureStdExtZvlsseg), "'Zvlsseg' (Vector segment load/store instructions)">; -def FeatureStdExtZvamo - : SubtargetFeature<"experimental-zvamo", "HasStdExtZvamo", "true", - "'Zvamo' (Vector AMO Operations)", - [FeatureStdExtV]>; -def HasStdExtZvamo : Predicate<"Subtarget->hasStdExtZvamo()">, - AssemblerPredicate<(all_of FeatureStdExtZvamo), - "'Zvamo' (Vector AMO Operations)">; - def Feature64Bit : SubtargetFeature<"64bit", "HasRV64", "true", "Implements RV64">; def IsRV64 : Predicate<"Subtarget->is64Bit()">, diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 66a34d73dd37..b24eb5f7bbf4 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -718,6 +718,71 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { break; } + case ISD::MUL: { + // Special case for calculating (mul (and X, C2), C1) where the full product + // fits in XLen bits. We can shift X left by the number of leading zeros in + // C2 and shift C1 left by XLen-lzcnt(C2). This will ensure the final + // product has XLen trailing zeros, putting it in the output of MULHU. This + // can avoid materializing a constant in a register for C2. + + // RHS should be a constant. + auto *N1C = dyn_cast<ConstantSDNode>(Node->getOperand(1)); + if (!N1C || !N1C->hasOneUse()) + break; + + // LHS should be an AND with constant. 
+ SDValue N0 = Node->getOperand(0); + if (N0.getOpcode() != ISD::AND || !isa<ConstantSDNode>(N0.getOperand(1))) + break; + + uint64_t C2 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue(); + + // Constant should be a mask. + if (!isMask_64(C2)) + break; + + // This should be the only use of the AND unless we will use + // (SRLI (SLLI X, 32), 32). We don't use a shift pair for other AND + // constants. + if (!N0.hasOneUse() && C2 != UINT64_C(0xFFFFFFFF)) + break; + + // If this can be an ANDI, ZEXT.H or ZEXT.W we don't need to do this + // optimization. + if (isInt<12>(C2) || + (C2 == UINT64_C(0xFFFF) && + (Subtarget->hasStdExtZbb() || Subtarget->hasStdExtZbp())) || + (C2 == UINT64_C(0xFFFFFFFF) && Subtarget->hasStdExtZba())) + break; + + // We need to shift left the AND input and C1 by a total of XLen bits. + + // How far left do we need to shift the AND input? + unsigned XLen = Subtarget->getXLen(); + unsigned LeadingZeros = XLen - (64 - countLeadingZeros(C2)); + + // The constant gets shifted by the remaining amount unless that would + // shift bits out. + uint64_t C1 = N1C->getZExtValue(); + unsigned ConstantShift = XLen - LeadingZeros; + if (ConstantShift > (XLen - (64 - countLeadingZeros(C1)))) + break; + + uint64_t ShiftedC1 = C1 << ConstantShift; + // If this RV32, we need to sign extend the constant. + if (XLen == 32) + ShiftedC1 = SignExtend64(ShiftedC1, 32); + + // Create (mulhu (slli X, lzcnt(C2)), C1 << (XLen - lzcnt(C2))). 
+ SDNode *Imm = selectImm(CurDAG, DL, ShiftedC1, *Subtarget); + SDNode *SLLI = + CurDAG->getMachineNode(RISCV::SLLI, DL, VT, N0.getOperand(0), + CurDAG->getTargetConstant(LeadingZeros, DL, VT)); + SDNode *MULHU = CurDAG->getMachineNode(RISCV::MULHU, DL, VT, + SDValue(SLLI, 0), SDValue(Imm, 0)); + ReplaceNode(Node, MULHU); + return; + } case ISD::INTRINSIC_WO_CHAIN: { unsigned IntNo = Node->getConstantOperandVal(0); switch (IntNo) { @@ -1450,6 +1515,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { ReplaceNode(Node, Extract.getNode()); return; } + case ISD::SPLAT_VECTOR: case RISCVISD::VMV_V_X_VL: case RISCVISD::VFMV_V_F_VL: { // Try to match splat of a scalar load to a strided load with stride of x0. @@ -1466,7 +1532,10 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { break; SDValue VL; - selectVLOp(Node->getOperand(1), VL); + if (Node->getOpcode() == ISD::SPLAT_VECTOR) + VL = CurDAG->getTargetConstant(RISCV::VLMaxSentinel, DL, XLenVT); + else + selectVLOp(Node->getOperand(1), VL); unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits()); SDValue SEW = CurDAG->getTargetConstant(Log2SEW, DL, XLenVT); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index f3331571fc55..4f5512e6fb37 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -330,6 +330,14 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::LLRINT, MVT::f16, Legal); setOperationAction(ISD::LROUND, MVT::f16, Legal); setOperationAction(ISD::LLROUND, MVT::f16, Legal); + setOperationAction(ISD::STRICT_FADD, MVT::f16, Legal); + setOperationAction(ISD::STRICT_FMA, MVT::f16, Legal); + setOperationAction(ISD::STRICT_FSUB, MVT::f16, Legal); + setOperationAction(ISD::STRICT_FMUL, MVT::f16, Legal); + setOperationAction(ISD::STRICT_FDIV, MVT::f16, Legal); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Legal); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, 
Legal); + setOperationAction(ISD::STRICT_FSQRT, MVT::f16, Legal); for (auto CC : FPCCToExpand) setCondCodeAction(CC, MVT::f16, Expand); setOperationAction(ISD::SELECT_CC, MVT::f16, Expand); @@ -367,6 +375,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::LLRINT, MVT::f32, Legal); setOperationAction(ISD::LROUND, MVT::f32, Legal); setOperationAction(ISD::LLROUND, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FMA, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal); for (auto CC : FPCCToExpand) setCondCodeAction(CC, MVT::f32, Expand); setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); @@ -388,6 +402,14 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::LLRINT, MVT::f64, Legal); setOperationAction(ISD::LROUND, MVT::f64, Legal); setOperationAction(ISD::LLROUND, MVT::f64, Legal); + setOperationAction(ISD::STRICT_FMA, MVT::f64, Legal); + setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal); + setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal); + setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal); + setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal); + setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal); for (auto CC : FPCCToExpand) setCondCodeAction(CC, MVT::f64, Expand); setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); @@ -412,6 +434,11 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FP_TO_UINT_SAT, XLenVT, Custom); setOperationAction(ISD::FP_TO_SINT_SAT, XLenVT, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, XLenVT, Legal); + 
setOperationAction(ISD::STRICT_FP_TO_SINT, XLenVT, Legal); + setOperationAction(ISD::STRICT_UINT_TO_FP, XLenVT, Legal); + setOperationAction(ISD::STRICT_SINT_TO_FP, XLenVT, Legal); + setOperationAction(ISD::FLT_ROUNDS_, XLenVT, Custom); setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom); } @@ -471,12 +498,13 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::VP_XOR, ISD::VP_ASHR, ISD::VP_LSHR, ISD::VP_SHL, ISD::VP_REDUCE_ADD, ISD::VP_REDUCE_AND, ISD::VP_REDUCE_OR, ISD::VP_REDUCE_XOR, ISD::VP_REDUCE_SMAX, - ISD::VP_REDUCE_SMIN, ISD::VP_REDUCE_UMAX, ISD::VP_REDUCE_UMIN}; + ISD::VP_REDUCE_SMIN, ISD::VP_REDUCE_UMAX, ISD::VP_REDUCE_UMIN, + ISD::VP_SELECT}; static const unsigned FloatingPointVPOps[] = { ISD::VP_FADD, ISD::VP_FSUB, ISD::VP_FMUL, ISD::VP_FDIV, ISD::VP_REDUCE_FADD, ISD::VP_REDUCE_SEQ_FADD, - ISD::VP_REDUCE_FMIN, ISD::VP_REDUCE_FMAX}; + ISD::VP_REDUCE_FMIN, ISD::VP_REDUCE_FMAX, ISD::VP_SELECT}; if (!Subtarget.is64Bit()) { // We must custom-lower certain vXi64 operations on RV32 due to the vector @@ -519,6 +547,10 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT_CC, VT, Expand); setOperationAction(ISD::VSELECT, VT, Expand); + setOperationAction(ISD::VP_AND, VT, Custom); + setOperationAction(ISD::VP_OR, VT, Custom); + setOperationAction(ISD::VP_XOR, VT, Custom); + setOperationAction(ISD::VECREDUCE_AND, VT, Custom); setOperationAction(ISD::VECREDUCE_OR, VT, Custom); setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); @@ -803,6 +835,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, // Operations below are different for between masks and other vectors. 
if (VT.getVectorElementType() == MVT::i1) { + setOperationAction(ISD::VP_AND, VT, Custom); + setOperationAction(ISD::VP_OR, VT, Custom); + setOperationAction(ISD::VP_XOR, VT, Custom); setOperationAction(ISD::AND, VT, Custom); setOperationAction(ISD::OR, VT, Custom); setOperationAction(ISD::XOR, VT, Custom); @@ -1147,7 +1182,7 @@ bool RISCVTargetLowering::isCheapToSpeculateCtlz() const { return Subtarget.hasStdExtZbb(); } -bool RISCVTargetLowering::hasAndNot(SDValue Y) const { +bool RISCVTargetLowering::hasAndNotCompare(SDValue Y) const { EVT VT = Y.getValueType(); // FIXME: Support vectors once we have tests. @@ -1235,7 +1270,8 @@ bool RISCVTargetLowering::shouldSinkOperands( bool RISCVTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const { - if (VT == MVT::f16 && !Subtarget.hasStdExtZfhmin()) + // FIXME: Change to Zfhmin once f16 becomes a legal type with Zfhmin. + if (VT == MVT::f16 && !Subtarget.hasStdExtZfh()) return false; if (VT == MVT::f32 && !Subtarget.hasStdExtF()) return false; @@ -1255,9 +1291,10 @@ bool RISCVTargetLowering::hasBitPreservingFPLogic(EVT VT) const { MVT RISCVTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { - // Use f32 to pass f16 if it is legal and Zfhmin/Zfh is not enabled. + // Use f32 to pass f16 if it is legal and Zfh is not enabled. // We might still end up using a GPR but that will be decided based on ABI. - if (VT == MVT::f16 && Subtarget.hasStdExtF() && !Subtarget.hasStdExtZfhmin()) + // FIXME: Change to Zfhmin once f16 becomes a legal type with Zfhmin. 
+ if (VT == MVT::f16 && Subtarget.hasStdExtF() && !Subtarget.hasStdExtZfh()) return MVT::f32; return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); @@ -1266,9 +1303,10 @@ MVT RISCVTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, unsigned RISCVTargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { - // Use f32 to pass f16 if it is legal and Zfhmin/Zfh is not enabled. + // Use f32 to pass f16 if it is legal and Zfh is not enabled. // We might still end up using a GPR but that will be decided based on ABI. - if (VT == MVT::f16 && Subtarget.hasStdExtF() && !Subtarget.hasStdExtZfhmin()) + // FIXME: Change to Zfhmin once f16 becomes a legal type with Zfhmin. + if (VT == MVT::f16 && Subtarget.hasStdExtF() && !Subtarget.hasStdExtZfh()) return 1; return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); @@ -1959,29 +1997,37 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, int64_t StepNumerator = SimpleVID->StepNumerator; unsigned StepDenominator = SimpleVID->StepDenominator; int64_t Addend = SimpleVID->Addend; + + assert(StepNumerator != 0 && "Invalid step"); + bool Negate = false; + int64_t SplatStepVal = StepNumerator; + unsigned StepOpcode = ISD::MUL; + if (StepNumerator != 1) { + if (isPowerOf2_64(std::abs(StepNumerator))) { + Negate = StepNumerator < 0; + StepOpcode = ISD::SHL; + SplatStepVal = Log2_64(std::abs(StepNumerator)); + } + } + // Only emit VIDs with suitably-small steps/addends. We use imm5 is a // threshold since it's the immediate value many RVV instructions accept. - if (isInt<5>(StepNumerator) && isPowerOf2_32(StepDenominator) && - isInt<5>(Addend)) { + // There is no vmul.vi instruction so ensure multiply constant can fit in + // a single addi instruction. 
+ if (((StepOpcode == ISD::MUL && isInt<12>(SplatStepVal)) || + (StepOpcode == ISD::SHL && isUInt<5>(SplatStepVal))) && + isPowerOf2_32(StepDenominator) && isInt<5>(Addend)) { SDValue VID = DAG.getNode(RISCVISD::VID_VL, DL, ContainerVT, Mask, VL); // Convert right out of the scalable type so we can use standard ISD // nodes for the rest of the computation. If we used scalable types with // these, we'd lose the fixed-length vector info and generate worse // vsetvli code. VID = convertFromScalableVector(VT, VID, DAG, Subtarget); - assert(StepNumerator != 0 && "Invalid step"); - bool Negate = false; - if (StepNumerator != 1) { - int64_t SplatStepVal = StepNumerator; - unsigned Opcode = ISD::MUL; - if (isPowerOf2_64(std::abs(StepNumerator))) { - Negate = StepNumerator < 0; - Opcode = ISD::SHL; - SplatStepVal = Log2_64(std::abs(StepNumerator)); - } + if ((StepOpcode == ISD::MUL && SplatStepVal != 1) || + (StepOpcode == ISD::SHL && SplatStepVal != 0)) { SDValue SplatStep = DAG.getSplatVector( VT, DL, DAG.getConstant(SplatStepVal, DL, XLenVT)); - VID = DAG.getNode(Opcode, DL, VT, VID, SplatStep); + VID = DAG.getNode(StepOpcode, DL, VT, VID, SplatStep); } if (StepDenominator != 1) { SDValue SplatStep = DAG.getSplatVector( @@ -3133,6 +3179,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, return lowerGET_ROUNDING(Op, DAG); case ISD::SET_ROUNDING: return lowerSET_ROUNDING(Op, DAG); + case ISD::VP_SELECT: + return lowerVPOp(Op, DAG, RISCVISD::VSELECT_VL); case ISD::VP_ADD: return lowerVPOp(Op, DAG, RISCVISD::ADD_VL); case ISD::VP_SUB: @@ -3148,11 +3196,11 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, case ISD::VP_UREM: return lowerVPOp(Op, DAG, RISCVISD::UREM_VL); case ISD::VP_AND: - return lowerVPOp(Op, DAG, RISCVISD::AND_VL); + return lowerLogicVPOp(Op, DAG, RISCVISD::VMAND_VL, RISCVISD::AND_VL); case ISD::VP_OR: - return lowerVPOp(Op, DAG, RISCVISD::OR_VL); + return lowerLogicVPOp(Op, DAG, RISCVISD::VMOR_VL, RISCVISD::OR_VL); case ISD::VP_XOR: - 
return lowerVPOp(Op, DAG, RISCVISD::XOR_VL); + return lowerLogicVPOp(Op, DAG, RISCVISD::VMXOR_VL, RISCVISD::XOR_VL); case ISD::VP_ASHR: return lowerVPOp(Op, DAG, RISCVISD::SRA_VL); case ISD::VP_LSHR: @@ -4469,19 +4517,19 @@ SDValue RISCVTargetLowering::lowerVECREDUCE(SDValue Op, } MVT M1VT = getLMUL1VT(ContainerVT); + MVT XLenVT = Subtarget.getXLenVT(); SDValue Mask, VL; std::tie(Mask, VL) = getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget); - // FIXME: This is a VLMAX splat which might be too large and can prevent - // vsetvli removal. SDValue NeutralElem = DAG.getNeutralElement(BaseOpc, DL, VecEltVT, SDNodeFlags()); - SDValue IdentitySplat = DAG.getSplatVector(M1VT, DL, NeutralElem); + SDValue IdentitySplat = lowerScalarSplat( + NeutralElem, DAG.getConstant(1, DL, XLenVT), M1VT, DL, DAG, Subtarget); SDValue Reduction = DAG.getNode(RVVOpcode, DL, M1VT, DAG.getUNDEF(M1VT), Vec, IdentitySplat, Mask, VL); SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VecEltVT, Reduction, - DAG.getConstant(0, DL, Subtarget.getXLenVT())); + DAG.getConstant(0, DL, XLenVT)); return DAG.getSExtOrTrunc(Elt0, DL, Op.getValueType()); } @@ -4497,9 +4545,12 @@ getRVVFPReductionOpAndOperands(SDValue Op, SelectionDAG &DAG, EVT EltVT) { switch (Opcode) { default: llvm_unreachable("Unhandled reduction"); - case ISD::VECREDUCE_FADD: - return std::make_tuple(RISCVISD::VECREDUCE_FADD_VL, Op.getOperand(0), - DAG.getNeutralElement(BaseOpcode, DL, EltVT, Flags)); + case ISD::VECREDUCE_FADD: { + // Use positive zero if we can. It is cheaper to materialize. + SDValue Zero = + DAG.getConstantFP(Flags.hasNoSignedZeros() ? 
0.0 : -0.0, DL, EltVT); + return std::make_tuple(RISCVISD::VECREDUCE_FADD_VL, Op.getOperand(0), Zero); + } case ISD::VECREDUCE_SEQ_FADD: return std::make_tuple(RISCVISD::VECREDUCE_SEQ_FADD_VL, Op.getOperand(1), Op.getOperand(0)); @@ -4530,17 +4581,17 @@ SDValue RISCVTargetLowering::lowerFPVECREDUCE(SDValue Op, } MVT M1VT = getLMUL1VT(VectorVal.getSimpleValueType()); + MVT XLenVT = Subtarget.getXLenVT(); SDValue Mask, VL; std::tie(Mask, VL) = getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget); - // FIXME: This is a VLMAX splat which might be too large and can prevent - // vsetvli removal. - SDValue ScalarSplat = DAG.getSplatVector(M1VT, DL, ScalarVal); + SDValue ScalarSplat = lowerScalarSplat( + ScalarVal, DAG.getConstant(1, DL, XLenVT), M1VT, DL, DAG, Subtarget); SDValue Reduction = DAG.getNode(RVVOpcode, DL, M1VT, DAG.getUNDEF(M1VT), VectorVal, ScalarSplat, Mask, VL); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VecEltVT, Reduction, - DAG.getConstant(0, DL, Subtarget.getXLenVT())); + DAG.getConstant(0, DL, XLenVT)); } static unsigned getRVVVPReductionOp(unsigned ISDOpcode) { @@ -4602,13 +4653,13 @@ SDValue RISCVTargetLowering::lowerVPREDUCE(SDValue Op, MVT XLenVT = Subtarget.getXLenVT(); MVT ResVT = !VecVT.isInteger() || VecEltVT.bitsGE(XLenVT) ? VecEltVT : XLenVT; - // FIXME: This is a VLMAX splat which might be too large and can prevent - // vsetvli removal. 
- SDValue StartSplat = DAG.getSplatVector(M1VT, DL, Op.getOperand(0)); + SDValue StartSplat = + lowerScalarSplat(Op.getOperand(0), DAG.getConstant(1, DL, XLenVT), M1VT, + DL, DAG, Subtarget); SDValue Reduction = DAG.getNode(RVVOpcode, DL, M1VT, StartSplat, Vec, StartSplat, Mask, VL); SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Reduction, - DAG.getConstant(0, DL, Subtarget.getXLenVT())); + DAG.getConstant(0, DL, XLenVT)); if (!VecVT.isInteger()) return Elt0; return DAG.getSExtOrTrunc(Elt0, DL, Op.getValueType()); @@ -5365,6 +5416,33 @@ SDValue RISCVTargetLowering::lowerVPOp(SDValue Op, SelectionDAG &DAG, return convertFromScalableVector(VT, VPOp, DAG, Subtarget); } +SDValue RISCVTargetLowering::lowerLogicVPOp(SDValue Op, SelectionDAG &DAG, + unsigned MaskOpc, + unsigned VecOpc) const { + MVT VT = Op.getSimpleValueType(); + if (VT.getVectorElementType() != MVT::i1) + return lowerVPOp(Op, DAG, VecOpc); + + // It is safe to drop mask parameter as masked-off elements are undef. + SDValue Op1 = Op->getOperand(0); + SDValue Op2 = Op->getOperand(1); + SDValue VL = Op->getOperand(3); + + MVT ContainerVT = VT; + const bool IsFixed = VT.isFixedLengthVector(); + if (IsFixed) { + ContainerVT = getContainerForFixedLengthVector(VT); + Op1 = convertToScalableVector(ContainerVT, Op1, DAG, Subtarget); + Op2 = convertToScalableVector(ContainerVT, Op2, DAG, Subtarget); + } + + SDLoc DL(Op); + SDValue Val = DAG.getNode(MaskOpc, DL, ContainerVT, Op1, Op2, VL); + if (!IsFixed) + return Val; + return convertFromScalableVector(VT, Val, DAG, Subtarget); +} + // Custom lower MGATHER/VP_GATHER to a legalized form for RVV. It will then be // matched to a RVV indexed load. The RVV indexed load instructions only // support the "unsigned unscaled" addressing mode; indices are implicitly @@ -5695,11 +5773,17 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, SDValue Op0 = IsStrict ? 
N->getOperand(1) : N->getOperand(0); if (getTypeAction(*DAG.getContext(), Op0.getValueType()) != TargetLowering::TypeSoftenFloat) { - // FIXME: Support strict FP. - if (IsStrict) - return; if (!isTypeLegal(Op0.getValueType())) return; + if (IsStrict) { + unsigned Opc = IsSigned ? RISCVISD::STRICT_FCVT_W_RTZ_RV64 + : RISCVISD::STRICT_FCVT_WU_RTZ_RV64; + SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other); + SDValue Res = DAG.getNode(Opc, DL, VTs, N->getOperand(0), Op0); + Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res)); + Results.push_back(Res.getValue(1)); + return; + } unsigned Opc = IsSigned ? RISCVISD::FCVT_W_RTZ_RV64 : RISCVISD::FCVT_WU_RTZ_RV64; SDValue Res = DAG.getNode(Opc, DL, MVT::i64, Op0); @@ -7026,7 +7110,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, if (SimplifyDemandedLowBitsHelper(1, Log2_32(BitWidth))) return SDValue(N, 0); - return combineGREVI_GORCI(N, DCI.DAG); + return combineGREVI_GORCI(N, DAG); } case RISCVISD::GREVW: case RISCVISD::GORCW: { @@ -7035,7 +7119,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, SimplifyDemandedLowBitsHelper(1, 5)) return SDValue(N, 0); - return combineGREVI_GORCI(N, DCI.DAG); + return combineGREVI_GORCI(N, DAG); } case RISCVISD::SHFL: case RISCVISD::UNSHFL: { @@ -7120,11 +7204,23 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, // Fold (zero_extend (fp_to_uint X)) to prevent forming fcvt+zexti32 during // type legalization. This is safe because fp_to_uint produces poison if // it overflows. 
- if (N->getValueType(0) == MVT::i64 && Subtarget.is64Bit() && - N->getOperand(0).getOpcode() == ISD::FP_TO_UINT && - isTypeLegal(N->getOperand(0).getOperand(0).getValueType())) - return DAG.getNode(ISD::FP_TO_UINT, SDLoc(N), MVT::i64, - N->getOperand(0).getOperand(0)); + if (N->getValueType(0) == MVT::i64 && Subtarget.is64Bit()) { + SDValue Src = N->getOperand(0); + if (Src.getOpcode() == ISD::FP_TO_UINT && + isTypeLegal(Src.getOperand(0).getValueType())) + return DAG.getNode(ISD::FP_TO_UINT, SDLoc(N), MVT::i64, + Src.getOperand(0)); + if (Src.getOpcode() == ISD::STRICT_FP_TO_UINT && Src.hasOneUse() && + isTypeLegal(Src.getOperand(1).getValueType())) { + SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other); + SDValue Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, SDLoc(N), VTs, + Src.getOperand(0), Src.getOperand(1)); + DCI.CombineTo(N, Res); + DAG.ReplaceAllUsesOfValueWith(Src.getValue(1), Res.getValue(1)); + DCI.recursivelyDeleteUnusedNodes(Src.getNode()); + return SDValue(N, 0); // Return N so it doesn't get rechecked. + } + } return SDValue(); case RISCVISD::SELECT_CC: { // Transform @@ -7685,6 +7781,8 @@ unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode( case RISCVISD::BDECOMPRESSW: case RISCVISD::FCVT_W_RTZ_RV64: case RISCVISD::FCVT_WU_RTZ_RV64: + case RISCVISD::STRICT_FCVT_W_RTZ_RV64: + case RISCVISD::STRICT_FCVT_WU_RTZ_RV64: // TODO: As the result is sign-extended, this is conservatively correct. A // more precise answer could be calculated for SRAW depending on known // bits in the shift amount. @@ -8004,6 +8102,22 @@ RISCVTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, } } +void RISCVTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, + SDNode *Node) const { + // Add FRM dependency to any instructions with dynamic rounding mode. 
+ unsigned Opc = MI.getOpcode(); + auto Idx = RISCV::getNamedOperandIdx(Opc, RISCV::OpName::frm); + if (Idx < 0) + return; + if (MI.getOperand(Idx).getImm() != RISCVFPRndMode::DYN) + return; + // If the instruction already reads FRM, don't add another read. + if (MI.readsRegister(RISCV::FRM)) + return; + MI.addOperand( + MachineOperand::CreateReg(RISCV::FRM, /*isDef*/ false, /*isImp*/ true)); +} + // Calling Convention Implementation. // The expectations for frontend ABI lowering vary from target to target. // Ideally, an LLVM frontend would be able to avoid worrying about many ABI @@ -9400,6 +9514,8 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(FCVT_XU_RTZ) NODE_NAME_CASE(FCVT_W_RTZ_RV64) NODE_NAME_CASE(FCVT_WU_RTZ_RV64) + NODE_NAME_CASE(STRICT_FCVT_W_RTZ_RV64) + NODE_NAME_CASE(STRICT_FCVT_WU_RTZ_RV64) NODE_NAME_CASE(READ_CYCLE_WIDE) NODE_NAME_CASE(GREV) NODE_NAME_CASE(GREVW) @@ -9541,6 +9657,9 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, if (Constraint.size() == 1) { switch (Constraint[0]) { case 'r': + // TODO: Support fixed vectors up to XLen for P extension? 
+ if (VT.isVector()) + break; return std::make_pair(0U, &RISCV::GPRRegClass); case 'f': if (Subtarget.hasStdExtZfh() && VT == MVT::f16) @@ -9553,17 +9672,15 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, default: break; } - } else { - if (Constraint == "vr") { - for (const auto *RC : {&RISCV::VRRegClass, &RISCV::VRM2RegClass, - &RISCV::VRM4RegClass, &RISCV::VRM8RegClass}) { - if (TRI->isTypeLegalForClass(*RC, VT.SimpleTy)) - return std::make_pair(0U, RC); - } - } else if (Constraint == "vm") { - if (TRI->isTypeLegalForClass(RISCV::VMRegClass, VT.SimpleTy)) - return std::make_pair(0U, &RISCV::VMRegClass); + } else if (Constraint == "vr") { + for (const auto *RC : {&RISCV::VRRegClass, &RISCV::VRM2RegClass, + &RISCV::VRM4RegClass, &RISCV::VRM8RegClass}) { + if (TRI->isTypeLegalForClass(*RC, VT.SimpleTy)) + return std::make_pair(0U, RC); } + } else if (Constraint == "vm") { + if (TRI->isTypeLegalForClass(RISCV::VMV0RegClass, VT.SimpleTy)) + return std::make_pair(0U, &RISCV::VMV0RegClass); } // Clang will correctly decode the usage of register name aliases into their @@ -10101,17 +10218,29 @@ bool RISCVTargetLowering::splitValueIntoRegisterParts( unsigned ValueVTBitSize = ValueVT.getSizeInBits().getKnownMinSize(); unsigned PartVTBitSize = PartVT.getSizeInBits().getKnownMinSize(); if (PartVTBitSize % ValueVTBitSize == 0) { + assert(PartVTBitSize >= ValueVTBitSize); // If the element types are different, bitcast to the same element type of // PartVT first. + // Give an example here, we want copy a <vscale x 1 x i8> value to + // <vscale x 4 x i16>. + // We need to convert <vscale x 1 x i8> to <vscale x 8 x i8> by insert + // subvector, then we can bitcast to <vscale x 4 x i16>. 
if (ValueEltVT != PartEltVT) { - unsigned Count = ValueVTBitSize / PartEltVT.getSizeInBits(); - assert(Count != 0 && "The number of element should not be zero."); - EVT SameEltTypeVT = - EVT::getVectorVT(Context, PartEltVT, Count, /*IsScalable=*/true); - Val = DAG.getNode(ISD::BITCAST, DL, SameEltTypeVT, Val); + if (PartVTBitSize > ValueVTBitSize) { + unsigned Count = PartVTBitSize / ValueEltVT.getFixedSizeInBits(); + assert(Count != 0 && "The number of element should not be zero."); + EVT SameEltTypeVT = + EVT::getVectorVT(Context, ValueEltVT, Count, /*IsScalable=*/true); + Val = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SameEltTypeVT, + DAG.getUNDEF(SameEltTypeVT), Val, + DAG.getVectorIdxConstant(0, DL)); + } + Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val); + } else { + Val = + DAG.getNode(ISD::INSERT_SUBVECTOR, DL, PartVT, DAG.getUNDEF(PartVT), + Val, DAG.getVectorIdxConstant(0, DL)); } - Val = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, PartVT, DAG.getUNDEF(PartVT), - Val, DAG.getConstant(0, DL, Subtarget.getXLenVT())); Parts[0] = Val; return true; } @@ -10141,19 +10270,23 @@ SDValue RISCVTargetLowering::joinRegisterPartsIntoValue( unsigned ValueVTBitSize = ValueVT.getSizeInBits().getKnownMinSize(); unsigned PartVTBitSize = PartVT.getSizeInBits().getKnownMinSize(); if (PartVTBitSize % ValueVTBitSize == 0) { + assert(PartVTBitSize >= ValueVTBitSize); EVT SameEltTypeVT = ValueVT; // If the element types are different, convert it to the same element type // of PartVT. + // Give an example here, we want copy a <vscale x 1 x i8> value from + // <vscale x 4 x i16>. + // We need to convert <vscale x 4 x i16> to <vscale x 8 x i8> first, + // then we can extract <vscale x 1 x i8>. 
if (ValueEltVT != PartEltVT) { - unsigned Count = ValueVTBitSize / PartEltVT.getSizeInBits(); + unsigned Count = PartVTBitSize / ValueEltVT.getFixedSizeInBits(); assert(Count != 0 && "The number of element should not be zero."); SameEltTypeVT = - EVT::getVectorVT(Context, PartEltVT, Count, /*IsScalable=*/true); + EVT::getVectorVT(Context, ValueEltVT, Count, /*IsScalable=*/true); + Val = DAG.getNode(ISD::BITCAST, DL, SameEltTypeVT, Val); } - Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SameEltTypeVT, Val, - DAG.getConstant(0, DL, Subtarget.getXLenVT())); - if (ValueEltVT != PartEltVT) - Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val); + Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ValueVT, Val, + DAG.getVectorIdxConstant(0, DL)); return Val; } } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 849928eb46ae..48c5ce730933 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -282,6 +282,11 @@ enum NodeType : unsigned { // the value read before the modification and the new chain pointer. SWAP_CSR, + // FP to 32 bit int conversions for RV64. These are used to keep track of the + // result being sign extended to 64 bit. These saturate out of range inputs. + STRICT_FCVT_W_RTZ_RV64 = ISD::FIRST_TARGET_STRICTFP_OPCODE, + STRICT_FCVT_WU_RTZ_RV64, + // Memory opcodes start here. 
VLE_VL = ISD::FIRST_TARGET_MEMORY_OPCODE, VSE_VL, @@ -315,7 +320,7 @@ public: bool isSExtCheaperThanZExt(EVT SrcVT, EVT DstVT) const override; bool isCheapToSpeculateCttz() const override; bool isCheapToSpeculateCtlz() const override; - bool hasAndNot(SDValue Y) const override; + bool hasAndNotCompare(SDValue Y) const override; bool shouldSinkOperands(Instruction *I, SmallVectorImpl<Use *> &Ops) const override; bool isFPImmLegal(const APFloat &Imm, EVT VT, @@ -383,6 +388,9 @@ public: EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override; + void AdjustInstrPostInstrSelection(MachineInstr &MI, + SDNode *Node) const override; + EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override; @@ -593,6 +601,8 @@ private: SDValue lowerToScalableOp(SDValue Op, SelectionDAG &DAG, unsigned NewOpc, bool HasMask = true) const; SDValue lowerVPOp(SDValue Op, SelectionDAG &DAG, unsigned RISCVISDOpc) const; + SDValue lowerLogicVPOp(SDValue Op, SelectionDAG &DAG, unsigned MaskOpc, + unsigned VecOpc) const; SDValue lowerFixedLengthVectorExtendToRVV(SDValue Op, SelectionDAG &DAG, unsigned ExtendOpc) const; SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormats.td b/llvm/lib/Target/RISCV/RISCVInstrFormats.td index cfad4cdb9364..6a16b6354f95 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrFormats.td +++ b/llvm/lib/Target/RISCV/RISCVInstrFormats.td @@ -107,31 +107,44 @@ def Vcompress : RISCVVConstraint<!or(VS2Constraint.Value, // The following opcode names match those given in Table 19.1 in the // RISC-V User-level ISA specification ("RISC-V base opcode map"). 
-class RISCVOpcode<bits<7> val> { +class RISCVOpcode<string name, bits<7> val> { + string Name = name; bits<7> Value = val; } -def OPC_LOAD : RISCVOpcode<0b0000011>; -def OPC_LOAD_FP : RISCVOpcode<0b0000111>; -def OPC_MISC_MEM : RISCVOpcode<0b0001111>; -def OPC_OP_IMM : RISCVOpcode<0b0010011>; -def OPC_AUIPC : RISCVOpcode<0b0010111>; -def OPC_OP_IMM_32 : RISCVOpcode<0b0011011>; -def OPC_STORE : RISCVOpcode<0b0100011>; -def OPC_STORE_FP : RISCVOpcode<0b0100111>; -def OPC_AMO : RISCVOpcode<0b0101111>; -def OPC_OP : RISCVOpcode<0b0110011>; -def OPC_LUI : RISCVOpcode<0b0110111>; -def OPC_OP_32 : RISCVOpcode<0b0111011>; -def OPC_MADD : RISCVOpcode<0b1000011>; -def OPC_MSUB : RISCVOpcode<0b1000111>; -def OPC_NMSUB : RISCVOpcode<0b1001011>; -def OPC_NMADD : RISCVOpcode<0b1001111>; -def OPC_OP_FP : RISCVOpcode<0b1010011>; -def OPC_OP_V : RISCVOpcode<0b1010111>; -def OPC_BRANCH : RISCVOpcode<0b1100011>; -def OPC_JALR : RISCVOpcode<0b1100111>; -def OPC_JAL : RISCVOpcode<0b1101111>; -def OPC_SYSTEM : RISCVOpcode<0b1110011>; +def RISCVOpcodesList : GenericTable { + let FilterClass = "RISCVOpcode"; + let Fields = [ + "Name", "Value" + ]; + let PrimaryKey = [ "Value" ]; + let PrimaryKeyName = "lookupRISCVOpcodeByValue"; +} +def lookupRISCVOpcodeByName : SearchIndex { + let Table = RISCVOpcodesList; + let Key = [ "Name" ]; +} +def OPC_LOAD : RISCVOpcode<"LOAD", 0b0000011>; +def OPC_LOAD_FP : RISCVOpcode<"LOAD_FP", 0b0000111>; +def OPC_MISC_MEM : RISCVOpcode<"MISC_MEM", 0b0001111>; +def OPC_OP_IMM : RISCVOpcode<"OP_IMM", 0b0010011>; +def OPC_AUIPC : RISCVOpcode<"AUIPC", 0b0010111>; +def OPC_OP_IMM_32 : RISCVOpcode<"OP_IMM_32", 0b0011011>; +def OPC_STORE : RISCVOpcode<"STORE", 0b0100011>; +def OPC_STORE_FP : RISCVOpcode<"STORE_FP", 0b0100111>; +def OPC_AMO : RISCVOpcode<"AMO", 0b0101111>; +def OPC_OP : RISCVOpcode<"OP", 0b0110011>; +def OPC_LUI : RISCVOpcode<"LUI", 0b0110111>; +def OPC_OP_32 : RISCVOpcode<"OP_32", 0b0111011>; +def OPC_MADD : RISCVOpcode<"MADD", 0b1000011>; +def 
OPC_MSUB : RISCVOpcode<"MSUB", 0b1000111>; +def OPC_NMSUB : RISCVOpcode<"NMSUB", 0b1001011>; +def OPC_NMADD : RISCVOpcode<"NMADD", 0b1001111>; +def OPC_OP_FP : RISCVOpcode<"OP_FP", 0b1010011>; +def OPC_OP_V : RISCVOpcode<"OP_V", 0b1010111>; +def OPC_BRANCH : RISCVOpcode<"BRANCH", 0b1100011>; +def OPC_JALR : RISCVOpcode<"JALR", 0b1100111>; +def OPC_JAL : RISCVOpcode<"JAL", 0b1101111>; +def OPC_SYSTEM : RISCVOpcode<"SYSTEM", 0b1110011>; class RVInst<dag outs, dag ins, string opcodestr, string argstr, list<dag> pattern, InstFormat format> @@ -188,8 +201,7 @@ class RVInst<dag outs, dag ins, string opcodestr, string argstr, // Pseudo instructions class Pseudo<dag outs, dag ins, list<dag> pattern, string opcodestr = "", string argstr = ""> - : RVInst<outs, ins, opcodestr, argstr, pattern, InstFormatPseudo>, - Sched<[]> { + : RVInst<outs, ins, opcodestr, argstr, pattern, InstFormatPseudo> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -265,14 +277,14 @@ class RVInstR4Frm<bits<2> funct2, RISCVOpcode opcode, dag outs, dag ins, bits<5> rs3; bits<5> rs2; bits<5> rs1; - bits<3> funct3; + bits<3> frm; bits<5> rd; let Inst{31-27} = rs3; let Inst{26-25} = funct2; let Inst{24-20} = rs2; let Inst{19-15} = rs1; - let Inst{14-12} = funct3; + let Inst{14-12} = frm; let Inst{11-7} = rd; let Opcode = opcode.Value; } @@ -300,13 +312,13 @@ class RVInstRFrm<bits<7> funct7, RISCVOpcode opcode, dag outs, dag ins, : RVInst<outs, ins, opcodestr, argstr, [], InstFormatR> { bits<5> rs2; bits<5> rs1; - bits<3> funct3; + bits<3> frm; bits<5> rd; let Inst{31-25} = funct7; let Inst{24-20} = rs2; let Inst{19-15} = rs1; - let Inst{14-12} = funct3; + let Inst{14-12} = frm; let Inst{11-7} = rd; let Opcode = opcode.Value; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormatsV.td b/llvm/lib/Target/RISCV/RISCVInstrFormatsV.td index 80f46b73bfd7..69e9d3553b30 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrFormatsV.td +++ b/llvm/lib/Target/RISCV/RISCVInstrFormatsV.td @@ -45,19 +45,6 @@ def SUMOPUnitStride : 
RISCVLSUMOP<0b00000>; def SUMOPUnitStrideMask : RISCVLSUMOP<0b01011>; def SUMOPUnitStrideWholeReg : RISCVLSUMOP<0b01000>; -class RISCVAMOOP<bits<5> val> { - bits<5> Value = val; -} -def AMOOPVamoSwap : RISCVAMOOP<0b00001>; -def AMOOPVamoAdd : RISCVAMOOP<0b00000>; -def AMOOPVamoXor : RISCVAMOOP<0b00100>; -def AMOOPVamoAnd : RISCVAMOOP<0b01100>; -def AMOOPVamoOr : RISCVAMOOP<0b01000>; -def AMOOPVamoMin : RISCVAMOOP<0b10000>; -def AMOOPVamoMax : RISCVAMOOP<0b10100>; -def AMOOPVamoMinu : RISCVAMOOP<0b11000>; -def AMOOPVamoMaxu : RISCVAMOOP<0b11100>; - class RISCVWidth<bits<4> val> { bits<4> Value = val; } @@ -342,22 +329,3 @@ class RVInstVSX<bits<3> nf, bit mew, RISCVMOP mop, bits<3> width, let Uses = [VTYPE, VL]; } - -class RVInstVAMO<RISCVAMOOP amoop, bits<3> width, dag outs, - dag ins, string opcodestr, string argstr> - : RVInst<outs, ins, opcodestr, argstr, [], InstFormatR> { - bits<5> vs2; - bits<5> rs1; - bit wd; - bit vm; - - let Inst{31-27} = amoop.Value; - let Inst{26} = wd; - let Inst{25} = vm; - let Inst{24-20} = vs2; - let Inst{19-15} = rs1; - let Inst{14-12} = width; - let Opcode = OPC_AMO.Value; - - let Uses = [VTYPE, VL]; -} diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 547d82550cac..2e2e00886d57 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -35,6 +35,7 @@ using namespace llvm; #include "RISCVGenCompressInstEmitter.inc" #define GET_INSTRINFO_CTOR_DTOR +#define GET_INSTRINFO_NAMED_OPS #include "RISCVGenInstrInfo.inc" static cl::opt<bool> PreferWholeRegisterMove( @@ -1059,6 +1060,7 @@ bool RISCVInstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { break; case RISCV::FSGNJ_D: case RISCV::FSGNJ_S: + case RISCV::FSGNJ_H: // The canonical floating-point move is fsgnj rd, rs, rs. 
return MI.getOperand(1).isReg() && MI.getOperand(2).isReg() && MI.getOperand(1).getReg() == MI.getOperand(2).getReg(); @@ -1087,6 +1089,7 @@ RISCVInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const { break; case RISCV::FSGNJ_D: case RISCV::FSGNJ_S: + case RISCV::FSGNJ_H: // The canonical floating-point move is fsgnj rd, rs, rs. if (MI.getOperand(1).isReg() && MI.getOperand(2).isReg() && MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) @@ -1254,7 +1257,7 @@ bool RISCVInstrInfo::isFunctionSafeToOutlineFrom( bool RISCVInstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB, unsigned &Flags) const { // More accurate safety checking is done in getOutliningCandidateInfo. - return true; + return TargetInstrInfo::isMBBSafeToOutlineFrom(MBB, Flags); } // Enum values indicating how an outlined call should be constructed. diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h index 2bfad7844c43..da0877c4299a 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h @@ -18,6 +18,7 @@ #include "llvm/IR/DiagnosticInfo.h" #define GET_INSTRINFO_HEADER +#define GET_INSTRINFO_OPERAND_ENUM #include "RISCVGenInstrInfo.inc" namespace llvm { @@ -181,6 +182,10 @@ protected: }; namespace RISCV { + +// Implemented in RISCVGenInstrInfo.inc +int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIndex); + // Special immediate for AVL operand of V pseudo instructions to indicate VLMax. 
static constexpr int64_t VLMaxSentinel = -1LL; } // namespace RISCV diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 6f9cde966132..71eb6f01a4f4 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -174,6 +174,20 @@ def uimm5 : Operand<XLenVT>, ImmLeaf<XLenVT, [{return isUInt<5>(Imm);}]> { let OperandNamespace = "RISCVOp"; } +def InsnDirectiveOpcode : AsmOperandClass { + let Name = "InsnDirectiveOpcode"; + let ParserMethod = "parseInsnDirectiveOpcode"; + let RenderMethod = "addImmOperands"; + let PredicateMethod = "isImm"; +} + +def uimm7_opcode : Operand<XLenVT> { + let ParserMatchClass = InsnDirectiveOpcode; + let DecoderMethod = "decodeUImmOperand<7>"; + let OperandType = "OPERAND_UIMM7"; + let OperandNamespace = "RISCVOp"; +} + def uimm7 : Operand<XLenVT> { let ParserMatchClass = UImmAsmOperand<7>; let DecoderMethod = "decodeUImmOperand<7>"; @@ -878,35 +892,35 @@ def : InstAlias<"zext.b $rd, $rs", (ANDI GPR:$rd, GPR:$rs, 0xFF), 0>; // isCodeGenOnly = 1 to hide them from the tablegened assembly parser. 
let isCodeGenOnly = 1, hasSideEffects = 1, mayLoad = 1, mayStore = 1, hasNoSchedulingInfo = 1 in { -def InsnR : DirectiveInsnR<(outs AnyReg:$rd), (ins uimm7:$opcode, uimm3:$funct3, +def InsnR : DirectiveInsnR<(outs AnyReg:$rd), (ins uimm7_opcode:$opcode, uimm3:$funct3, uimm7:$funct7, AnyReg:$rs1, AnyReg:$rs2), "$opcode, $funct3, $funct7, $rd, $rs1, $rs2">; -def InsnR4 : DirectiveInsnR4<(outs AnyReg:$rd), (ins uimm7:$opcode, +def InsnR4 : DirectiveInsnR4<(outs AnyReg:$rd), (ins uimm7_opcode:$opcode, uimm3:$funct3, uimm2:$funct2, AnyReg:$rs1, AnyReg:$rs2, AnyReg:$rs3), "$opcode, $funct3, $funct2, $rd, $rs1, $rs2, $rs3">; -def InsnI : DirectiveInsnI<(outs AnyReg:$rd), (ins uimm7:$opcode, uimm3:$funct3, +def InsnI : DirectiveInsnI<(outs AnyReg:$rd), (ins uimm7_opcode:$opcode, uimm3:$funct3, AnyReg:$rs1, simm12:$imm12), "$opcode, $funct3, $rd, $rs1, $imm12">; -def InsnI_Mem : DirectiveInsnI<(outs AnyReg:$rd), (ins uimm7:$opcode, +def InsnI_Mem : DirectiveInsnI<(outs AnyReg:$rd), (ins uimm7_opcode:$opcode, uimm3:$funct3, AnyReg:$rs1, simm12:$imm12), "$opcode, $funct3, $rd, ${imm12}(${rs1})">; -def InsnB : DirectiveInsnB<(outs), (ins uimm7:$opcode, uimm3:$funct3, +def InsnB : DirectiveInsnB<(outs), (ins uimm7_opcode:$opcode, uimm3:$funct3, AnyReg:$rs1, AnyReg:$rs2, simm13_lsb0:$imm12), "$opcode, $funct3, $rs1, $rs2, $imm12">; -def InsnU : DirectiveInsnU<(outs AnyReg:$rd), (ins uimm7:$opcode, +def InsnU : DirectiveInsnU<(outs AnyReg:$rd), (ins uimm7_opcode:$opcode, uimm20_lui:$imm20), "$opcode, $rd, $imm20">; -def InsnJ : DirectiveInsnJ<(outs AnyReg:$rd), (ins uimm7:$opcode, +def InsnJ : DirectiveInsnJ<(outs AnyReg:$rd), (ins uimm7_opcode:$opcode, simm21_lsb0_jal:$imm20), "$opcode, $rd, $imm20">; -def InsnS : DirectiveInsnS<(outs), (ins uimm7:$opcode, uimm3:$funct3, +def InsnS : DirectiveInsnS<(outs), (ins uimm7_opcode:$opcode, uimm3:$funct3, AnyReg:$rs2, AnyReg:$rs1, simm12:$imm12), "$opcode, $funct3, $rs2, ${imm12}(${rs1})">; @@ -918,37 +932,37 @@ def InsnS : 
DirectiveInsnS<(outs), (ins uimm7:$opcode, uimm3:$funct3, // for known formats. let EmitPriority = 0 in { def : InstAlias<".insn_r $opcode, $funct3, $funct7, $rd, $rs1, $rs2", - (InsnR AnyReg:$rd, uimm7:$opcode, uimm3:$funct3, uimm7:$funct7, + (InsnR AnyReg:$rd, uimm7_opcode:$opcode, uimm3:$funct3, uimm7:$funct7, AnyReg:$rs1, AnyReg:$rs2)>; // Accept 4 register form of ".insn r" as alias for ".insn r4". def : InstAlias<".insn_r $opcode, $funct3, $funct2, $rd, $rs1, $rs2, $rs3", - (InsnR4 AnyReg:$rd, uimm7:$opcode, uimm3:$funct3, uimm2:$funct2, + (InsnR4 AnyReg:$rd, uimm7_opcode:$opcode, uimm3:$funct3, uimm2:$funct2, AnyReg:$rs1, AnyReg:$rs2, AnyReg:$rs3)>; def : InstAlias<".insn_r4 $opcode, $funct3, $funct2, $rd, $rs1, $rs2, $rs3", - (InsnR4 AnyReg:$rd, uimm7:$opcode, uimm3:$funct3, uimm2:$funct2, + (InsnR4 AnyReg:$rd, uimm7_opcode:$opcode, uimm3:$funct3, uimm2:$funct2, AnyReg:$rs1, AnyReg:$rs2, AnyReg:$rs3)>; def : InstAlias<".insn_i $opcode, $funct3, $rd, $rs1, $imm12", - (InsnI AnyReg:$rd, uimm7:$opcode, uimm3:$funct3, AnyReg:$rs1, + (InsnI AnyReg:$rd, uimm7_opcode:$opcode, uimm3:$funct3, AnyReg:$rs1, simm12:$imm12)>; def : InstAlias<".insn_i $opcode, $funct3, $rd, ${imm12}(${rs1})", - (InsnI_Mem AnyReg:$rd, uimm7:$opcode, uimm3:$funct3, + (InsnI_Mem AnyReg:$rd, uimm7_opcode:$opcode, uimm3:$funct3, AnyReg:$rs1, simm12:$imm12)>; def : InstAlias<".insn_b $opcode, $funct3, $rs1, $rs2, $imm12", - (InsnB uimm7:$opcode, uimm3:$funct3, AnyReg:$rs1, + (InsnB uimm7_opcode:$opcode, uimm3:$funct3, AnyReg:$rs1, AnyReg:$rs2, simm13_lsb0:$imm12)>; // Accept sb as an alias for b. 
def : InstAlias<".insn_sb $opcode, $funct3, $rs1, $rs2, $imm12", - (InsnB uimm7:$opcode, uimm3:$funct3, AnyReg:$rs1, + (InsnB uimm7_opcode:$opcode, uimm3:$funct3, AnyReg:$rs1, AnyReg:$rs2, simm13_lsb0:$imm12)>; def : InstAlias<".insn_u $opcode, $rd, $imm20", - (InsnU AnyReg:$rd, uimm7:$opcode, uimm20_lui:$imm20)>; + (InsnU AnyReg:$rd, uimm7_opcode:$opcode, uimm20_lui:$imm20)>; def : InstAlias<".insn_j $opcode, $rd, $imm20", - (InsnJ AnyReg:$rd, uimm7:$opcode, simm21_lsb0_jal:$imm20)>; + (InsnJ AnyReg:$rd, uimm7_opcode:$opcode, simm21_lsb0_jal:$imm20)>; // Accept uj as an alias for j. def : InstAlias<".insn_uj $opcode, $rd, $imm20", - (InsnJ AnyReg:$rd, uimm7:$opcode, simm21_lsb0_jal:$imm20)>; + (InsnJ AnyReg:$rd, uimm7_opcode:$opcode, simm21_lsb0_jal:$imm20)>; def : InstAlias<".insn_s $opcode, $funct3, $rs2, ${imm12}(${rs1})", - (InsnS uimm7:$opcode, uimm3:$funct3, AnyReg:$rs2, + (InsnS uimm7_opcode:$opcode, uimm3:$funct3, AnyReg:$rs2, AnyReg:$rs1, simm12:$imm12)>; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td index 2cd011a02345..d6c31c4804db 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td @@ -26,41 +26,6 @@ def RISCVBuildPairF64 : SDNode<"RISCVISD::BuildPairF64", SDT_RISCVBuildPairF64>; def RISCVSplitF64 : SDNode<"RISCVISD::SplitF64", SDT_RISCVSplitF64>; //===----------------------------------------------------------------------===// -// Instruction Class Templates -//===----------------------------------------------------------------------===// - -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class FPFMAD_rrr_frm<RISCVOpcode opcode, string opcodestr> - : RVInstR4Frm<0b01, opcode, (outs FPR64:$rd), - (ins FPR64:$rs1, FPR64:$rs2, FPR64:$rs3, frmarg:$funct3), - opcodestr, "$rd, $rs1, $rs2, $rs3, $funct3">; - -class FPFMADDynFrmAlias<FPFMAD_rrr_frm Inst, string OpcodeStr> - : InstAlias<OpcodeStr#" $rd, $rs1, $rs2, $rs3", - (Inst FPR64:$rd, FPR64:$rs1, 
FPR64:$rs2, FPR64:$rs3, 0b111)>; - -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class FPALUD_rr<bits<7> funct7, bits<3> funct3, string opcodestr> - : RVInstR<funct7, funct3, OPC_OP_FP, (outs FPR64:$rd), - (ins FPR64:$rs1, FPR64:$rs2), opcodestr, "$rd, $rs1, $rs2">; - -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class FPALUD_rr_frm<bits<7> funct7, string opcodestr> - : RVInstRFrm<funct7, OPC_OP_FP, (outs FPR64:$rd), - (ins FPR64:$rs1, FPR64:$rs2, frmarg:$funct3), opcodestr, - "$rd, $rs1, $rs2, $funct3">; - -class FPALUDDynFrmAlias<FPALUD_rr_frm Inst, string OpcodeStr> - : InstAlias<OpcodeStr#" $rd, $rs1, $rs2", - (Inst FPR64:$rd, FPR64:$rs1, FPR64:$rs2, 0b111)>; - -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class FPCmpD_rr<bits<3> funct3, string opcodestr> - : RVInstR<0b1010001, funct3, OPC_OP_FP, (outs GPR:$rd), - (ins FPR64:$rs1, FPR64:$rs2), opcodestr, "$rd, $rs1, $rs2">, - Sched<[WriteFCmp64, ReadFCmp64, ReadFCmp64]>; - -//===----------------------------------------------------------------------===// // Instructions //===----------------------------------------------------------------------===// @@ -81,126 +46,104 @@ def FSD : RVInstS<0b011, OPC_STORE_FP, (outs), "fsd", "$rs2, ${imm12}(${rs1})">, Sched<[WriteFST64, ReadStoreData, ReadFMemBase]>; -def FMADD_D : FPFMAD_rrr_frm<OPC_MADD, "fmadd.d">, - Sched<[WriteFMA64, ReadFMA64, ReadFMA64, ReadFMA64]>; -def : FPFMADDynFrmAlias<FMADD_D, "fmadd.d">; -def FMSUB_D : FPFMAD_rrr_frm<OPC_MSUB, "fmsub.d">, - Sched<[WriteFMA64, ReadFMA64, ReadFMA64, ReadFMA64]>; -def : FPFMADDynFrmAlias<FMSUB_D, "fmsub.d">; -def FNMSUB_D : FPFMAD_rrr_frm<OPC_NMSUB, "fnmsub.d">, - Sched<[WriteFMA64, ReadFMA64, ReadFMA64, ReadFMA64]>; -def : FPFMADDynFrmAlias<FNMSUB_D, "fnmsub.d">; -def FNMADD_D : FPFMAD_rrr_frm<OPC_NMADD, "fnmadd.d">, - Sched<[WriteFMA64, ReadFMA64, ReadFMA64, ReadFMA64]>; -def : FPFMADDynFrmAlias<FNMADD_D, "fnmadd.d">; +let SchedRW = [WriteFMA64, ReadFMA64, ReadFMA64, ReadFMA64] in { +def 
FMADD_D : FPFMA_rrr_frm<OPC_MADD, 0b01, "fmadd.d", FPR64>; +def FMSUB_D : FPFMA_rrr_frm<OPC_MSUB, 0b01, "fmsub.d", FPR64>; +def FNMSUB_D : FPFMA_rrr_frm<OPC_NMSUB, 0b01, "fnmsub.d", FPR64>; +def FNMADD_D : FPFMA_rrr_frm<OPC_NMADD, 0b01, "fnmadd.d", FPR64>; +} + +def : FPFMADynFrmAlias<FMADD_D, "fmadd.d", FPR64>; +def : FPFMADynFrmAlias<FMSUB_D, "fmsub.d", FPR64>; +def : FPFMADynFrmAlias<FNMSUB_D, "fnmsub.d", FPR64>; +def : FPFMADynFrmAlias<FNMADD_D, "fnmadd.d", FPR64>; -def FADD_D : FPALUD_rr_frm<0b0000001, "fadd.d">, +def FADD_D : FPALU_rr_frm<0b0000001, "fadd.d", FPR64>, Sched<[WriteFALU64, ReadFALU64, ReadFALU64]>; -def : FPALUDDynFrmAlias<FADD_D, "fadd.d">; -def FSUB_D : FPALUD_rr_frm<0b0000101, "fsub.d">, +def FSUB_D : FPALU_rr_frm<0b0000101, "fsub.d", FPR64>, Sched<[WriteFALU64, ReadFALU64, ReadFALU64]>; -def : FPALUDDynFrmAlias<FSUB_D, "fsub.d">; -def FMUL_D : FPALUD_rr_frm<0b0001001, "fmul.d">, +def FMUL_D : FPALU_rr_frm<0b0001001, "fmul.d", FPR64>, Sched<[WriteFMul64, ReadFMul64, ReadFMul64]>; -def : FPALUDDynFrmAlias<FMUL_D, "fmul.d">; -def FDIV_D : FPALUD_rr_frm<0b0001101, "fdiv.d">, +def FDIV_D : FPALU_rr_frm<0b0001101, "fdiv.d", FPR64>, Sched<[WriteFDiv64, ReadFDiv64, ReadFDiv64]>; -def : FPALUDDynFrmAlias<FDIV_D, "fdiv.d">; -def FSQRT_D : FPUnaryOp_r_frm<0b0101101, FPR64, FPR64, "fsqrt.d">, - Sched<[WriteFSqrt64, ReadFSqrt64]> { - let rs2 = 0b00000; -} -def : FPUnaryOpDynFrmAlias<FSQRT_D, "fsqrt.d", FPR64, FPR64>; +def : FPALUDynFrmAlias<FADD_D, "fadd.d", FPR64>; +def : FPALUDynFrmAlias<FSUB_D, "fsub.d", FPR64>; +def : FPALUDynFrmAlias<FMUL_D, "fmul.d", FPR64>; +def : FPALUDynFrmAlias<FDIV_D, "fdiv.d", FPR64>; -def FSGNJ_D : FPALUD_rr<0b0010001, 0b000, "fsgnj.d">, - Sched<[WriteFSGNJ64, ReadFSGNJ64, ReadFSGNJ64]>; -def FSGNJN_D : FPALUD_rr<0b0010001, 0b001, "fsgnjn.d">, - Sched<[WriteFSGNJ64, ReadFSGNJ64, ReadFSGNJ64]>; -def FSGNJX_D : FPALUD_rr<0b0010001, 0b010, "fsgnjx.d">, - Sched<[WriteFSGNJ64, ReadFSGNJ64, ReadFSGNJ64]>; -def FMIN_D : 
FPALUD_rr<0b0010101, 0b000, "fmin.d">, - Sched<[WriteFMinMax64, ReadFMinMax64, ReadFMinMax64]>; -def FMAX_D : FPALUD_rr<0b0010101, 0b001, "fmax.d">, - Sched<[WriteFMinMax64, ReadFMinMax64, ReadFMinMax64]>; +def FSQRT_D : FPUnaryOp_r_frm<0b0101101, 0b00000, FPR64, FPR64, "fsqrt.d">, + Sched<[WriteFSqrt64, ReadFSqrt64]>; +def : FPUnaryOpDynFrmAlias<FSQRT_D, "fsqrt.d", FPR64, FPR64>; -def FCVT_S_D : FPUnaryOp_r_frm<0b0100000, FPR32, FPR64, "fcvt.s.d">, - Sched<[WriteFCvtF64ToF32, ReadFCvtF64ToF32]> { - let rs2 = 0b00001; +let SchedRW = [WriteFSGNJ64, ReadFSGNJ64, ReadFSGNJ64], + mayRaiseFPException = 0 in { +def FSGNJ_D : FPALU_rr<0b0010001, 0b000, "fsgnj.d", FPR64>; +def FSGNJN_D : FPALU_rr<0b0010001, 0b001, "fsgnjn.d", FPR64>; +def FSGNJX_D : FPALU_rr<0b0010001, 0b010, "fsgnjx.d", FPR64>; } -def : FPUnaryOpDynFrmAlias<FCVT_S_D, "fcvt.s.d", FPR32, FPR64>; -def FCVT_D_S : FPUnaryOp_r<0b0100001, 0b000, FPR64, FPR32, "fcvt.d.s">, - Sched<[WriteFCvtF32ToF64, ReadFCvtF32ToF64]> { - let rs2 = 0b00000; +let SchedRW = [WriteFMinMax64, ReadFMinMax64, ReadFMinMax64] in { +def FMIN_D : FPALU_rr<0b0010101, 0b000, "fmin.d", FPR64>; +def FMAX_D : FPALU_rr<0b0010101, 0b001, "fmax.d", FPR64>; } -def FEQ_D : FPCmpD_rr<0b010, "feq.d">; -def FLT_D : FPCmpD_rr<0b001, "flt.d">; -def FLE_D : FPCmpD_rr<0b000, "fle.d">; +def FCVT_S_D : FPUnaryOp_r_frm<0b0100000, 0b00001, FPR32, FPR64, "fcvt.s.d">, + Sched<[WriteFCvtF64ToF32, ReadFCvtF64ToF32]>; +def : FPUnaryOpDynFrmAlias<FCVT_S_D, "fcvt.s.d", FPR32, FPR64>; + +def FCVT_D_S : FPUnaryOp_r<0b0100001, 0b00000, 0b000, FPR64, FPR32, "fcvt.d.s">, + Sched<[WriteFCvtF32ToF64, ReadFCvtF32ToF64]>; -def FCLASS_D : FPUnaryOp_r<0b1110001, 0b001, GPR, FPR64, "fclass.d">, - Sched<[WriteFClass64, ReadFClass64]> { - let rs2 = 0b00000; +let SchedRW = [WriteFCmp64, ReadFCmp64, ReadFCmp64] in { +def FEQ_D : FPCmp_rr<0b1010001, 0b010, "feq.d", FPR64>; +def FLT_D : FPCmp_rr<0b1010001, 0b001, "flt.d", FPR64>; +def FLE_D : FPCmp_rr<0b1010001, 0b000, "fle.d", 
FPR64>; } -def FCVT_W_D : FPUnaryOp_r_frm<0b1100001, GPR, FPR64, "fcvt.w.d">, - Sched<[WriteFCvtF64ToI32, ReadFCvtF64ToI32]> { - let rs2 = 0b00000; -} +let mayRaiseFPException = 0 in +def FCLASS_D : FPUnaryOp_r<0b1110001, 0b00000, 0b001, GPR, FPR64, "fclass.d">, + Sched<[WriteFClass64, ReadFClass64]>; + +def FCVT_W_D : FPUnaryOp_r_frm<0b1100001, 0b00000, GPR, FPR64, "fcvt.w.d">, + Sched<[WriteFCvtF64ToI32, ReadFCvtF64ToI32]>; def : FPUnaryOpDynFrmAlias<FCVT_W_D, "fcvt.w.d", GPR, FPR64>; -def FCVT_WU_D : FPUnaryOp_r_frm<0b1100001, GPR, FPR64, "fcvt.wu.d">, - Sched<[WriteFCvtF64ToI32, ReadFCvtF64ToI32]> { - let rs2 = 0b00001; -} +def FCVT_WU_D : FPUnaryOp_r_frm<0b1100001, 0b00001, GPR, FPR64, "fcvt.wu.d">, + Sched<[WriteFCvtF64ToI32, ReadFCvtF64ToI32]>; def : FPUnaryOpDynFrmAlias<FCVT_WU_D, "fcvt.wu.d", GPR, FPR64>; -def FCVT_D_W : FPUnaryOp_r<0b1101001, 0b000, FPR64, GPR, "fcvt.d.w">, - Sched<[WriteFCvtI32ToF64, ReadFCvtI32ToF64]> { - let rs2 = 0b00000; -} +def FCVT_D_W : FPUnaryOp_r<0b1101001, 0b00000, 0b000, FPR64, GPR, "fcvt.d.w">, + Sched<[WriteFCvtI32ToF64, ReadFCvtI32ToF64]>; -def FCVT_D_WU : FPUnaryOp_r<0b1101001, 0b000, FPR64, GPR, "fcvt.d.wu">, - Sched<[WriteFCvtI32ToF64, ReadFCvtI32ToF64]> { - let rs2 = 0b00001; -} +def FCVT_D_WU : FPUnaryOp_r<0b1101001, 0b00001, 0b000, FPR64, GPR, "fcvt.d.wu">, + Sched<[WriteFCvtI32ToF64, ReadFCvtI32ToF64]>; } // Predicates = [HasStdExtD] let Predicates = [HasStdExtD, IsRV64] in { -def FCVT_L_D : FPUnaryOp_r_frm<0b1100001, GPR, FPR64, "fcvt.l.d">, - Sched<[WriteFCvtF64ToI64, ReadFCvtF64ToI64]> { - let rs2 = 0b00010; -} +def FCVT_L_D : FPUnaryOp_r_frm<0b1100001, 0b00010, GPR, FPR64, "fcvt.l.d">, + Sched<[WriteFCvtF64ToI64, ReadFCvtF64ToI64]>; def : FPUnaryOpDynFrmAlias<FCVT_L_D, "fcvt.l.d", GPR, FPR64>; -def FCVT_LU_D : FPUnaryOp_r_frm<0b1100001, GPR, FPR64, "fcvt.lu.d">, - Sched<[WriteFCvtF64ToI64, ReadFCvtF64ToI64]> { - let rs2 = 0b00011; -} +def FCVT_LU_D : FPUnaryOp_r_frm<0b1100001, 0b00011, GPR, FPR64, "fcvt.lu.d">, + 
Sched<[WriteFCvtF64ToI64, ReadFCvtF64ToI64]>; def : FPUnaryOpDynFrmAlias<FCVT_LU_D, "fcvt.lu.d", GPR, FPR64>; -def FMV_X_D : FPUnaryOp_r<0b1110001, 0b000, GPR, FPR64, "fmv.x.d">, - Sched<[WriteFMovF64ToI64, ReadFMovF64ToI64]> { - let rs2 = 0b00000; -} +let mayRaiseFPException = 0 in +def FMV_X_D : FPUnaryOp_r<0b1110001, 0b00000, 0b000, GPR, FPR64, "fmv.x.d">, + Sched<[WriteFMovF64ToI64, ReadFMovF64ToI64]>; -def FCVT_D_L : FPUnaryOp_r_frm<0b1101001, FPR64, GPR, "fcvt.d.l">, - Sched<[WriteFCvtI64ToF64, ReadFCvtI64ToF64]> { - let rs2 = 0b00010; -} +def FCVT_D_L : FPUnaryOp_r_frm<0b1101001, 0b00010, FPR64, GPR, "fcvt.d.l">, + Sched<[WriteFCvtI64ToF64, ReadFCvtI64ToF64]>; def : FPUnaryOpDynFrmAlias<FCVT_D_L, "fcvt.d.l", FPR64, GPR>; -def FCVT_D_LU : FPUnaryOp_r_frm<0b1101001, FPR64, GPR, "fcvt.d.lu">, - Sched<[WriteFCvtI64ToF64, ReadFCvtI64ToF64]> { - let rs2 = 0b00011; -} +def FCVT_D_LU : FPUnaryOp_r_frm<0b1101001, 0b00011, FPR64, GPR, "fcvt.d.lu">, + Sched<[WriteFCvtI64ToF64, ReadFCvtI64ToF64]>; def : FPUnaryOpDynFrmAlias<FCVT_D_LU, "fcvt.d.lu", FPR64, GPR>; -def FMV_D_X : FPUnaryOp_r<0b1111001, 0b000, FPR64, GPR, "fmv.d.x">, - Sched<[WriteFMovI64ToF64, ReadFMovI64ToF64]> { - let rs2 = 0b00000; -} +let mayRaiseFPException = 0 in +def FMV_D_X : FPUnaryOp_r<0b1111001, 0b00000, 0b000, FPR64, GPR, "fmv.d.x">, + Sched<[WriteFMovI64ToF64, ReadFMovI64ToF64]>; } // Predicates = [HasStdExtD, IsRV64] //===----------------------------------------------------------------------===// @@ -241,20 +184,20 @@ let Predicates = [HasStdExtD] in { /// Float conversion operations // f64 -> f32, f32 -> f64 -def : Pat<(fpround FPR64:$rs1), (FCVT_S_D FPR64:$rs1, 0b111)>; -def : Pat<(fpextend FPR32:$rs1), (FCVT_D_S FPR32:$rs1)>; +def : Pat<(any_fpround FPR64:$rs1), (FCVT_S_D FPR64:$rs1, 0b111)>; +def : Pat<(any_fpextend FPR32:$rs1), (FCVT_D_S FPR32:$rs1)>; // [u]int<->double conversion patterns must be gated on IsRV32 or IsRV64, so // are defined later. 
/// Float arithmetic operations -def : PatFpr64Fpr64DynFrm<fadd, FADD_D>; -def : PatFpr64Fpr64DynFrm<fsub, FSUB_D>; -def : PatFpr64Fpr64DynFrm<fmul, FMUL_D>; -def : PatFpr64Fpr64DynFrm<fdiv, FDIV_D>; +def : PatFpr64Fpr64DynFrm<any_fadd, FADD_D>; +def : PatFpr64Fpr64DynFrm<any_fsub, FSUB_D>; +def : PatFpr64Fpr64DynFrm<any_fmul, FMUL_D>; +def : PatFpr64Fpr64DynFrm<any_fdiv, FDIV_D>; -def : Pat<(fsqrt FPR64:$rs1), (FSQRT_D FPR64:$rs1, 0b111)>; +def : Pat<(any_fsqrt FPR64:$rs1), (FSQRT_D FPR64:$rs1, 0b111)>; def : Pat<(fneg FPR64:$rs1), (FSGNJN_D $rs1, $rs1)>; def : Pat<(fabs FPR64:$rs1), (FSGNJX_D $rs1, $rs1)>; @@ -266,19 +209,19 @@ def : Pat<(fcopysign FPR32:$rs1, FPR64:$rs2), (FSGNJ_S $rs1, (FCVT_S_D $rs2, 0b111))>; // fmadd: rs1 * rs2 + rs3 -def : Pat<(fma FPR64:$rs1, FPR64:$rs2, FPR64:$rs3), +def : Pat<(any_fma FPR64:$rs1, FPR64:$rs2, FPR64:$rs3), (FMADD_D $rs1, $rs2, $rs3, 0b111)>; // fmsub: rs1 * rs2 - rs3 -def : Pat<(fma FPR64:$rs1, FPR64:$rs2, (fneg FPR64:$rs3)), +def : Pat<(any_fma FPR64:$rs1, FPR64:$rs2, (fneg FPR64:$rs3)), (FMSUB_D FPR64:$rs1, FPR64:$rs2, FPR64:$rs3, 0b111)>; // fnmsub: -rs1 * rs2 + rs3 -def : Pat<(fma (fneg FPR64:$rs1), FPR64:$rs2, FPR64:$rs3), +def : Pat<(any_fma (fneg FPR64:$rs1), FPR64:$rs2, FPR64:$rs3), (FNMSUB_D FPR64:$rs1, FPR64:$rs2, FPR64:$rs3, 0b111)>; // fnmadd: -rs1 * rs2 - rs3 -def : Pat<(fma (fneg FPR64:$rs1), FPR64:$rs2, (fneg FPR64:$rs3)), +def : Pat<(any_fma (fneg FPR64:$rs1), FPR64:$rs2, (fneg FPR64:$rs3)), (FNMADD_D FPR64:$rs1, FPR64:$rs2, FPR64:$rs3, 0b111)>; // The ratified 20191213 ISA spec defines fmin and fmax in a way that matches @@ -328,8 +271,8 @@ let Predicates = [HasStdExtD, IsRV32] in { def : Pat<(f64 (fpimm0)), (FCVT_D_W (i32 X0))>; // double->[u]int. Round-to-zero must be used. 
-def : Pat<(i32 (fp_to_sint FPR64:$rs1)), (FCVT_W_D FPR64:$rs1, 0b001)>; -def : Pat<(i32 (fp_to_uint FPR64:$rs1)), (FCVT_WU_D FPR64:$rs1, 0b001)>; +def : Pat<(i32 (any_fp_to_sint FPR64:$rs1)), (FCVT_W_D FPR64:$rs1, 0b001)>; +def : Pat<(i32 (any_fp_to_uint FPR64:$rs1)), (FCVT_WU_D FPR64:$rs1, 0b001)>; // Saturating double->[u]int32. def : Pat<(i32 (riscv_fcvt_x_rtz FPR64:$rs1)), (FCVT_W_D $rs1, 0b001)>; @@ -342,8 +285,8 @@ def : Pat<(i32 (lrint FPR64:$rs1)), (FCVT_W_D $rs1, 0b111)>; def : Pat<(i32 (lround FPR64:$rs1)), (FCVT_W_D $rs1, 0b100)>; // [u]int->double. -def : Pat<(sint_to_fp (i32 GPR:$rs1)), (FCVT_D_W GPR:$rs1)>; -def : Pat<(uint_to_fp (i32 GPR:$rs1)), (FCVT_D_WU GPR:$rs1)>; +def : Pat<(any_sint_to_fp (i32 GPR:$rs1)), (FCVT_D_W GPR:$rs1)>; +def : Pat<(any_uint_to_fp (i32 GPR:$rs1)), (FCVT_D_WU GPR:$rs1)>; } // Predicates = [HasStdExtD, IsRV32] let Predicates = [HasStdExtD, IsRV64] in { @@ -358,20 +301,20 @@ def : Pat<(i64 (bitconvert FPR64:$rs1)), (FMV_X_D FPR64:$rs1)>; // Use target specific isd nodes to help us remember the result is sign // extended. Matching sext_inreg+fptoui/fptosi may cause the conversion to be // duplicated if it has another user that didn't need the sign_extend. -def : Pat<(riscv_fcvt_w_rtz_rv64 FPR64:$rs1), (FCVT_W_D $rs1, 0b001)>; -def : Pat<(riscv_fcvt_wu_rtz_rv64 FPR64:$rs1), (FCVT_WU_D $rs1, 0b001)>; +def : Pat<(riscv_any_fcvt_w_rtz_rv64 FPR64:$rs1), (FCVT_W_D $rs1, 0b001)>; +def : Pat<(riscv_any_fcvt_wu_rtz_rv64 FPR64:$rs1), (FCVT_WU_D $rs1, 0b001)>; // [u]int32->fp -def : Pat<(sint_to_fp (i64 (sexti32 (i64 GPR:$rs1)))), (FCVT_D_W $rs1)>; -def : Pat<(uint_to_fp (i64 (zexti32 (i64 GPR:$rs1)))), (FCVT_D_WU $rs1)>; +def : Pat<(any_sint_to_fp (i64 (sexti32 (i64 GPR:$rs1)))), (FCVT_D_W $rs1)>; +def : Pat<(any_uint_to_fp (i64 (zexti32 (i64 GPR:$rs1)))), (FCVT_D_WU $rs1)>; // Saturating double->[u]int64. 
def : Pat<(i64 (riscv_fcvt_x_rtz FPR64:$rs1)), (FCVT_L_D $rs1, 0b001)>; def : Pat<(i64 (riscv_fcvt_xu_rtz FPR64:$rs1)), (FCVT_LU_D $rs1, 0b001)>; // double->[u]int64. Round-to-zero must be used. -def : Pat<(i64 (fp_to_sint FPR64:$rs1)), (FCVT_L_D FPR64:$rs1, 0b001)>; -def : Pat<(i64 (fp_to_uint FPR64:$rs1)), (FCVT_LU_D FPR64:$rs1, 0b001)>; +def : Pat<(i64 (any_fp_to_sint FPR64:$rs1)), (FCVT_L_D FPR64:$rs1, 0b001)>; +def : Pat<(i64 (any_fp_to_uint FPR64:$rs1)), (FCVT_LU_D FPR64:$rs1, 0b001)>; // double->int64 with current rounding mode. def : Pat<(i64 (lrint FPR64:$rs1)), (FCVT_L_D $rs1, 0b111)>; @@ -382,6 +325,6 @@ def : Pat<(i64 (lround FPR64:$rs1)), (FCVT_L_D $rs1, 0b100)>; def : Pat<(i64 (llround FPR64:$rs1)), (FCVT_L_D $rs1, 0b100)>; // [u]int64->fp. Match GCC and default to using dynamic rounding mode. -def : Pat<(sint_to_fp (i64 GPR:$rs1)), (FCVT_D_L GPR:$rs1, 0b111)>; -def : Pat<(uint_to_fp (i64 GPR:$rs1)), (FCVT_D_LU GPR:$rs1, 0b111)>; +def : Pat<(any_sint_to_fp (i64 GPR:$rs1)), (FCVT_D_L GPR:$rs1, 0b111)>; +def : Pat<(any_uint_to_fp (i64 GPR:$rs1)), (FCVT_D_LU GPR:$rs1, 0b111)>; } // Predicates = [HasStdExtD, IsRV64] diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td index 3400c3be52bf..bb45ed859442 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td @@ -19,9 +19,9 @@ def SDT_RISCVFMV_W_X_RV64 : SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisVT<1, i64>]>; def SDT_RISCVFMV_X_ANYEXTW_RV64 : SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisVT<1, f32>]>; -def STD_RISCVFCVT_W_RV64 +def SDT_RISCVFCVT_W_RV64 : SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisFP<1>]>; -def STD_RISCVFCVT_X +def SDT_RISCVFCVT_X : SDTypeProfile<1, 1, [SDTCisVT<0, XLenVT>, SDTCisFP<1>]>; def riscv_fmv_w_x_rv64 @@ -29,13 +29,27 @@ def riscv_fmv_w_x_rv64 def riscv_fmv_x_anyextw_rv64 : SDNode<"RISCVISD::FMV_X_ANYEXTW_RV64", SDT_RISCVFMV_X_ANYEXTW_RV64>; def riscv_fcvt_w_rtz_rv64 - : 
SDNode<"RISCVISD::FCVT_W_RTZ_RV64", STD_RISCVFCVT_W_RV64>; + : SDNode<"RISCVISD::FCVT_W_RTZ_RV64", SDT_RISCVFCVT_W_RV64>; def riscv_fcvt_wu_rtz_rv64 - : SDNode<"RISCVISD::FCVT_WU_RTZ_RV64", STD_RISCVFCVT_W_RV64>; + : SDNode<"RISCVISD::FCVT_WU_RTZ_RV64", SDT_RISCVFCVT_W_RV64>; def riscv_fcvt_x_rtz - : SDNode<"RISCVISD::FCVT_X_RTZ", STD_RISCVFCVT_X>; + : SDNode<"RISCVISD::FCVT_X_RTZ", SDT_RISCVFCVT_X>; def riscv_fcvt_xu_rtz - : SDNode<"RISCVISD::FCVT_XU_RTZ", STD_RISCVFCVT_X>; + : SDNode<"RISCVISD::FCVT_XU_RTZ", SDT_RISCVFCVT_X>; + +def riscv_strict_fcvt_w_rtz_rv64 + : SDNode<"RISCVISD::STRICT_FCVT_W_RTZ_RV64", SDT_RISCVFCVT_W_RV64, + [SDNPHasChain]>; +def riscv_strict_fcvt_wu_rtz_rv64 + : SDNode<"RISCVISD::STRICT_FCVT_WU_RTZ_RV64", SDT_RISCVFCVT_W_RV64, + [SDNPHasChain]>; + +def riscv_any_fcvt_w_rtz_rv64 : PatFrags<(ops node:$src), + [(riscv_strict_fcvt_w_rtz_rv64 node:$src), + (riscv_fcvt_w_rtz_rv64 node:$src)]>; +def riscv_any_fcvt_wu_rtz_rv64 : PatFrags<(ops node:$src), + [(riscv_strict_fcvt_wu_rtz_rv64 node:$src), + (riscv_fcvt_wu_rtz_rv64 node:$src)]>; //===----------------------------------------------------------------------===// // Operand and SDNode transformation definitions. 
@@ -59,54 +73,65 @@ def frmarg : Operand<XLenVT> { // Instruction class templates //===----------------------------------------------------------------------===// -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class FPFMAS_rrr_frm<RISCVOpcode opcode, string opcodestr> - : RVInstR4Frm<0b00, opcode, (outs FPR32:$rd), - (ins FPR32:$rs1, FPR32:$rs2, FPR32:$rs3, frmarg:$funct3), - opcodestr, "$rd, $rs1, $rs2, $rs3, $funct3">; +let hasSideEffects = 0, mayLoad = 0, mayStore = 0, mayRaiseFPException = 1, + UseNamedOperandTable = 1, hasPostISelHook = 1 in +class FPFMA_rrr_frm<RISCVOpcode opcode, bits<2> funct2, string opcodestr, + RegisterClass rty> + : RVInstR4Frm<funct2, opcode, (outs rty:$rd), + (ins rty:$rs1, rty:$rs2, rty:$rs3, frmarg:$frm), + opcodestr, "$rd, $rs1, $rs2, $rs3, $frm">; -class FPFMASDynFrmAlias<FPFMAS_rrr_frm Inst, string OpcodeStr> +class FPFMADynFrmAlias<FPFMA_rrr_frm Inst, string OpcodeStr, + RegisterClass rty> : InstAlias<OpcodeStr#" $rd, $rs1, $rs2, $rs3", - (Inst FPR32:$rd, FPR32:$rs1, FPR32:$rs2, FPR32:$rs3, 0b111)>; + (Inst rty:$rd, rty:$rs1, rty:$rs2, rty:$rs3, 0b111)>; -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class FPALUS_rr<bits<7> funct7, bits<3> funct3, string opcodestr> - : RVInstR<funct7, funct3, OPC_OP_FP, (outs FPR32:$rd), - (ins FPR32:$rs1, FPR32:$rs2), opcodestr, "$rd, $rs1, $rs2">; +let hasSideEffects = 0, mayLoad = 0, mayStore = 0, mayRaiseFPException = 1 in +class FPALU_rr<bits<7> funct7, bits<3> funct3, string opcodestr, + RegisterClass rty> + : RVInstR<funct7, funct3, OPC_OP_FP, (outs rty:$rd), + (ins rty:$rs1, rty:$rs2), opcodestr, "$rd, $rs1, $rs2">; -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class FPALUS_rr_frm<bits<7> funct7, string opcodestr> - : RVInstRFrm<funct7, OPC_OP_FP, (outs FPR32:$rd), - (ins FPR32:$rs1, FPR32:$rs2, frmarg:$funct3), opcodestr, - "$rd, $rs1, $rs2, $funct3">; +let hasSideEffects = 0, mayLoad = 0, mayStore = 0, mayRaiseFPException = 1, + UseNamedOperandTable = 1, 
hasPostISelHook = 1 in +class FPALU_rr_frm<bits<7> funct7, string opcodestr, RegisterClass rty> + : RVInstRFrm<funct7, OPC_OP_FP, (outs rty:$rd), + (ins rty:$rs1, rty:$rs2, frmarg:$frm), opcodestr, + "$rd, $rs1, $rs2, $frm">; -class FPALUSDynFrmAlias<FPALUS_rr_frm Inst, string OpcodeStr> +class FPALUDynFrmAlias<FPALU_rr_frm Inst, string OpcodeStr, + RegisterClass rty> : InstAlias<OpcodeStr#" $rd, $rs1, $rs2", - (Inst FPR32:$rd, FPR32:$rs1, FPR32:$rs2, 0b111)>; + (Inst rty:$rd, rty:$rs1, rty:$rs2, 0b111)>; -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class FPUnaryOp_r<bits<7> funct7, bits<3> funct3, RegisterClass rdty, - RegisterClass rs1ty, string opcodestr> +let hasSideEffects = 0, mayLoad = 0, mayStore = 0, mayRaiseFPException = 1 in +class FPUnaryOp_r<bits<7> funct7, bits<5> rs2val, bits<3> funct3, + RegisterClass rdty, RegisterClass rs1ty, string opcodestr> : RVInstR<funct7, funct3, OPC_OP_FP, (outs rdty:$rd), (ins rs1ty:$rs1), - opcodestr, "$rd, $rs1">; + opcodestr, "$rd, $rs1"> { + let rs2 = rs2val; +} -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class FPUnaryOp_r_frm<bits<7> funct7, RegisterClass rdty, RegisterClass rs1ty, - string opcodestr> +let hasSideEffects = 0, mayLoad = 0, mayStore = 0, mayRaiseFPException = 1, + UseNamedOperandTable = 1, hasPostISelHook = 1 in +class FPUnaryOp_r_frm<bits<7> funct7, bits<5> rs2val, RegisterClass rdty, + RegisterClass rs1ty, string opcodestr> : RVInstRFrm<funct7, OPC_OP_FP, (outs rdty:$rd), - (ins rs1ty:$rs1, frmarg:$funct3), opcodestr, - "$rd, $rs1, $funct3">; + (ins rs1ty:$rs1, frmarg:$frm), opcodestr, + "$rd, $rs1, $frm"> { + let rs2 = rs2val; +} class FPUnaryOpDynFrmAlias<FPUnaryOp_r_frm Inst, string OpcodeStr, RegisterClass rdty, RegisterClass rs1ty> : InstAlias<OpcodeStr#" $rd, $rs1", (Inst rdty:$rd, rs1ty:$rs1, 0b111)>; -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class FPCmpS_rr<bits<3> funct3, string opcodestr> - : RVInstR<0b1010000, funct3, OPC_OP_FP, (outs GPR:$rd), - (ins 
FPR32:$rs1, FPR32:$rs2), opcodestr, "$rd, $rs1, $rs2">, - Sched<[WriteFCmp32, ReadFCmp32, ReadFCmp32]>; +let hasSideEffects = 0, mayLoad = 0, mayStore = 0, mayRaiseFPException = 1 in +class FPCmp_rr<bits<7> funct7, bits<3> funct3, string opcodestr, + RegisterClass rty> + : RVInstR<funct7, funct3, OPC_OP_FP, (outs GPR:$rd), + (ins rty:$rs1, rty:$rs2), opcodestr, "$rd, $rs1, $rs2">; //===----------------------------------------------------------------------===// // Instructions @@ -128,116 +153,98 @@ def FSW : RVInstS<0b010, OPC_STORE_FP, (outs), "fsw", "$rs2, ${imm12}(${rs1})">, Sched<[WriteFST32, ReadStoreData, ReadFMemBase]>; -def FMADD_S : FPFMAS_rrr_frm<OPC_MADD, "fmadd.s">, - Sched<[WriteFMA32, ReadFMA32, ReadFMA32, ReadFMA32]>; -def : FPFMASDynFrmAlias<FMADD_S, "fmadd.s">; -def FMSUB_S : FPFMAS_rrr_frm<OPC_MSUB, "fmsub.s">, - Sched<[WriteFMA32, ReadFMA32, ReadFMA32, ReadFMA32]>; -def : FPFMASDynFrmAlias<FMSUB_S, "fmsub.s">; -def FNMSUB_S : FPFMAS_rrr_frm<OPC_NMSUB, "fnmsub.s">, - Sched<[WriteFMA32, ReadFMA32, ReadFMA32, ReadFMA32]>; -def : FPFMASDynFrmAlias<FNMSUB_S, "fnmsub.s">; -def FNMADD_S : FPFMAS_rrr_frm<OPC_NMADD, "fnmadd.s">, - Sched<[WriteFMA32, ReadFMA32, ReadFMA32, ReadFMA32]>; -def : FPFMASDynFrmAlias<FNMADD_S, "fnmadd.s">; +let SchedRW = [WriteFMA32, ReadFMA32, ReadFMA32, ReadFMA32] in { +def FMADD_S : FPFMA_rrr_frm<OPC_MADD, 0b00, "fmadd.s", FPR32>; +def FMSUB_S : FPFMA_rrr_frm<OPC_MSUB, 0b00, "fmsub.s", FPR32>; +def FNMSUB_S : FPFMA_rrr_frm<OPC_NMSUB, 0b00, "fnmsub.s", FPR32>; +def FNMADD_S : FPFMA_rrr_frm<OPC_NMADD, 0b00, "fnmadd.s", FPR32>; +} + +def : FPFMADynFrmAlias<FMADD_S, "fmadd.s", FPR32>; +def : FPFMADynFrmAlias<FMSUB_S, "fmsub.s", FPR32>; +def : FPFMADynFrmAlias<FNMSUB_S, "fnmsub.s", FPR32>; +def : FPFMADynFrmAlias<FNMADD_S, "fnmadd.s", FPR32>; -def FADD_S : FPALUS_rr_frm<0b0000000, "fadd.s">, +def FADD_S : FPALU_rr_frm<0b0000000, "fadd.s", FPR32>, Sched<[WriteFALU32, ReadFALU32, ReadFALU32]>; -def : FPALUSDynFrmAlias<FADD_S, 
"fadd.s">; -def FSUB_S : FPALUS_rr_frm<0b0000100, "fsub.s">, +def FSUB_S : FPALU_rr_frm<0b0000100, "fsub.s", FPR32>, Sched<[WriteFALU32, ReadFALU32, ReadFALU32]>; -def : FPALUSDynFrmAlias<FSUB_S, "fsub.s">; -def FMUL_S : FPALUS_rr_frm<0b0001000, "fmul.s">, +def FMUL_S : FPALU_rr_frm<0b0001000, "fmul.s", FPR32>, Sched<[WriteFMul32, ReadFMul32, ReadFMul32]>; -def : FPALUSDynFrmAlias<FMUL_S, "fmul.s">; -def FDIV_S : FPALUS_rr_frm<0b0001100, "fdiv.s">, +def FDIV_S : FPALU_rr_frm<0b0001100, "fdiv.s", FPR32>, Sched<[WriteFDiv32, ReadFDiv32, ReadFDiv32]>; -def : FPALUSDynFrmAlias<FDIV_S, "fdiv.s">; -def FSQRT_S : FPUnaryOp_r_frm<0b0101100, FPR32, FPR32, "fsqrt.s">, - Sched<[WriteFSqrt32, ReadFSqrt32]> { - let rs2 = 0b00000; -} +def : FPALUDynFrmAlias<FADD_S, "fadd.s", FPR32>; +def : FPALUDynFrmAlias<FSUB_S, "fsub.s", FPR32>; +def : FPALUDynFrmAlias<FMUL_S, "fmul.s", FPR32>; +def : FPALUDynFrmAlias<FDIV_S, "fdiv.s", FPR32>; + +def FSQRT_S : FPUnaryOp_r_frm<0b0101100, 0b00000, FPR32, FPR32, "fsqrt.s">, + Sched<[WriteFSqrt32, ReadFSqrt32]>; def : FPUnaryOpDynFrmAlias<FSQRT_S, "fsqrt.s", FPR32, FPR32>; -def FSGNJ_S : FPALUS_rr<0b0010000, 0b000, "fsgnj.s">, - Sched<[WriteFSGNJ32, ReadFSGNJ32, ReadFSGNJ32]>; -def FSGNJN_S : FPALUS_rr<0b0010000, 0b001, "fsgnjn.s">, - Sched<[WriteFSGNJ32, ReadFSGNJ32, ReadFSGNJ32]>; -def FSGNJX_S : FPALUS_rr<0b0010000, 0b010, "fsgnjx.s">, - Sched<[WriteFSGNJ32, ReadFSGNJ32, ReadFSGNJ32]>; -def FMIN_S : FPALUS_rr<0b0010100, 0b000, "fmin.s">, - Sched<[WriteFMinMax32, ReadFMinMax32, ReadFMinMax32]>; -def FMAX_S : FPALUS_rr<0b0010100, 0b001, "fmax.s">, - Sched<[WriteFMinMax32, ReadFMinMax32, ReadFMinMax32]>; +let SchedRW = [WriteFSGNJ32, ReadFSGNJ32, ReadFSGNJ32], + mayRaiseFPException = 0 in { +def FSGNJ_S : FPALU_rr<0b0010000, 0b000, "fsgnj.s", FPR32>; +def FSGNJN_S : FPALU_rr<0b0010000, 0b001, "fsgnjn.s", FPR32>; +def FSGNJX_S : FPALU_rr<0b0010000, 0b010, "fsgnjx.s", FPR32>; +} -def FCVT_W_S : FPUnaryOp_r_frm<0b1100000, GPR, FPR32, "fcvt.w.s">, - 
Sched<[WriteFCvtF32ToI32, ReadFCvtF32ToI32]> { - let rs2 = 0b00000; +let SchedRW = [WriteFMinMax32, ReadFMinMax32, ReadFMinMax32] in { +def FMIN_S : FPALU_rr<0b0010100, 0b000, "fmin.s", FPR32>; +def FMAX_S : FPALU_rr<0b0010100, 0b001, "fmax.s", FPR32>; } + +def FCVT_W_S : FPUnaryOp_r_frm<0b1100000, 0b00000, GPR, FPR32, "fcvt.w.s">, + Sched<[WriteFCvtF32ToI32, ReadFCvtF32ToI32]>; def : FPUnaryOpDynFrmAlias<FCVT_W_S, "fcvt.w.s", GPR, FPR32>; -def FCVT_WU_S : FPUnaryOp_r_frm<0b1100000, GPR, FPR32, "fcvt.wu.s">, - Sched<[WriteFCvtF32ToI32, ReadFCvtF32ToI32]> { - let rs2 = 0b00001; -} +def FCVT_WU_S : FPUnaryOp_r_frm<0b1100000, 0b00001, GPR, FPR32, "fcvt.wu.s">, + Sched<[WriteFCvtF32ToI32, ReadFCvtF32ToI32]>; def : FPUnaryOpDynFrmAlias<FCVT_WU_S, "fcvt.wu.s", GPR, FPR32>; -def FMV_X_W : FPUnaryOp_r<0b1110000, 0b000, GPR, FPR32, "fmv.x.w">, - Sched<[WriteFMovF32ToI32, ReadFMovF32ToI32]> { - let rs2 = 0b00000; -} - -def FEQ_S : FPCmpS_rr<0b010, "feq.s">; -def FLT_S : FPCmpS_rr<0b001, "flt.s">; -def FLE_S : FPCmpS_rr<0b000, "fle.s">; +let mayRaiseFPException = 0 in +def FMV_X_W : FPUnaryOp_r<0b1110000, 0b00000, 0b000, GPR, FPR32, "fmv.x.w">, + Sched<[WriteFMovF32ToI32, ReadFMovF32ToI32]>; -def FCLASS_S : FPUnaryOp_r<0b1110000, 0b001, GPR, FPR32, "fclass.s">, - Sched<[WriteFClass32, ReadFClass32]> { - let rs2 = 0b00000; +let SchedRW = [WriteFCmp32, ReadFCmp32, ReadFCmp32] in { +def FEQ_S : FPCmp_rr<0b1010000, 0b010, "feq.s", FPR32>; +def FLT_S : FPCmp_rr<0b1010000, 0b001, "flt.s", FPR32>; +def FLE_S : FPCmp_rr<0b1010000, 0b000, "fle.s", FPR32>; } -def FCVT_S_W : FPUnaryOp_r_frm<0b1101000, FPR32, GPR, "fcvt.s.w">, - Sched<[WriteFCvtI32ToF32, ReadFCvtI32ToF32]> { - let rs2 = 0b00000; -} +let mayRaiseFPException = 0 in +def FCLASS_S : FPUnaryOp_r<0b1110000, 0b00000, 0b001, GPR, FPR32, "fclass.s">, + Sched<[WriteFClass32, ReadFClass32]>; + +def FCVT_S_W : FPUnaryOp_r_frm<0b1101000, 0b00000, FPR32, GPR, "fcvt.s.w">, + Sched<[WriteFCvtI32ToF32, ReadFCvtI32ToF32]>; def : 
FPUnaryOpDynFrmAlias<FCVT_S_W, "fcvt.s.w", FPR32, GPR>; -def FCVT_S_WU : FPUnaryOp_r_frm<0b1101000, FPR32, GPR, "fcvt.s.wu">, - Sched<[WriteFCvtI32ToF32, ReadFCvtI32ToF32]> { - let rs2 = 0b00001; -} +def FCVT_S_WU : FPUnaryOp_r_frm<0b1101000, 0b00001, FPR32, GPR, "fcvt.s.wu">, + Sched<[WriteFCvtI32ToF32, ReadFCvtI32ToF32]>; def : FPUnaryOpDynFrmAlias<FCVT_S_WU, "fcvt.s.wu", FPR32, GPR>; -def FMV_W_X : FPUnaryOp_r<0b1111000, 0b000, FPR32, GPR, "fmv.w.x">, - Sched<[WriteFMovI32ToF32, ReadFMovI32ToF32]> { - let rs2 = 0b00000; -} +let mayRaiseFPException = 0 in +def FMV_W_X : FPUnaryOp_r<0b1111000, 0b00000, 0b000, FPR32, GPR, "fmv.w.x">, + Sched<[WriteFMovI32ToF32, ReadFMovI32ToF32]>; } // Predicates = [HasStdExtF] let Predicates = [HasStdExtF, IsRV64] in { -def FCVT_L_S : FPUnaryOp_r_frm<0b1100000, GPR, FPR32, "fcvt.l.s">, - Sched<[WriteFCvtF32ToI64, ReadFCvtF32ToI64]> { - let rs2 = 0b00010; -} +def FCVT_L_S : FPUnaryOp_r_frm<0b1100000, 0b00010, GPR, FPR32, "fcvt.l.s">, + Sched<[WriteFCvtF32ToI64, ReadFCvtF32ToI64]>; def : FPUnaryOpDynFrmAlias<FCVT_L_S, "fcvt.l.s", GPR, FPR32>; -def FCVT_LU_S : FPUnaryOp_r_frm<0b1100000, GPR, FPR32, "fcvt.lu.s">, - Sched<[WriteFCvtF32ToI64, ReadFCvtF32ToI64]> { - let rs2 = 0b00011; -} +def FCVT_LU_S : FPUnaryOp_r_frm<0b1100000, 0b00011, GPR, FPR32, "fcvt.lu.s">, + Sched<[WriteFCvtF32ToI64, ReadFCvtF32ToI64]>; def : FPUnaryOpDynFrmAlias<FCVT_LU_S, "fcvt.lu.s", GPR, FPR32>; -def FCVT_S_L : FPUnaryOp_r_frm<0b1101000, FPR32, GPR, "fcvt.s.l">, - Sched<[WriteFCvtI64ToF32, ReadFCvtI64ToF32]> { - let rs2 = 0b00010; -} +def FCVT_S_L : FPUnaryOp_r_frm<0b1101000, 0b00010, FPR32, GPR, "fcvt.s.l">, + Sched<[WriteFCvtI64ToF32, ReadFCvtI64ToF32]>; def : FPUnaryOpDynFrmAlias<FCVT_S_L, "fcvt.s.l", FPR32, GPR>; -def FCVT_S_LU : FPUnaryOp_r_frm<0b1101000, FPR32, GPR, "fcvt.s.lu">, - Sched<[WriteFCvtI64ToF32, ReadFCvtI64ToF32]> { - let rs2 = 0b00011; -} +def FCVT_S_LU : FPUnaryOp_r_frm<0b1101000, 0b00011, FPR32, GPR, "fcvt.s.lu">, + 
Sched<[WriteFCvtI64ToF32, ReadFCvtI64ToF32]>; def : FPUnaryOpDynFrmAlias<FCVT_S_LU, "fcvt.s.lu", FPR32, GPR>; } // Predicates = [HasStdExtF, IsRV64] @@ -320,12 +327,12 @@ def : Pat<(f32 (fpimm0)), (FMV_W_X X0)>; /// Float arithmetic operations -def : PatFpr32Fpr32DynFrm<fadd, FADD_S>; -def : PatFpr32Fpr32DynFrm<fsub, FSUB_S>; -def : PatFpr32Fpr32DynFrm<fmul, FMUL_S>; -def : PatFpr32Fpr32DynFrm<fdiv, FDIV_S>; +def : PatFpr32Fpr32DynFrm<any_fadd, FADD_S>; +def : PatFpr32Fpr32DynFrm<any_fsub, FSUB_S>; +def : PatFpr32Fpr32DynFrm<any_fmul, FMUL_S>; +def : PatFpr32Fpr32DynFrm<any_fdiv, FDIV_S>; -def : Pat<(fsqrt FPR32:$rs1), (FSQRT_S FPR32:$rs1, 0b111)>; +def : Pat<(any_fsqrt FPR32:$rs1), (FSQRT_S FPR32:$rs1, 0b111)>; def : Pat<(fneg FPR32:$rs1), (FSGNJN_S $rs1, $rs1)>; def : Pat<(fabs FPR32:$rs1), (FSGNJX_S $rs1, $rs1)>; @@ -334,19 +341,19 @@ def : PatFpr32Fpr32<fcopysign, FSGNJ_S>; def : Pat<(fcopysign FPR32:$rs1, (fneg FPR32:$rs2)), (FSGNJN_S $rs1, $rs2)>; // fmadd: rs1 * rs2 + rs3 -def : Pat<(fma FPR32:$rs1, FPR32:$rs2, FPR32:$rs3), +def : Pat<(any_fma FPR32:$rs1, FPR32:$rs2, FPR32:$rs3), (FMADD_S $rs1, $rs2, $rs3, 0b111)>; // fmsub: rs1 * rs2 - rs3 -def : Pat<(fma FPR32:$rs1, FPR32:$rs2, (fneg FPR32:$rs3)), +def : Pat<(any_fma FPR32:$rs1, FPR32:$rs2, (fneg FPR32:$rs3)), (FMSUB_S FPR32:$rs1, FPR32:$rs2, FPR32:$rs3, 0b111)>; // fnmsub: -rs1 * rs2 + rs3 -def : Pat<(fma (fneg FPR32:$rs1), FPR32:$rs2, FPR32:$rs3), +def : Pat<(any_fma (fneg FPR32:$rs1), FPR32:$rs2, FPR32:$rs3), (FNMSUB_S FPR32:$rs1, FPR32:$rs2, FPR32:$rs3, 0b111)>; // fnmadd: -rs1 * rs2 - rs3 -def : Pat<(fma (fneg FPR32:$rs1), FPR32:$rs2, (fneg FPR32:$rs3)), +def : Pat<(any_fma (fneg FPR32:$rs1), FPR32:$rs2, (fneg FPR32:$rs3)), (FNMADD_S FPR32:$rs1, FPR32:$rs2, FPR32:$rs3, 0b111)>; // The ratified 20191213 ISA spec defines fmin and fmax in a way that matches @@ -382,8 +389,8 @@ def : Pat<(bitconvert (i32 GPR:$rs1)), (FMV_W_X GPR:$rs1)>; def : Pat<(i32 (bitconvert FPR32:$rs1)), (FMV_X_W FPR32:$rs1)>; // 
float->[u]int. Round-to-zero must be used. -def : Pat<(i32 (fp_to_sint FPR32:$rs1)), (FCVT_W_S $rs1, 0b001)>; -def : Pat<(i32 (fp_to_uint FPR32:$rs1)), (FCVT_WU_S $rs1, 0b001)>; +def : Pat<(i32 (any_fp_to_sint FPR32:$rs1)), (FCVT_W_S $rs1, 0b001)>; +def : Pat<(i32 (any_fp_to_uint FPR32:$rs1)), (FCVT_WU_S $rs1, 0b001)>; // Saturating float->[u]int32. def : Pat<(i32 (riscv_fcvt_x_rtz FPR32:$rs1)), (FCVT_W_S $rs1, 0b001)>; @@ -396,8 +403,8 @@ def : Pat<(i32 (lrint FPR32:$rs1)), (FCVT_W_S $rs1, 0b111)>; def : Pat<(i32 (lround FPR32:$rs1)), (FCVT_W_S $rs1, 0b100)>; // [u]int->float. Match GCC and default to using dynamic rounding mode. -def : Pat<(sint_to_fp (i32 GPR:$rs1)), (FCVT_S_W $rs1, 0b111)>; -def : Pat<(uint_to_fp (i32 GPR:$rs1)), (FCVT_S_WU $rs1, 0b111)>; +def : Pat<(any_sint_to_fp (i32 GPR:$rs1)), (FCVT_S_W $rs1, 0b111)>; +def : Pat<(any_uint_to_fp (i32 GPR:$rs1)), (FCVT_S_WU $rs1, 0b111)>; } // Predicates = [HasStdExtF, IsRV32] let Predicates = [HasStdExtF, IsRV64] in { @@ -410,12 +417,12 @@ def : Pat<(sext_inreg (riscv_fmv_x_anyextw_rv64 FPR32:$src), i32), // Use target specific isd nodes to help us remember the result is sign // extended. Matching sext_inreg+fptoui/fptosi may cause the conversion to be // duplicated if it has another user that didn't need the sign_extend. -def : Pat<(riscv_fcvt_w_rtz_rv64 FPR32:$rs1), (FCVT_W_S $rs1, 0b001)>; -def : Pat<(riscv_fcvt_wu_rtz_rv64 FPR32:$rs1), (FCVT_WU_S $rs1, 0b001)>; +def : Pat<(riscv_any_fcvt_w_rtz_rv64 FPR32:$rs1), (FCVT_W_S $rs1, 0b001)>; +def : Pat<(riscv_any_fcvt_wu_rtz_rv64 FPR32:$rs1), (FCVT_WU_S $rs1, 0b001)>; // float->[u]int64. Round-to-zero must be used. -def : Pat<(i64 (fp_to_sint FPR32:$rs1)), (FCVT_L_S $rs1, 0b001)>; -def : Pat<(i64 (fp_to_uint FPR32:$rs1)), (FCVT_LU_S $rs1, 0b001)>; +def : Pat<(i64 (any_fp_to_sint FPR32:$rs1)), (FCVT_L_S $rs1, 0b001)>; +def : Pat<(i64 (any_fp_to_uint FPR32:$rs1)), (FCVT_LU_S $rs1, 0b001)>; // Saturating float->[u]int64. 
def : Pat<(i64 (riscv_fcvt_x_rtz FPR32:$rs1)), (FCVT_L_S $rs1, 0b001)>; @@ -430,8 +437,8 @@ def : Pat<(i64 (lround FPR32:$rs1)), (FCVT_L_S $rs1, 0b100)>; def : Pat<(i64 (llround FPR32:$rs1)), (FCVT_L_S $rs1, 0b100)>; // [u]int->fp. Match GCC and default to using dynamic rounding mode. -def : Pat<(sint_to_fp (i64 (sexti32 (i64 GPR:$rs1)))), (FCVT_S_W $rs1, 0b111)>; -def : Pat<(uint_to_fp (i64 (zexti32 (i64 GPR:$rs1)))), (FCVT_S_WU $rs1, 0b111)>; -def : Pat<(sint_to_fp (i64 GPR:$rs1)), (FCVT_S_L $rs1, 0b111)>; -def : Pat<(uint_to_fp (i64 GPR:$rs1)), (FCVT_S_LU $rs1, 0b111)>; +def : Pat<(any_sint_to_fp (i64 (sexti32 (i64 GPR:$rs1)))), (FCVT_S_W $rs1, 0b111)>; +def : Pat<(any_uint_to_fp (i64 (zexti32 (i64 GPR:$rs1)))), (FCVT_S_WU $rs1, 0b111)>; +def : Pat<(any_sint_to_fp (i64 GPR:$rs1)), (FCVT_S_L $rs1, 0b111)>; +def : Pat<(any_uint_to_fp (i64 GPR:$rs1)), (FCVT_S_LU $rs1, 0b111)>; } // Predicates = [HasStdExtF, IsRV64] diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoM.td b/llvm/lib/Target/RISCV/RISCVInstrInfoM.td index a037dbf585ce..b62e23d3b0fa 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoM.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoM.td @@ -96,14 +96,6 @@ def : Pat<(srem (sexti32 (i64 GPR:$rs1)), (sexti32 (i64 GPR:$rs2))), (REMW GPR:$rs1, GPR:$rs2)>; } // Predicates = [HasStdExtM, IsRV64] -// Pattern to detect constants with no more than 32 active bits that can't -// be materialized with lui+addiw. -def uimm32_not_simm32 : PatLeaf<(XLenVT GPR:$a), [{ - auto *C = dyn_cast<ConstantSDNode>(N); - return C && C->hasOneUse() && isUInt<32>(C->getZExtValue()) && - !isInt<32>(C->getSExtValue()); -}]>; - let Predicates = [HasStdExtM, IsRV64, NotHasStdExtZba] in { // Special case for calculating the full 64-bit product of a 32x32 unsigned // multiply where the inputs aren't known to be zero extended. We can shift the @@ -111,9 +103,4 @@ let Predicates = [HasStdExtM, IsRV64, NotHasStdExtZba] in { // zeroing the upper 32 bits. 
def : Pat<(i64 (mul (and GPR:$rs1, 0xffffffff), (and GPR:$rs2, 0xffffffff))), (MULHU (SLLI GPR:$rs1, 32), (SLLI GPR:$rs2, 32))>; -// The RHS could also be a constant that is hard to materialize. By shifting -// left we can allow constant materialization to use LUI+ADDIW via -// hasAllWUsers. -def : Pat<(i64 (mul (and GPR:$rs1, 0xffffffff), uimm32_not_simm32:$rs2)), - (MULHU (SLLI GPR:$rs1, 32), (SLLI GPR:$rs2, 32))>; } // Predicates = [HasStdExtM, IsRV64, NotHasStdExtZba] diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td index 3d5f9bc54731..173ae43a08d6 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td @@ -338,29 +338,6 @@ class VALUVs2<bits<6> funct6, bits<5> vs1, RISCVVFormat opv, string opcodestr> opcodestr, "$vd, $vs2$vm">; } // hasSideEffects = 0, mayLoad = 0, mayStore = 0 -let hasSideEffects = 0, mayLoad = 1, mayStore = 1 in { -// vamo vd, (rs1), vs2, vd, vm -class VAMOWd<RISCVAMOOP amoop, RISCVWidth width, string opcodestr> - : RVInstVAMO<amoop, width.Value{2-0}, (outs VR:$vd_wd), - (ins GPR:$rs1, VR:$vs2, VR:$vd, VMaskOp:$vm), - opcodestr, "$vd_wd, (${rs1}), $vs2, $vd$vm"> { - let Constraints = "$vd_wd = $vd"; - let wd = 1; - bits<5> vd; - let Inst{11-7} = vd; -} - -// vamo x0, (rs1), vs2, vs3, vm -class VAMONoWd<RISCVAMOOP amoop, RISCVWidth width, string opcodestr> - : RVInstVAMO<amoop, width.Value{2-0}, (outs), - (ins GPR:$rs1, VR:$vs2, VR:$vs3, VMaskOp:$vm), - opcodestr, "x0, (${rs1}), $vs2, $vs3$vm"> { - bits<5> vs3; - let Inst{11-7} = vs3; -} - -} // hasSideEffects = 0, mayLoad = 1, mayStore = 1 - //===----------------------------------------------------------------------===// // Combination of instruction classes. // Use these multiclasses to define instructions more easily. 
@@ -779,11 +756,6 @@ multiclass VCPR_MV_Mask<string opcodestr, bits<6> funct6, string vm = "v"> { Sched<[WriteVCompressV, ReadVCompressV, ReadVCompressV]>; } -multiclass VAMO<RISCVAMOOP amoop, RISCVWidth width, string opcodestr> { - def _WD : VAMOWd<amoop, width, opcodestr>; - def _UNWD : VAMONoWd<amoop, width, opcodestr>; -} - multiclass VWholeLoadN<bits<3> nf, string opcodestr, RegisterClass VRC> { foreach l = [8, 16, 32, 64] in { defvar w = !cast<RISCVWidth>("LSWidth" # l); @@ -822,7 +794,7 @@ foreach eew = [8, 16, 32, 64] in { // Vector Strided Instructions def VLSE#eew#_V : VStridedLoad<w, "vlse"#eew#".v">, VLSSched<eew>; def VSSE#eew#_V : VStridedStore<w, "vsse"#eew#".v">, VSSSched<eew>; - + // Vector Indexed Instructions def VLUXEI#eew#_V : VIndexedLoad<MOPLDIndexedUnord, w, "vluxei"#eew#".v">, VLXSched<eew, "U">; @@ -1416,13 +1388,20 @@ defm VCOMPRESS_V : VCPR_MV_Mask<"vcompress", 0b010111>; let hasSideEffects = 0, mayLoad = 0, mayStore = 0, RVVConstraint = NoConstraint in { -foreach n = [1, 2, 4, 8] in { - def VMV#n#R_V : RVInstV<0b100111, !add(n, -1), OPIVI, (outs VR:$vd), - (ins VR:$vs2), "vmv" # n # "r.v", "$vd, $vs2">, - VMVRSched<n> { +def VMV1R_V : RVInstV<0b100111, 0, OPIVI, (outs VR:$vd), (ins VR:$vs2), + "vmv1r.v", "$vd, $vs2">, VMVRSched<1> { let Uses = []; let vm = 1; } +// A future extension may relax the vector register alignment restrictions. 
+foreach n = [2, 4, 8] in { + defvar vrc = !cast<VReg>("VRM"#n); + def VMV#n#R_V : RVInstV<0b100111, !add(n, -1), OPIVI, (outs vrc:$vd), + (ins vrc:$vs2), "vmv" # n # "r.v", "$vd, $vs2">, + VMVRSched<n> { + let Uses = []; + let vm = 1; + } } } // hasSideEffects = 0, mayLoad = 0, mayStore = 0 } // Predicates = [HasStdExtV] @@ -1462,31 +1441,4 @@ let Predicates = [HasStdExtZvlsseg] in { } } // Predicates = [HasStdExtZvlsseg] -let Predicates = [HasStdExtZvamo, HasStdExtA] in { - foreach eew = [8, 16, 32] in { - defvar w = !cast<RISCVWidth>("LSWidth"#eew); - defm VAMOSWAPEI#eew : VAMO<AMOOPVamoSwap, w, "vamoswapei"#eew#".v">; - defm VAMOADDEI#eew : VAMO<AMOOPVamoAdd, w, "vamoaddei"#eew#".v">; - defm VAMOXOREI#eew : VAMO<AMOOPVamoXor, w, "vamoxorei"#eew#".v">; - defm VAMOANDEI#eew : VAMO<AMOOPVamoAnd, w, "vamoandei"#eew#".v">; - defm VAMOOREI#eew : VAMO<AMOOPVamoOr, w, "vamoorei"#eew#".v">; - defm VAMOMINEI#eew : VAMO<AMOOPVamoMin, w, "vamominei"#eew#".v">; - defm VAMOMAXEI#eew : VAMO<AMOOPVamoMax, w, "vamomaxei"#eew#".v">; - defm VAMOMINUEI#eew : VAMO<AMOOPVamoMinu, w, "vamominuei"#eew#".v">; - defm VAMOMAXUEI#eew : VAMO<AMOOPVamoMaxu, w, "vamomaxuei"#eew#".v">; - } -} // Predicates = [HasStdExtZvamo, HasStdExtA] - -let Predicates = [HasStdExtZvamo, HasStdExtA, IsRV64] in { - defm VAMOSWAPEI64 : VAMO<AMOOPVamoSwap, LSWidth64, "vamoswapei64.v">; - defm VAMOADDEI64 : VAMO<AMOOPVamoAdd, LSWidth64, "vamoaddei64.v">; - defm VAMOXOREI64 : VAMO<AMOOPVamoXor, LSWidth64, "vamoxorei64.v">; - defm VAMOANDEI64 : VAMO<AMOOPVamoAnd, LSWidth64, "vamoandei64.v">; - defm VAMOOREI64 : VAMO<AMOOPVamoOr, LSWidth64, "vamoorei64.v">; - defm VAMOMINEI64 : VAMO<AMOOPVamoMin, LSWidth64, "vamominei64.v">; - defm VAMOMAXEI64 : VAMO<AMOOPVamoMax, LSWidth64, "vamomaxei64.v">; - defm VAMOMINUEI64 : VAMO<AMOOPVamoMinu, LSWidth64, "vamominuei64.v">; - defm VAMOMAXUEI64 : VAMO<AMOOPVamoMaxu, LSWidth64, "vamomaxuei64.v">; -} // Predicates = [HasStdExtZvamo, HasStdExtA, IsRV64] - include 
"RISCVInstrInfoVPseudos.td" diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index a82e333e6bab..073fa605e0fb 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -1124,68 +1124,6 @@ class VPseudoTernaryNoMaskWithPolicy<VReg RetClass, let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst); } -class VPseudoAMOWDNoMask<VReg RetClass, - VReg Op1Class> : - Pseudo<(outs GetVRegNoV0<RetClass>.R:$vd_wd), - (ins GPR:$rs1, - Op1Class:$vs2, - GetVRegNoV0<RetClass>.R:$vd, - AVL:$vl, ixlenimm:$sew), []>, - RISCVVPseudo { - let mayLoad = 1; - let mayStore = 1; - let hasSideEffects = 1; - let Constraints = "$vd_wd = $vd"; - let HasVLOp = 1; - let HasSEWOp = 1; - let HasDummyMask = 1; - let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst); -} - -class VPseudoAMOWDMask<VReg RetClass, - VReg Op1Class> : - Pseudo<(outs GetVRegNoV0<RetClass>.R:$vd_wd), - (ins GPR:$rs1, - Op1Class:$vs2, - GetVRegNoV0<RetClass>.R:$vd, - VMaskOp:$vm, AVL:$vl, ixlenimm:$sew), []>, - RISCVVPseudo { - let mayLoad = 1; - let mayStore = 1; - let hasSideEffects = 1; - let Constraints = "$vd_wd = $vd"; - let HasVLOp = 1; - let HasSEWOp = 1; - let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst); -} - -multiclass VPseudoAMOEI<int eew> { - // Standard scalar AMO supports 32, 64, and 128 Mem data bits, - // and in the base vector "V" extension, only SEW up to ELEN = max(XLEN, FLEN) - // are required to be supported. - // therefore only [32, 64] is allowed here. 
- foreach sew = [32, 64] in { - foreach lmul = MxSet<sew>.m in { - defvar octuple_lmul = lmul.octuple; - // Calculate emul = eew * lmul / sew - defvar octuple_emul = !srl(!mul(eew, octuple_lmul), log2<sew>.val); - if !and(!ge(octuple_emul, 1), !le(octuple_emul, 64)) then { - defvar emulMX = octuple_to_str<octuple_emul>.ret; - defvar emul= !cast<LMULInfo>("V_" # emulMX); - let VLMul = lmul.value in { - def "_WD_" # lmul.MX # "_" # emulMX : VPseudoAMOWDNoMask<lmul.vrclass, emul.vrclass>; - def "_WD_" # lmul.MX # "_" # emulMX # "_MASK" : VPseudoAMOWDMask<lmul.vrclass, emul.vrclass>; - } - } - } - } -} - -multiclass VPseudoAMO { - foreach eew = EEWList in - defm "EI" # eew : VPseudoAMOEI<eew>; -} - class VPseudoUSSegLoadNoMask<VReg RetClass, int EEW, bits<4> NF, bit isFF>: Pseudo<(outs RetClass:$rd), (ins GPR:$rs1, AVL:$vl, ixlenimm:$sew),[]>, @@ -1376,17 +1314,35 @@ class VPseudoISegStoreMask<VReg ValClass, VReg IdxClass, int EEW, bits<3> LMUL, let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst); } -multiclass VPseudoUSLoad<bit isFF> { +multiclass VPseudoUSLoad { foreach eew = EEWList in { foreach lmul = MxSet<eew>.m in { defvar LInfo = lmul.MX; defvar vreg = lmul.vrclass; - defvar FFStr = !if(isFF, "FF", ""); let VLMul = lmul.value in { - def "E" # eew # FFStr # "_V_" # LInfo : - VPseudoUSLoadNoMask<vreg, eew, isFF>; - def "E" # eew # FFStr # "_V_" # LInfo # "_MASK" : - VPseudoUSLoadMask<vreg, eew, isFF>; + def "E" # eew # "_V_" # LInfo : + VPseudoUSLoadNoMask<vreg, eew, false>, + VLESched<eew>; + def "E" # eew # "_V_" # LInfo # "_MASK" : + VPseudoUSLoadMask<vreg, eew, false>, + VLESched<eew>; + } + } + } +} + +multiclass VPseudoFFLoad { + foreach eew = EEWList in { + foreach lmul = MxSet<eew>.m in { + defvar LInfo = lmul.MX; + defvar vreg = lmul.vrclass; + let VLMul = lmul.value in { + def "E" # eew # "FF_V_" # LInfo : + VPseudoUSLoadNoMask<vreg, eew, true>, + VLFSched<eew>; + def "E" # eew # "FF_V_" # LInfo # "_MASK" : + VPseudoUSLoadMask<vreg, eew, 
true>, + VLFSched<eew>; } } } @@ -1406,8 +1362,10 @@ multiclass VPseudoSLoad { defvar LInfo = lmul.MX; defvar vreg = lmul.vrclass; let VLMul = lmul.value in { - def "E" # eew # "_V_" # LInfo : VPseudoSLoadNoMask<vreg, eew>; - def "E" # eew # "_V_" # LInfo # "_MASK" : VPseudoSLoadMask<vreg, eew>; + def "E" # eew # "_V_" # LInfo : VPseudoSLoadNoMask<vreg, eew>, + VLSSched<eew>; + def "E" # eew # "_V_" # LInfo # "_MASK" : VPseudoSLoadMask<vreg, eew>, + VLSSched<eew>; } } } @@ -1427,11 +1385,14 @@ multiclass VPseudoILoad<bit Ordered> { defvar Vreg = lmul.vrclass; defvar IdxVreg = idx_lmul.vrclass; defvar HasConstraint = !ne(sew, eew); + defvar Order = !if(Ordered, "O", "U"); let VLMul = lmul.value in { def "EI" # eew # "_V_" # IdxLInfo # "_" # LInfo : - VPseudoILoadNoMask<Vreg, IdxVreg, eew, idx_lmul.value, Ordered, HasConstraint>; + VPseudoILoadNoMask<Vreg, IdxVreg, eew, idx_lmul.value, Ordered, HasConstraint>, + VLXSched<eew, Order>; def "EI" # eew # "_V_" # IdxLInfo # "_" # LInfo # "_MASK" : - VPseudoILoadMask<Vreg, IdxVreg, eew, idx_lmul.value, Ordered, HasConstraint>; + VPseudoILoadMask<Vreg, IdxVreg, eew, idx_lmul.value, Ordered, HasConstraint>, + VLXSched<eew, Order>; } } } @@ -1445,8 +1406,10 @@ multiclass VPseudoUSStore { defvar LInfo = lmul.MX; defvar vreg = lmul.vrclass; let VLMul = lmul.value in { - def "E" # eew # "_V_" # LInfo : VPseudoUSStoreNoMask<vreg, eew>; - def "E" # eew # "_V_" # LInfo # "_MASK" : VPseudoUSStoreMask<vreg, eew>; + def "E" # eew # "_V_" # LInfo : VPseudoUSStoreNoMask<vreg, eew>, + VSESched<eew>; + def "E" # eew # "_V_" # LInfo # "_MASK" : VPseudoUSStoreMask<vreg, eew>, + VSESched<eew>; } } } @@ -1466,8 +1429,10 @@ multiclass VPseudoSStore { defvar LInfo = lmul.MX; defvar vreg = lmul.vrclass; let VLMul = lmul.value in { - def "E" # eew # "_V_" # LInfo : VPseudoSStoreNoMask<vreg, eew>; - def "E" # eew # "_V_" # LInfo # "_MASK" : VPseudoSStoreMask<vreg, eew>; + def "E" # eew # "_V_" # LInfo : VPseudoSStoreNoMask<vreg, eew>, + 
VSSSched<eew>; + def "E" # eew # "_V_" # LInfo # "_MASK" : VPseudoSStoreMask<vreg, eew>, + VSSSched<eew>; } } } @@ -1486,11 +1451,14 @@ multiclass VPseudoIStore<bit Ordered> { defvar idx_lmul = !cast<LMULInfo>("V_" # IdxLInfo); defvar Vreg = lmul.vrclass; defvar IdxVreg = idx_lmul.vrclass; + defvar Order = !if(Ordered, "O", "U"); let VLMul = lmul.value in { def "EI" # eew # "_V_" # IdxLInfo # "_" # LInfo : - VPseudoIStoreNoMask<Vreg, IdxVreg, eew, idx_lmul.value, Ordered>; + VPseudoIStoreNoMask<Vreg, IdxVreg, eew, idx_lmul.value, Ordered>, + VSXSched<eew, Order>; def "EI" # eew # "_V_" # IdxLInfo # "_" # LInfo # "_MASK" : - VPseudoIStoreMask<Vreg, IdxVreg, eew, idx_lmul.value, Ordered>; + VPseudoIStoreMask<Vreg, IdxVreg, eew, idx_lmul.value, Ordered>, + VSXSched<eew, Order>; } } } @@ -1498,32 +1466,50 @@ multiclass VPseudoIStore<bit Ordered> { } } -multiclass VPseudoUnaryS_M { +multiclass VPseudoVPOP_M { foreach mti = AllMasks in { let VLMul = mti.LMul.value in { - def "_M_" # mti.BX : VPseudoUnaryNoMask<GPR, VR>; - def "_M_" # mti.BX # "_MASK" : VPseudoMaskUnarySOutMask; + def "_M_" # mti.BX : VPseudoUnaryNoMask<GPR, VR>, + Sched<[WriteVMPopV, ReadVMPopV, ReadVMPopV]>; + def "_M_" # mti.BX # "_MASK" : VPseudoMaskUnarySOutMask, + Sched<[WriteVMPopV, ReadVMPopV, ReadVMPopV]>; } } } -multiclass VPseudoUnaryM_M { +multiclass VPseudoV1ST_M { + foreach mti = AllMasks in + { + let VLMul = mti.LMul.value in { + def "_M_" # mti.BX : VPseudoUnaryNoMask<GPR, VR>, + Sched<[WriteVMFFSV, ReadVMFFSV, ReadVMFFSV]>; + def "_M_" # mti.BX # "_MASK" : VPseudoMaskUnarySOutMask, + Sched<[WriteVMFFSV, ReadVMFFSV, ReadVMFFSV]>; + } + } +} + +multiclass VPseudoVSFS_M { defvar constraint = "@earlyclobber $rd"; foreach mti = AllMasks in { let VLMul = mti.LMul.value in { - def "_M_" # mti.BX : VPseudoUnaryNoMask<VR, VR, constraint>; - def "_M_" # mti.BX # "_MASK" : VPseudoUnaryMask<VR, VR, constraint>; + def "_M_" # mti.BX : VPseudoUnaryNoMask<VR, VR, constraint>, + Sched<[WriteVMSFSV, 
ReadVMSFSV, ReadVMask]>; + def "_M_" # mti.BX # "_MASK" : VPseudoUnaryMask<VR, VR, constraint>, + Sched<[WriteVMSFSV, ReadVMSFSV, ReadVMask]>; } } } -multiclass VPseudoMaskNullaryV { +multiclass VPseudoVID_V { foreach m = MxList.m in { let VLMul = m.value in { - def "_V_" # m.MX : VPseudoNullaryNoMask<m.vrclass>; - def "_V_" # m.MX # "_MASK" : VPseudoNullaryMask<m.vrclass>; + def "_V_" # m.MX : VPseudoNullaryNoMask<m.vrclass>, + Sched<[WriteVMIdxV, ReadVMask]>; + def "_V_" # m.MX # "_MASK" : VPseudoNullaryMask<m.vrclass>, + Sched<[WriteVMIdxV, ReadVMask]>; } } } @@ -1536,20 +1522,23 @@ multiclass VPseudoNullaryPseudoM <string BaseInst> { } } -multiclass VPseudoUnaryV_M { +multiclass VPseudoVIOT_M { defvar constraint = "@earlyclobber $rd"; foreach m = MxList.m in { let VLMul = m.value in { - def "_" # m.MX : VPseudoUnaryNoMask<m.vrclass, VR, constraint>; - def "_" # m.MX # "_MASK" : VPseudoUnaryMask<m.vrclass, VR, constraint>; + def "_" # m.MX : VPseudoUnaryNoMask<m.vrclass, VR, constraint>, + Sched<[WriteVMIotV, ReadVMIotV, ReadVMask]>; + def "_" # m.MX # "_MASK" : VPseudoUnaryMask<m.vrclass, VR, constraint>, + Sched<[WriteVMIotV, ReadVMIotV, ReadVMask]>; } } } -multiclass VPseudoUnaryV_V_AnyMask { +multiclass VPseudoVCPR_V { foreach m = MxList.m in { let VLMul = m.value in - def _VM # "_" # m.MX : VPseudoUnaryAnyMask<m.vrclass, m.vrclass>; + def _VM # "_" # m.MX : VPseudoUnaryAnyMask<m.vrclass, m.vrclass>, + Sched<[WriteVCompressV, ReadVCompressV, ReadVCompressV]>; } } @@ -1611,7 +1600,7 @@ multiclass VPseudoBinaryV_VV<string Constraint = ""> { defm _VV : VPseudoBinary<m.vrclass, m.vrclass, m.vrclass, m, Constraint>; } -multiclass VPseudoBinaryV_VV_EEW<int eew, string Constraint = ""> { +multiclass VPseudoVGTR_VV_EEW<int eew, string Constraint = ""> { foreach m = MxList.m in { foreach sew = EEWList in { defvar octuple_lmul = m.octuple; @@ -1620,7 +1609,8 @@ multiclass VPseudoBinaryV_VV_EEW<int eew, string Constraint = ""> { if !and(!ge(octuple_emul, 1), 
!le(octuple_emul, 64)) then { defvar emulMX = octuple_to_str<octuple_emul>.ret; defvar emul = !cast<LMULInfo>("V_" # emulMX); - defm _VV : VPseudoBinaryEmul<m.vrclass, m.vrclass, emul.vrclass, m, emul, Constraint>; + defm _VV : VPseudoBinaryEmul<m.vrclass, m.vrclass, emul.vrclass, m, emul, Constraint>, + Sched<[WriteVGatherV, ReadVGatherV, ReadVGatherV]>; } } } @@ -1631,6 +1621,12 @@ multiclass VPseudoBinaryV_VX<string Constraint = ""> { defm "_VX" : VPseudoBinary<m.vrclass, m.vrclass, GPR, m, Constraint>; } +multiclass VPseudoVSLD1_VX<string Constraint = ""> { + foreach m = MxList.m in + defm "_VX" : VPseudoBinary<m.vrclass, m.vrclass, GPR, m, Constraint>, + Sched<[WriteVISlide1X, ReadVISlideV, ReadVISlideX, ReadVMask]>; +} + multiclass VPseudoBinaryV_VF<string Constraint = ""> { foreach m = MxList.m in foreach f = FPList.fpinfo in @@ -1638,15 +1634,24 @@ multiclass VPseudoBinaryV_VF<string Constraint = ""> { f.fprclass, m, Constraint>; } +multiclass VPseudoVSLD1_VF<string Constraint = ""> { + foreach m = MxList.m in + foreach f = FPList.fpinfo in + defm "_V" # f.FX : + VPseudoBinary<m.vrclass, m.vrclass, f.fprclass, m, Constraint>, + Sched<[WriteVFSlide1F, ReadVFSlideV, ReadVFSlideF, ReadVMask]>; +} + multiclass VPseudoBinaryV_VI<Operand ImmType = simm5, string Constraint = ""> { foreach m = MxList.m in defm _VI : VPseudoBinary<m.vrclass, m.vrclass, ImmType, m, Constraint>; } -multiclass VPseudoBinaryM_MM { +multiclass VPseudoVALU_MM { foreach m = MxList.m in let VLMul = m.value in { - def "_MM_" # m.MX : VPseudoBinaryNoMask<VR, VR, VR, "">; + def "_MM_" # m.MX : VPseudoBinaryNoMask<VR, VR, VR, "">, + Sched<[WriteVMALUV, ReadVMALUV, ReadVMALUV]>; } } @@ -1744,12 +1749,13 @@ multiclass VPseudoBinaryV_XM<bit CarryOut = 0, bit CarryIn = 1, m.vrclass, GPR, m, CarryIn, Constraint>; } -multiclass VPseudoBinaryV_FM { +multiclass VPseudoVMRG_FM { foreach m = MxList.m in foreach f = FPList.fpinfo in def "_V" # f.FX # "M_" # m.MX : 
VPseudoBinaryCarryIn<GetVRegNoV0<m.vrclass>.R, - m.vrclass, f.fprclass, m, /*CarryIn=*/1, "">; + m.vrclass, f.fprclass, m, /*CarryIn=*/1, "">, + Sched<[WriteVFMergeV, ReadVFMergeV, ReadVFMergeF, ReadVMask]>; } multiclass VPseudoBinaryV_IM<bit CarryOut = 0, bit CarryIn = 1, @@ -1762,76 +1768,102 @@ multiclass VPseudoBinaryV_IM<bit CarryOut = 0, bit CarryIn = 1, m.vrclass, simm5, m, CarryIn, Constraint>; } -multiclass VPseudoUnaryV_V_X_I_NoDummyMask { +multiclass VPseudoUnaryVMV_V_X_I { foreach m = MxList.m in { let VLMul = m.value in { - def "_V_" # m.MX : VPseudoUnaryNoDummyMask<m.vrclass, m.vrclass>; - def "_X_" # m.MX : VPseudoUnaryNoDummyMask<m.vrclass, GPR>; - def "_I_" # m.MX : VPseudoUnaryNoDummyMask<m.vrclass, simm5>; + def "_V_" # m.MX : VPseudoUnaryNoDummyMask<m.vrclass, m.vrclass>, + Sched<[WriteVIMovV, ReadVIMovV]>; + def "_X_" # m.MX : VPseudoUnaryNoDummyMask<m.vrclass, GPR>, + Sched<[WriteVIMovX, ReadVIMovX]>; + def "_I_" # m.MX : VPseudoUnaryNoDummyMask<m.vrclass, simm5>, + Sched<[WriteVIMovI]>; } } } -multiclass VPseudoUnaryV_F_NoDummyMask { +multiclass VPseudoVMV_F { foreach m = MxList.m in { foreach f = FPList.fpinfo in { let VLMul = m.value in { - def "_" # f.FX # "_" # m.MX : VPseudoUnaryNoDummyMask<m.vrclass, f.fprclass>; + def "_" # f.FX # "_" # m.MX : + VPseudoUnaryNoDummyMask<m.vrclass, f.fprclass>, + Sched<[WriteVFMovV, ReadVFMovF]>; } } } } -multiclass VPseudoUnaryTAV_V { +multiclass VPseudoVCLS_V { foreach m = MxList.m in { let VLMul = m.value in { - def "_V_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.vrclass>; - def "_V_" # m.MX # "_MASK" : VPseudoUnaryMaskTA<m.vrclass, m.vrclass>; + def "_V_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.vrclass>, + Sched<[WriteVFClassV, ReadVFClassV, ReadVMask]>; + def "_V_" # m.MX # "_MASK" : VPseudoUnaryMask<m.vrclass, m.vrclass>, + Sched<[WriteVFClassV, ReadVFClassV, ReadVMask]>; } } } -multiclass VPseudoUnaryV_V { +multiclass VPseudoVSQR_V { foreach m = MxList.m in { let VLMul = m.value in { - def "_V_" # 
m.MX : VPseudoUnaryNoMask<m.vrclass, m.vrclass>; - def "_V_" # m.MX # "_MASK" : VPseudoUnaryMask<m.vrclass, m.vrclass>; + def "_V_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.vrclass>, + Sched<[WriteVFSqrtV, ReadVFSqrtV, ReadVMask]>; + def "_V_" # m.MX # "_MASK" : VPseudoUnaryMaskTA<m.vrclass, m.vrclass>, + Sched<[WriteVFSqrtV, ReadVFSqrtV, ReadVMask]>; } } } -multiclass PseudoUnaryV_VF2 { +multiclass VPseudoVRCP_V { + foreach m = MxList.m in { + let VLMul = m.value in { + def "_V_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.vrclass>, + Sched<[WriteVFRecpV, ReadVFRecpV, ReadVMask]>; + def "_V_" # m.MX # "_MASK" : VPseudoUnaryMaskTA<m.vrclass, m.vrclass>, + Sched<[WriteVFRecpV, ReadVFRecpV, ReadVMask]>; + } + } +} + +multiclass PseudoVEXT_VF2 { defvar constraints = "@earlyclobber $rd"; foreach m = MxListVF2.m in { let VLMul = m.value in { - def "_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.f2vrclass, constraints>; - def "_" # m.MX # "_MASK" : VPseudoUnaryMaskTA<m.vrclass, m.f2vrclass, - constraints>; + def "_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.f2vrclass, constraints>, + Sched<[WriteVExtV, ReadVExtV, ReadVMask]>; + def "_" # m.MX # "_MASK" : + VPseudoUnaryMaskTA<m.vrclass, m.f2vrclass, constraints>, + Sched<[WriteVExtV, ReadVExtV, ReadVMask]>; } } } -multiclass PseudoUnaryV_VF4 { +multiclass PseudoVEXT_VF4 { defvar constraints = "@earlyclobber $rd"; foreach m = MxListVF4.m in { let VLMul = m.value in { - def "_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.f4vrclass, constraints>; - def "_" # m.MX # "_MASK" : VPseudoUnaryMaskTA<m.vrclass, m.f4vrclass, - constraints>; + def "_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.f4vrclass, constraints>, + Sched<[WriteVExtV, ReadVExtV, ReadVMask]>; + def "_" # m.MX # "_MASK" : + VPseudoUnaryMaskTA<m.vrclass, m.f4vrclass, constraints>, + Sched<[WriteVExtV, ReadVExtV, ReadVMask]>; } } } -multiclass PseudoUnaryV_VF8 { +multiclass PseudoVEXT_VF8 { defvar constraints = "@earlyclobber $rd"; foreach m = MxListVF8.m in { let VLMul = 
m.value in { - def "_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.f8vrclass, constraints>; - def "_" # m.MX # "_MASK" : VPseudoUnaryMaskTA<m.vrclass, m.f8vrclass, - constraints>; + def "_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.f8vrclass, constraints>, + Sched<[WriteVExtV, ReadVExtV, ReadVMask]>; + def "_" # m.MX # "_MASK" : + VPseudoUnaryMaskTA<m.vrclass, m.f8vrclass, constraints>, + Sched<[WriteVExtV, ReadVExtV, ReadVMask]>; } } } @@ -1874,30 +1906,172 @@ multiclass VPseudoBinaryM_VI { !if(!ge(m.octuple, 16), "@earlyclobber $rd", "")>; } -multiclass VPseudoBinaryV_VV_VX_VI<Operand ImmType = simm5, string Constraint = ""> { - defm "" : VPseudoBinaryV_VV<Constraint>; - defm "" : VPseudoBinaryV_VX<Constraint>; - defm "" : VPseudoBinaryV_VI<ImmType, Constraint>; +multiclass VPseudoVGTR_VV_VX_VI<Operand ImmType = simm5, string Constraint = ""> { + defm "" : VPseudoBinaryV_VV<Constraint>, + Sched<[WriteVGatherV, ReadVGatherV, ReadVGatherV, ReadVMask]>; + defm "" : VPseudoBinaryV_VX<Constraint>, + Sched<[WriteVGatherX, ReadVGatherV, ReadVGatherX, ReadVMask]>; + defm "" : VPseudoBinaryV_VI<ImmType, Constraint>, + Sched<[WriteVGatherI, ReadVGatherV, ReadVMask]>; } -multiclass VPseudoBinaryV_VV_VX { - defm "" : VPseudoBinaryV_VV; - defm "" : VPseudoBinaryV_VX; +multiclass VPseudoVSALU_VV_VX_VI<Operand ImmType = simm5, string Constraint = ""> { + defm "" : VPseudoBinaryV_VV<Constraint>, + Sched<[WriteVSALUV, ReadVSALUV, ReadVSALUV, ReadVMask]>; + defm "" : VPseudoBinaryV_VX<Constraint>, + Sched<[WriteVSALUX, ReadVSALUV, ReadVSALUX, ReadVMask]>; + defm "" : VPseudoBinaryV_VI<ImmType, Constraint>, + Sched<[WriteVSALUI, ReadVSALUV, ReadVMask]>; } -multiclass VPseudoBinaryV_VV_VF { - defm "" : VPseudoBinaryV_VV; - defm "" : VPseudoBinaryV_VF; + +multiclass VPseudoVSHT_VV_VX_VI<Operand ImmType = simm5, string Constraint = ""> { + defm "" : VPseudoBinaryV_VV<Constraint>, + Sched<[WriteVShiftV, ReadVShiftV, ReadVShiftV, ReadVMask]>; + defm "" : VPseudoBinaryV_VX<Constraint>, + 
Sched<[WriteVShiftX, ReadVShiftV, ReadVShiftX, ReadVMask]>; + defm "" : VPseudoBinaryV_VI<ImmType, Constraint>, + Sched<[WriteVShiftI, ReadVShiftV, ReadVMask]>; } -multiclass VPseudoBinaryV_VX_VI<Operand ImmType = simm5> { - defm "" : VPseudoBinaryV_VX; - defm "" : VPseudoBinaryV_VI<ImmType>; +multiclass VPseudoVSSHT_VV_VX_VI<Operand ImmType = simm5, string Constraint = ""> { + defm "" : VPseudoBinaryV_VV<Constraint>, + Sched<[WriteVSShiftV, ReadVSShiftV, ReadVSShiftV, ReadVMask]>; + defm "" : VPseudoBinaryV_VX<Constraint>, + Sched<[WriteVSShiftX, ReadVSShiftV, ReadVSShiftX, ReadVMask]>; + defm "" : VPseudoBinaryV_VI<ImmType, Constraint>, + Sched<[WriteVSShiftI, ReadVSShiftV, ReadVMask]>; } -multiclass VPseudoBinaryW_VV_VX { - defm "" : VPseudoBinaryW_VV; - defm "" : VPseudoBinaryW_VX; +multiclass VPseudoVALU_VV_VX_VI<Operand ImmType = simm5, string Constraint = ""> { + defm "" : VPseudoBinaryV_VV<Constraint>, + Sched<[WriteVIALUV, ReadVIALUV, ReadVIALUV, ReadVMask]>; + defm "" : VPseudoBinaryV_VX<Constraint>, + Sched<[WriteVIALUX, ReadVIALUV, ReadVIALUX, ReadVMask]>; + defm "" : VPseudoBinaryV_VI<ImmType, Constraint>, + Sched<[WriteVIALUI, ReadVIALUV, ReadVMask]>; +} + +multiclass VPseudoVSALU_VV_VX { + defm "" : VPseudoBinaryV_VV, + Sched<[WriteVSALUV, ReadVSALUV, ReadVSALUV, ReadVMask]>; + defm "" : VPseudoBinaryV_VX, + Sched<[WriteVSALUX, ReadVSALUV, ReadVSALUX, ReadVMask]>; +} + +multiclass VPseudoVSMUL_VV_VX { + defm "" : VPseudoBinaryV_VV, + Sched<[WriteVSMulV, ReadVSMulV, ReadVSMulV, ReadVMask]>; + defm "" : VPseudoBinaryV_VX, + Sched<[WriteVSMulX, ReadVSMulV, ReadVSMulX, ReadVMask]>; +} + +multiclass VPseudoVAALU_VV_VX { + defm "" : VPseudoBinaryV_VV, + Sched<[WriteVAALUV, ReadVAALUV, ReadVAALUV, ReadVMask]>; + defm "" : VPseudoBinaryV_VX, + Sched<[WriteVAALUX, ReadVAALUV, ReadVAALUX, ReadVMask]>; +} + +multiclass VPseudoVMINMAX_VV_VX { + defm "" : VPseudoBinaryV_VV, + Sched<[WriteVICmpV, ReadVICmpV, ReadVICmpV, ReadVMask]>; + defm "" : VPseudoBinaryV_VX, 
+ Sched<[WriteVICmpX, ReadVICmpV, ReadVICmpX, ReadVMask]>; +} + +multiclass VPseudoVMUL_VV_VX { + defm "" : VPseudoBinaryV_VV, + Sched<[WriteVIMulV, ReadVIMulV, ReadVIMulV, ReadVMask]>; + defm "" : VPseudoBinaryV_VX, + Sched<[WriteVIMulX, ReadVIMulV, ReadVIMulX, ReadVMask]>; +} + +multiclass VPseudoVDIV_VV_VX { + defm "" : VPseudoBinaryV_VV, + Sched<[WriteVIDivV, ReadVIDivV, ReadVIDivV, ReadVMask]>; + defm "" : VPseudoBinaryV_VX, + Sched<[WriteVIDivX, ReadVIDivV, ReadVIDivX, ReadVMask]>; +} + +multiclass VPseudoVFMUL_VV_VF { + defm "" : VPseudoBinaryV_VV, + Sched<[WriteVFMulV, ReadVFMulV, ReadVFMulV, ReadVMask]>; + defm "" : VPseudoBinaryV_VF, + Sched<[WriteVFMulF, ReadVFMulV, ReadVFMulF, ReadVMask]>; +} + +multiclass VPseudoVFDIV_VV_VF { + defm "" : VPseudoBinaryV_VV, + Sched<[WriteVFDivV, ReadVFDivV, ReadVFDivV, ReadVMask]>; + defm "" : VPseudoBinaryV_VF, + Sched<[WriteVFDivF, ReadVFDivV, ReadVFDivF, ReadVMask]>; +} + +multiclass VPseudoVFRDIV_VF { + defm "" : VPseudoBinaryV_VF, + Sched<[WriteVFDivF, ReadVFDivV, ReadVFDivF, ReadVMask]>; +} + +multiclass VPseudoVALU_VV_VX { + defm "" : VPseudoBinaryV_VV, + Sched<[WriteVIALUV, ReadVIALUV, ReadVIALUV, ReadVMask]>; + defm "" : VPseudoBinaryV_VX, + Sched<[WriteVIALUX, ReadVIALUV, ReadVIALUX, ReadVMask]>; +} + +multiclass VPseudoVSGNJ_VV_VF { + defm "" : VPseudoBinaryV_VV, + Sched<[WriteVFSgnjV, ReadVFSgnjV, ReadVFSgnjV, ReadVMask]>; + defm "" : VPseudoBinaryV_VF, + Sched<[WriteVFSgnjF, ReadVFSgnjV, ReadVFSgnjF, ReadVMask]>; +} + +multiclass VPseudoVMAX_VV_VF { + defm "" : VPseudoBinaryV_VV, + Sched<[WriteVFCmpV, ReadVFCmpV, ReadVFCmpV, ReadVMask]>; + defm "" : VPseudoBinaryV_VF, + Sched<[WriteVFCmpF, ReadVFCmpV, ReadVFCmpF, ReadVMask]>; +} + +multiclass VPseudoVALU_VV_VF { + defm "" : VPseudoBinaryV_VV, + Sched<[WriteVFALUV, ReadVFALUV, ReadVFALUV, ReadVMask]>; + defm "" : VPseudoBinaryV_VF, + Sched<[WriteVFALUF, ReadVFALUV, ReadVFALUF, ReadVMask]>; +} + +multiclass VPseudoVALU_VF { + defm "" : VPseudoBinaryV_VF, + 
Sched<[WriteVFALUF, ReadVFALUV, ReadVFALUF, ReadVMask]>; +} + +multiclass VPseudoVALU_VX_VI<Operand ImmType = simm5> { + defm "" : VPseudoBinaryV_VX, + Sched<[WriteVIALUX, ReadVIALUV, ReadVIALUX, ReadVMask]>; + defm "" : VPseudoBinaryV_VI<ImmType>, + Sched<[WriteVIALUI, ReadVIALUV, ReadVMask]>; +} + +multiclass VPseudoVWALU_VV_VX { + defm "" : VPseudoBinaryW_VV, + Sched<[WriteVIWALUV, ReadVIWALUV, ReadVIWALUV, ReadVMask]>; + defm "" : VPseudoBinaryW_VX, + Sched<[WriteVIWALUX, ReadVIWALUV, ReadVIWALUX, ReadVMask]>; +} + +multiclass VPseudoVWMUL_VV_VX { + defm "" : VPseudoBinaryW_VV, + Sched<[WriteVIWMulV, ReadVIWMulV, ReadVIWMulV, ReadVMask]>; + defm "" : VPseudoBinaryW_VX, + Sched<[WriteVIWMulX, ReadVIWMulV, ReadVIWMulX, ReadVMask]>; +} + +multiclass VPseudoVWMUL_VV_VF { + defm "" : VPseudoBinaryW_VV, + Sched<[WriteVFWMulV, ReadVFWMulV, ReadVFWMulV, ReadVMask]>; + defm "" : VPseudoBinaryW_VF, + Sched<[WriteVFWMulF, ReadVFWMulV, ReadVFWMulF, ReadVMask]>; } multiclass VPseudoBinaryW_VV_VF { @@ -1905,53 +2079,100 @@ multiclass VPseudoBinaryW_VV_VF { defm "" : VPseudoBinaryW_VF; } -multiclass VPseudoBinaryW_WV_WX { - defm "" : VPseudoBinaryW_WV; - defm "" : VPseudoBinaryW_WX; +multiclass VPseudoVWALU_WV_WX { + defm "" : VPseudoBinaryW_WV, + Sched<[WriteVIWALUV, ReadVIWALUV, ReadVIWALUV, ReadVMask]>; + defm "" : VPseudoBinaryW_WX, + Sched<[WriteVIWALUX, ReadVIWALUV, ReadVIWALUX, ReadVMask]>; +} + +multiclass VPseudoVFWALU_VV_VF { + defm "" : VPseudoBinaryW_VV, + Sched<[WriteVFWALUV, ReadVFWALUV, ReadVFWALUV, ReadVMask]>; + defm "" : VPseudoBinaryW_VF, + Sched<[WriteVFWALUF, ReadVFWALUV, ReadVFWALUF, ReadVMask]>; +} + +multiclass VPseudoVFWALU_WV_WF { + defm "" : VPseudoBinaryW_WV, + Sched<[WriteVFWALUV, ReadVFWALUV, ReadVFWALUV, ReadVMask]>; + defm "" : VPseudoBinaryW_WF, + Sched<[WriteVFWALUF, ReadVFWALUV, ReadVFWALUF, ReadVMask]>; +} + +multiclass VPseudoVMRG_VM_XM_IM { + defm "" : VPseudoBinaryV_VM, + Sched<[WriteVIMergeV, ReadVIMergeV, ReadVIMergeV, ReadVMask]>; + 
defm "" : VPseudoBinaryV_XM, + Sched<[WriteVIMergeX, ReadVIMergeV, ReadVIMergeX, ReadVMask]>; + defm "" : VPseudoBinaryV_IM, + Sched<[WriteVIMergeI, ReadVIMergeV, ReadVMask]>; } -multiclass VPseudoBinaryW_WV_WF { - defm "" : VPseudoBinaryW_WV; - defm "" : VPseudoBinaryW_WF; +multiclass VPseudoVCALU_VM_XM_IM { + defm "" : VPseudoBinaryV_VM, + Sched<[WriteVICALUV, ReadVIALUCV, ReadVIALUCV, ReadVMask]>; + defm "" : VPseudoBinaryV_XM, + Sched<[WriteVICALUX, ReadVIALUCV, ReadVIALUCX, ReadVMask]>; + defm "" : VPseudoBinaryV_IM, + Sched<[WriteVICALUI, ReadVIALUCV, ReadVMask]>; } -multiclass VPseudoBinaryV_VM_XM_IM { - defm "" : VPseudoBinaryV_VM; - defm "" : VPseudoBinaryV_XM; - defm "" : VPseudoBinaryV_IM; +multiclass VPseudoVCALU_VM_XM { + defm "" : VPseudoBinaryV_VM, + Sched<[WriteVICALUV, ReadVIALUCV, ReadVIALUCV, ReadVMask]>; + defm "" : VPseudoBinaryV_XM, + Sched<[WriteVICALUX, ReadVIALUCV, ReadVIALUCX, ReadVMask]>; } -multiclass VPseudoBinaryV_VM_XM { - defm "" : VPseudoBinaryV_VM; - defm "" : VPseudoBinaryV_XM; +multiclass VPseudoVCALUM_VM_XM_IM<string Constraint> { + defm "" : VPseudoBinaryV_VM</*CarryOut=*/1, /*CarryIn=*/1, Constraint>, + Sched<[WriteVICALUV, ReadVIALUCV, ReadVIALUCV, ReadVMask]>; + defm "" : VPseudoBinaryV_XM</*CarryOut=*/1, /*CarryIn=*/1, Constraint>, + Sched<[WriteVICALUX, ReadVIALUCV, ReadVIALUCX, ReadVMask]>; + defm "" : VPseudoBinaryV_IM</*CarryOut=*/1, /*CarryIn=*/1, Constraint>, + Sched<[WriteVICALUI, ReadVIALUCV, ReadVMask]>; } -multiclass VPseudoBinaryM_VM_XM_IM<string Constraint> { - defm "" : VPseudoBinaryV_VM</*CarryOut=*/1, /*CarryIn=*/1, Constraint>; - defm "" : VPseudoBinaryV_XM</*CarryOut=*/1, /*CarryIn=*/1, Constraint>; - defm "" : VPseudoBinaryV_IM</*CarryOut=*/1, /*CarryIn=*/1, Constraint>; +multiclass VPseudoVCALUM_VM_XM<string Constraint> { + defm "" : VPseudoBinaryV_VM</*CarryOut=*/1, /*CarryIn=*/1, Constraint>, + Sched<[WriteVICALUV, ReadVIALUCV, ReadVIALUCV, ReadVMask]>; + defm "" : VPseudoBinaryV_XM</*CarryOut=*/1, 
/*CarryIn=*/1, Constraint>, + Sched<[WriteVICALUX, ReadVIALUCV, ReadVIALUCX, ReadVMask]>; } -multiclass VPseudoBinaryM_VM_XM<string Constraint> { - defm "" : VPseudoBinaryV_VM</*CarryOut=*/1, /*CarryIn=*/1, Constraint>; - defm "" : VPseudoBinaryV_XM</*CarryOut=*/1, /*CarryIn=*/1, Constraint>; +multiclass VPseudoVCALUM_V_X_I<string Constraint> { + defm "" : VPseudoBinaryV_VM</*CarryOut=*/1, /*CarryIn=*/0, Constraint>, + Sched<[WriteVICALUV, ReadVIALUCV, ReadVIALUCV]>; + defm "" : VPseudoBinaryV_XM</*CarryOut=*/1, /*CarryIn=*/0, Constraint>, + Sched<[WriteVICALUX, ReadVIALUCV, ReadVIALUCX]>; + defm "" : VPseudoBinaryV_IM</*CarryOut=*/1, /*CarryIn=*/0, Constraint>, + Sched<[WriteVICALUI, ReadVIALUCV]>; } -multiclass VPseudoBinaryM_V_X_I<string Constraint> { - defm "" : VPseudoBinaryV_VM</*CarryOut=*/1, /*CarryIn=*/0, Constraint>; - defm "" : VPseudoBinaryV_XM</*CarryOut=*/1, /*CarryIn=*/0, Constraint>; - defm "" : VPseudoBinaryV_IM</*CarryOut=*/1, /*CarryIn=*/0, Constraint>; +multiclass VPseudoVCALUM_V_X<string Constraint> { + defm "" : VPseudoBinaryV_VM</*CarryOut=*/1, /*CarryIn=*/0, Constraint>, + Sched<[WriteVICALUV, ReadVIALUCV, ReadVIALUCV]>; + defm "" : VPseudoBinaryV_XM</*CarryOut=*/1, /*CarryIn=*/0, Constraint>, + Sched<[WriteVICALUX, ReadVIALUCV, ReadVIALUCX]>; } -multiclass VPseudoBinaryM_V_X<string Constraint> { - defm "" : VPseudoBinaryV_VM</*CarryOut=*/1, /*CarryIn=*/0, Constraint>; - defm "" : VPseudoBinaryV_XM</*CarryOut=*/1, /*CarryIn=*/0, Constraint>; +multiclass VPseudoVNCLP_WV_WX_WI { + defm "" : VPseudoBinaryV_WV, + Sched<[WriteVNClipV, ReadVNClipV, ReadVNClipV, ReadVMask]>; + defm "" : VPseudoBinaryV_WX, + Sched<[WriteVNClipX, ReadVNClipV, ReadVNClipX, ReadVMask]>; + defm "" : VPseudoBinaryV_WI, + Sched<[WriteVNClipI, ReadVNClipV, ReadVMask]>; } -multiclass VPseudoBinaryV_WV_WX_WI { - defm "" : VPseudoBinaryV_WV; - defm "" : VPseudoBinaryV_WX; - defm "" : VPseudoBinaryV_WI; +multiclass VPseudoVNSHT_WV_WX_WI { + defm "" : VPseudoBinaryV_WV, + 
Sched<[WriteVNShiftV, ReadVNShiftV, ReadVNShiftV, ReadVMask]>; + defm "" : VPseudoBinaryV_WX, + Sched<[WriteVNShiftX, ReadVNShiftV, ReadVNShiftX, ReadVMask]>; + defm "" : VPseudoBinaryV_WI, + Sched<[WriteVNShiftI, ReadVNShiftV, ReadVMask]>; } multiclass VPseudoTernary<VReg RetClass, @@ -2031,55 +2252,113 @@ multiclass VPseudoTernaryV_VI<Operand ImmType = simm5, string Constraint = ""> { defm _VI : VPseudoTernary<m.vrclass, m.vrclass, ImmType, m, Constraint>; } -multiclass VPseudoTernaryV_VV_VX_AAXA<string Constraint = ""> { - defm "" : VPseudoTernaryV_VV_AAXA<Constraint>; - defm "" : VPseudoTernaryV_VX_AAXA<Constraint>; +multiclass VPseudoVMAC_VV_VX_AAXA<string Constraint = ""> { + defm "" : VPseudoTernaryV_VV_AAXA<Constraint>, + Sched<[WriteVIMulAddV, ReadVIMulAddV, ReadVIMulAddV, ReadVIMulAddV, ReadVMask]>; + defm "" : VPseudoTernaryV_VX_AAXA<Constraint>, + Sched<[WriteVIMulAddX, ReadVIMulAddV, ReadVIMulAddV, ReadVIMulAddX, ReadVMask]>; } -multiclass VPseudoTernaryV_VV_VF_AAXA<string Constraint = ""> { - defm "" : VPseudoTernaryV_VV_AAXA<Constraint>; - defm "" : VPseudoTernaryV_VF_AAXA<Constraint>; +multiclass VPseudoVMAC_VV_VF_AAXA<string Constraint = ""> { + defm "" : VPseudoTernaryV_VV_AAXA<Constraint>, + Sched<[WriteVFMulAddV, ReadVFMulAddV, ReadVFMulAddV, ReadVFMulAddV, ReadVMask]>; + defm "" : VPseudoTernaryV_VF_AAXA<Constraint>, + Sched<[WriteVFMulAddF, ReadVFMulAddV, ReadVFMulAddV, ReadVFMulAddF, ReadVMask]>; } -multiclass VPseudoTernaryV_VX_VI<Operand ImmType = simm5, string Constraint = ""> { - defm "" : VPseudoTernaryV_VX<Constraint>; - defm "" : VPseudoTernaryV_VI<ImmType, Constraint>; +multiclass VPseudoVSLD_VX_VI<Operand ImmType = simm5, string Constraint = ""> { + defm "" : VPseudoTernaryV_VX<Constraint>, + Sched<[WriteVISlideX, ReadVISlideV, ReadVISlideV, ReadVISlideX, ReadVMask]>; + defm "" : VPseudoTernaryV_VI<ImmType, Constraint>, + Sched<[WriteVISlideI, ReadVISlideV, ReadVISlideV, ReadVMask]>; } -multiclass VPseudoTernaryW_VV_VX { - defm "" : 
VPseudoTernaryW_VV; - defm "" : VPseudoTernaryW_VX; +multiclass VPseudoVWMAC_VV_VX { + defm "" : VPseudoTernaryW_VV, + Sched<[WriteVIWMulAddV, ReadVIWMulAddV, ReadVIWMulAddV, ReadVIWMulAddV, ReadVMask]>; + defm "" : VPseudoTernaryW_VX, + Sched<[WriteVIWMulAddX, ReadVIWMulAddV, ReadVIWMulAddV, ReadVIWMulAddX, ReadVMask]>; } -multiclass VPseudoTernaryW_VV_VF { - defm "" : VPseudoTernaryW_VV; - defm "" : VPseudoTernaryW_VF; +multiclass VPseudoVWMAC_VX { + defm "" : VPseudoTernaryW_VX, + Sched<[WriteVIWMulAddX, ReadVIWMulAddV, ReadVIWMulAddV, ReadVIWMulAddX, ReadVMask]>; } -multiclass VPseudoBinaryM_VV_VX_VI { - defm "" : VPseudoBinaryM_VV; - defm "" : VPseudoBinaryM_VX; - defm "" : VPseudoBinaryM_VI; +multiclass VPseudoVWMAC_VV_VF { + defm "" : VPseudoTernaryW_VV, + Sched<[WriteVFWMulAddV, ReadVFWMulAddV, ReadVFWMulAddV, ReadVFWMulAddV, ReadVMask]>; + defm "" : VPseudoTernaryW_VF, + Sched<[WriteVFWMulAddF, ReadVFWMulAddV, ReadVFWMulAddV, ReadVFWMulAddF, ReadVMask]>; } -multiclass VPseudoBinaryM_VV_VX { - defm "" : VPseudoBinaryM_VV; - defm "" : VPseudoBinaryM_VX; +multiclass VPseudoVCMPM_VV_VX_VI { + defm "" : VPseudoBinaryM_VV, + Sched<[WriteVICmpV, ReadVICmpV, ReadVICmpV, ReadVMask]>; + defm "" : VPseudoBinaryM_VX, + Sched<[WriteVICmpX, ReadVICmpV, ReadVICmpX, ReadVMask]>; + defm "" : VPseudoBinaryM_VI, + Sched<[WriteVICmpI, ReadVICmpV, ReadVMask]>; } -multiclass VPseudoBinaryM_VV_VF { - defm "" : VPseudoBinaryM_VV; - defm "" : VPseudoBinaryM_VF; +multiclass VPseudoVCMPM_VV_VX { + defm "" : VPseudoBinaryM_VV, + Sched<[WriteVICmpV, ReadVICmpV, ReadVICmpV, ReadVMask]>; + defm "" : VPseudoBinaryM_VX, + Sched<[WriteVICmpX, ReadVICmpV, ReadVICmpX, ReadVMask]>; } -multiclass VPseudoBinaryM_VX_VI { - defm "" : VPseudoBinaryM_VX; - defm "" : VPseudoBinaryM_VI; +multiclass VPseudoVCMPM_VV_VF { + defm "" : VPseudoBinaryM_VV, + Sched<[WriteVFCmpV, ReadVFCmpV, ReadVFCmpV, ReadVMask]>; + defm "" : VPseudoBinaryM_VF, + Sched<[WriteVFCmpF, ReadVFCmpV, ReadVFCmpF, ReadVMask]>; } 
-multiclass VPseudoReductionV_VS { +multiclass VPseudoVCMPM_VF { + defm "" : VPseudoBinaryM_VF, + Sched<[WriteVFCmpF, ReadVFCmpV, ReadVFCmpF, ReadVMask]>; +} + +multiclass VPseudoVCMPM_VX_VI { + defm "" : VPseudoBinaryM_VX, + Sched<[WriteVICmpX, ReadVICmpV, ReadVICmpX, ReadVMask]>; + defm "" : VPseudoBinaryM_VI, + Sched<[WriteVICmpI, ReadVICmpV, ReadVMask]>; +} + +multiclass VPseudoVRED_VS { foreach m = MxList.m in { - defm _VS : VPseudoTernary<V_M1.vrclass, m.vrclass, V_M1.vrclass, m>; + defm _VS : VPseudoTernary<V_M1.vrclass, m.vrclass, V_M1.vrclass, m>, + Sched<[WriteVIRedV, ReadVIRedV, ReadVIRedV, ReadVIRedV, ReadVMask]>; + } +} + +multiclass VPseudoVWRED_VS { + foreach m = MxList.m in { + defm _VS : VPseudoTernary<V_M1.vrclass, m.vrclass, V_M1.vrclass, m>, + Sched<[WriteVIWRedV, ReadVIWRedV, ReadVIWRedV, ReadVIWRedV, ReadVMask]>; + } +} + +multiclass VPseudoVFRED_VS { + foreach m = MxList.m in { + defm _VS : VPseudoTernary<V_M1.vrclass, m.vrclass, V_M1.vrclass, m>, + Sched<[WriteVFRedV, ReadVFRedV, ReadVFRedV, ReadVFRedV, ReadVMask]>; + } +} + +multiclass VPseudoVFREDO_VS { + foreach m = MxList.m in { + defm _VS : VPseudoTernary<V_M1.vrclass, m.vrclass, V_M1.vrclass, m>, + Sched<[WriteVFRedOV, ReadVFRedOV, ReadVFRedOV, ReadVFRedOV, ReadVMask]>; + } +} + +multiclass VPseudoVFWRED_VS { + foreach m = MxList.m in { + defm _VS : VPseudoTernary<V_M1.vrclass, m.vrclass, V_M1.vrclass, m>, + Sched<[WriteVFWRedV, ReadVFWRedV, ReadVFWRedV, ReadVFWRedV, ReadVMask]>; } } @@ -2094,9 +2373,16 @@ multiclass VPseudoConversion<VReg RetClass, } } -multiclass VPseudoConversionV_V { +multiclass VPseudoVCVTI_V { + foreach m = MxList.m in + defm _V : VPseudoConversion<m.vrclass, m.vrclass, m>, + Sched<[WriteVFCvtFToIV, ReadVFCvtFToIV, ReadVMask]>; +} + +multiclass VPseudoVCVTF_V { foreach m = MxList.m in - defm _V : VPseudoConversion<m.vrclass, m.vrclass, m>; + defm _V : VPseudoConversion<m.vrclass, m.vrclass, m>, + Sched<[WriteVFCvtIToFV, ReadVFCvtIToFV, ReadVMask]>; } multiclass 
VPseudoConversionW_V { @@ -2105,10 +2391,46 @@ multiclass VPseudoConversionW_V { defm _V : VPseudoConversion<m.wvrclass, m.vrclass, m, constraint>; } -multiclass VPseudoConversionV_W { +multiclass VPseudoVWCVTI_V { + defvar constraint = "@earlyclobber $rd"; + foreach m = MxList.m[0-5] in + defm _V : VPseudoConversion<m.wvrclass, m.vrclass, m, constraint>, + Sched<[WriteVFWCvtFToIV, ReadVFWCvtFToIV, ReadVMask]>; +} + +multiclass VPseudoVWCVTF_V { + defvar constraint = "@earlyclobber $rd"; + foreach m = MxList.m[0-5] in + defm _V : VPseudoConversion<m.wvrclass, m.vrclass, m, constraint>, + Sched<[WriteVFWCvtIToFV, ReadVFWCvtIToFV, ReadVMask]>; +} + +multiclass VPseudoVWCVTD_V { + defvar constraint = "@earlyclobber $rd"; + foreach m = MxList.m[0-5] in + defm _V : VPseudoConversion<m.wvrclass, m.vrclass, m, constraint>, + Sched<[WriteVFWCvtFToFV, ReadVFWCvtFToFV, ReadVMask]>; +} + +multiclass VPseudoVNCVTI_W { + defvar constraint = "@earlyclobber $rd"; + foreach m = MxList.m[0-5] in + defm _W : VPseudoConversion<m.vrclass, m.wvrclass, m, constraint>, + Sched<[WriteVFNCvtFToIV, ReadVFNCvtFToIV, ReadVMask]>; +} + +multiclass VPseudoVNCVTF_W { + defvar constraint = "@earlyclobber $rd"; + foreach m = MxList.m[0-5] in + defm _W : VPseudoConversion<m.vrclass, m.wvrclass, m, constraint>, + Sched<[WriteVFNCvtIToFV, ReadVFNCvtIToFV, ReadVMask]>; +} + +multiclass VPseudoVNCVTD_W { defvar constraint = "@earlyclobber $rd"; foreach m = MxListW.m in - defm _W : VPseudoConversion<m.vrclass, m.wvrclass, m, constraint>; + defm _W : VPseudoConversion<m.vrclass, m.wvrclass, m, constraint>, + Sched<[WriteVFNCvtFToFV, ReadVFNCvtFToFV, ReadVMask]>; } multiclass VPseudoUSSegLoad<bit isFF> { @@ -2543,42 +2865,6 @@ class VPatTernaryMask<string intrinsic, (mask_type V0), GPR:$vl, sew)>; -class VPatAMOWDNoMask<string intrinsic_name, - string inst, - ValueType result_type, - ValueType op1_type, - int sew, - LMULInfo vlmul, - LMULInfo emul, - VReg op1_reg_class> : - Pat<(result_type 
(!cast<Intrinsic>(intrinsic_name) - GPR:$rs1, - (op1_type op1_reg_class:$vs2), - (result_type vlmul.vrclass:$vd), - VLOpFrag)), - (!cast<Instruction>(inst # "_WD_" # vlmul.MX # "_" # emul.MX) - $rs1, $vs2, $vd, - GPR:$vl, sew)>; - -class VPatAMOWDMask<string intrinsic_name, - string inst, - ValueType result_type, - ValueType op1_type, - ValueType mask_type, - int sew, - LMULInfo vlmul, - LMULInfo emul, - VReg op1_reg_class> : - Pat<(result_type (!cast<Intrinsic>(intrinsic_name # "_mask") - GPR:$rs1, - (op1_type op1_reg_class:$vs2), - (result_type vlmul.vrclass:$vd), - (mask_type V0), - VLOpFrag)), - (!cast<Instruction>(inst # "_WD_" # vlmul.MX # "_" # emul.MX # "_MASK") - $rs1, $vs2, $vd, - (mask_type V0), GPR:$vl, sew)>; - multiclass VPatUnaryS_M<string intrinsic_name, string inst> { @@ -3416,44 +3702,6 @@ multiclass VPatConversionVF_WF <string intrinsic, string instruction> { } } -multiclass VPatAMOWD<string intrinsic, - string inst, - ValueType result_type, - ValueType offset_type, - ValueType mask_type, - int sew, - LMULInfo vlmul, - LMULInfo emul, - VReg op1_reg_class> -{ - def : VPatAMOWDNoMask<intrinsic, inst, result_type, offset_type, - sew, vlmul, emul, op1_reg_class>; - def : VPatAMOWDMask<intrinsic, inst, result_type, offset_type, - mask_type, sew, vlmul, emul, op1_reg_class>; -} - -multiclass VPatAMOV_WD<string intrinsic, - string inst, - list<VTypeInfo> vtilist> { - foreach eew = EEWList in { - foreach vti = vtilist in { - if !or(!eq(vti.SEW, 32), !eq(vti.SEW, 64)) then { - defvar octuple_lmul = vti.LMul.octuple; - // Calculate emul = eew * lmul / sew - defvar octuple_emul = !srl(!mul(eew, octuple_lmul), vti.Log2SEW); - if !and(!ge(octuple_emul, 1), !le(octuple_emul, 64)) then { - defvar emulMX = octuple_to_str<octuple_emul>.ret; - defvar offsetVti = !cast<VTypeInfo>("VI" # eew # emulMX); - defvar inst_ei = inst # "EI" # eew; - defm : VPatAMOWD<intrinsic, inst_ei, - vti.Vector, offsetVti.Vector, - vti.Mask, vti.Log2SEW, vti.LMul, offsetVti.LMul, 
offsetVti.RegClass>; - } - } - } - } -} - //===----------------------------------------------------------------------===// // Pseudo instructions //===----------------------------------------------------------------------===// @@ -3531,11 +3779,13 @@ def PseudoVSETIVLI : Pseudo<(outs GPR:$rd), (ins uimm5:$rs1, VTypeIOp:$vtypei), //===----------------------------------------------------------------------===// // Pseudos Unit-Stride Loads and Stores -defm PseudoVL : VPseudoUSLoad</*isFF=*/false>; +defm PseudoVL : VPseudoUSLoad; defm PseudoVS : VPseudoUSStore; -defm PseudoVLM : VPseudoLoadMask; -defm PseudoVSM : VPseudoStoreMask; +defm PseudoVLM : VPseudoLoadMask, + Sched<[WriteVLDM, ReadVLDX]>; +defm PseudoVSM : VPseudoStoreMask, + Sched<[WriteVSTM, ReadVSTX]>; //===----------------------------------------------------------------------===// // 7.5 Vector Strided Instructions @@ -3561,7 +3811,7 @@ defm PseudoVSUX : VPseudoIStore</*Ordered=*/false>; // vleff may update VL register let hasSideEffects = 1, Defs = [VL] in -defm PseudoVL : VPseudoUSLoad</*isFF=*/true>; +defm PseudoVL : VPseudoFFLoad; //===----------------------------------------------------------------------===// // 7.8. Vector Load/Store Segment Instructions @@ -3580,28 +3830,15 @@ let hasSideEffects = 1, Defs = [VL] in defm PseudoVLSEG : VPseudoUSSegLoad</*isFF=*/true>; //===----------------------------------------------------------------------===// -// 8. Vector AMO Operations -//===----------------------------------------------------------------------===// -defm PseudoVAMOSWAP : VPseudoAMO; -defm PseudoVAMOADD : VPseudoAMO; -defm PseudoVAMOXOR : VPseudoAMO; -defm PseudoVAMOAND : VPseudoAMO; -defm PseudoVAMOOR : VPseudoAMO; -defm PseudoVAMOMIN : VPseudoAMO; -defm PseudoVAMOMAX : VPseudoAMO; -defm PseudoVAMOMINU : VPseudoAMO; -defm PseudoVAMOMAXU : VPseudoAMO; - -//===----------------------------------------------------------------------===// // 12. 
Vector Integer Arithmetic Instructions //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// // 12.1. Vector Single-Width Integer Add and Subtract //===----------------------------------------------------------------------===// -defm PseudoVADD : VPseudoBinaryV_VV_VX_VI; -defm PseudoVSUB : VPseudoBinaryV_VV_VX; -defm PseudoVRSUB : VPseudoBinaryV_VX_VI; +defm PseudoVADD : VPseudoVALU_VV_VX_VI; +defm PseudoVSUB : VPseudoVALU_VV_VX; +defm PseudoVRSUB : VPseudoVALU_VX_VI; foreach vti = AllIntegerVectors in { // Match vrsub with 2 vector operands to vsub.vv by swapping operands. This @@ -3657,166 +3894,166 @@ foreach vti = AllIntegerVectors in { //===----------------------------------------------------------------------===// // 12.2. Vector Widening Integer Add/Subtract //===----------------------------------------------------------------------===// -defm PseudoVWADDU : VPseudoBinaryW_VV_VX; -defm PseudoVWSUBU : VPseudoBinaryW_VV_VX; -defm PseudoVWADD : VPseudoBinaryW_VV_VX; -defm PseudoVWSUB : VPseudoBinaryW_VV_VX; -defm PseudoVWADDU : VPseudoBinaryW_WV_WX; -defm PseudoVWSUBU : VPseudoBinaryW_WV_WX; -defm PseudoVWADD : VPseudoBinaryW_WV_WX; -defm PseudoVWSUB : VPseudoBinaryW_WV_WX; +defm PseudoVWADDU : VPseudoVWALU_VV_VX; +defm PseudoVWSUBU : VPseudoVWALU_VV_VX; +defm PseudoVWADD : VPseudoVWALU_VV_VX; +defm PseudoVWSUB : VPseudoVWALU_VV_VX; +defm PseudoVWADDU : VPseudoVWALU_WV_WX; +defm PseudoVWSUBU : VPseudoVWALU_WV_WX; +defm PseudoVWADD : VPseudoVWALU_WV_WX; +defm PseudoVWSUB : VPseudoVWALU_WV_WX; //===----------------------------------------------------------------------===// // 12.3. 
Vector Integer Extension //===----------------------------------------------------------------------===// -defm PseudoVZEXT_VF2 : PseudoUnaryV_VF2; -defm PseudoVZEXT_VF4 : PseudoUnaryV_VF4; -defm PseudoVZEXT_VF8 : PseudoUnaryV_VF8; -defm PseudoVSEXT_VF2 : PseudoUnaryV_VF2; -defm PseudoVSEXT_VF4 : PseudoUnaryV_VF4; -defm PseudoVSEXT_VF8 : PseudoUnaryV_VF8; +defm PseudoVZEXT_VF2 : PseudoVEXT_VF2; +defm PseudoVZEXT_VF4 : PseudoVEXT_VF4; +defm PseudoVZEXT_VF8 : PseudoVEXT_VF8; +defm PseudoVSEXT_VF2 : PseudoVEXT_VF2; +defm PseudoVSEXT_VF4 : PseudoVEXT_VF4; +defm PseudoVSEXT_VF8 : PseudoVEXT_VF8; //===----------------------------------------------------------------------===// // 12.4. Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions //===----------------------------------------------------------------------===// -defm PseudoVADC : VPseudoBinaryV_VM_XM_IM; -defm PseudoVMADC : VPseudoBinaryM_VM_XM_IM<"@earlyclobber $rd">; -defm PseudoVMADC : VPseudoBinaryM_V_X_I<"@earlyclobber $rd">; +defm PseudoVADC : VPseudoVCALU_VM_XM_IM; +defm PseudoVMADC : VPseudoVCALUM_VM_XM_IM<"@earlyclobber $rd">; +defm PseudoVMADC : VPseudoVCALUM_V_X_I<"@earlyclobber $rd">; -defm PseudoVSBC : VPseudoBinaryV_VM_XM; -defm PseudoVMSBC : VPseudoBinaryM_VM_XM<"@earlyclobber $rd">; -defm PseudoVMSBC : VPseudoBinaryM_V_X<"@earlyclobber $rd">; +defm PseudoVSBC : VPseudoVCALU_VM_XM; +defm PseudoVMSBC : VPseudoVCALUM_VM_XM<"@earlyclobber $rd">; +defm PseudoVMSBC : VPseudoVCALUM_V_X<"@earlyclobber $rd">; //===----------------------------------------------------------------------===// // 12.5. 
Vector Bitwise Logical Instructions //===----------------------------------------------------------------------===// -defm PseudoVAND : VPseudoBinaryV_VV_VX_VI; -defm PseudoVOR : VPseudoBinaryV_VV_VX_VI; -defm PseudoVXOR : VPseudoBinaryV_VV_VX_VI; +defm PseudoVAND : VPseudoVALU_VV_VX_VI; +defm PseudoVOR : VPseudoVALU_VV_VX_VI; +defm PseudoVXOR : VPseudoVALU_VV_VX_VI; //===----------------------------------------------------------------------===// // 12.6. Vector Single-Width Bit Shift Instructions //===----------------------------------------------------------------------===// -defm PseudoVSLL : VPseudoBinaryV_VV_VX_VI<uimm5>; -defm PseudoVSRL : VPseudoBinaryV_VV_VX_VI<uimm5>; -defm PseudoVSRA : VPseudoBinaryV_VV_VX_VI<uimm5>; +defm PseudoVSLL : VPseudoVSHT_VV_VX_VI<uimm5>; +defm PseudoVSRL : VPseudoVSHT_VV_VX_VI<uimm5>; +defm PseudoVSRA : VPseudoVSHT_VV_VX_VI<uimm5>; //===----------------------------------------------------------------------===// // 12.7. Vector Narrowing Integer Right Shift Instructions //===----------------------------------------------------------------------===// -defm PseudoVNSRL : VPseudoBinaryV_WV_WX_WI; -defm PseudoVNSRA : VPseudoBinaryV_WV_WX_WI; +defm PseudoVNSRL : VPseudoVNSHT_WV_WX_WI; +defm PseudoVNSRA : VPseudoVNSHT_WV_WX_WI; //===----------------------------------------------------------------------===// // 12.8. 
Vector Integer Comparison Instructions //===----------------------------------------------------------------------===// -defm PseudoVMSEQ : VPseudoBinaryM_VV_VX_VI; -defm PseudoVMSNE : VPseudoBinaryM_VV_VX_VI; -defm PseudoVMSLTU : VPseudoBinaryM_VV_VX; -defm PseudoVMSLT : VPseudoBinaryM_VV_VX; -defm PseudoVMSLEU : VPseudoBinaryM_VV_VX_VI; -defm PseudoVMSLE : VPseudoBinaryM_VV_VX_VI; -defm PseudoVMSGTU : VPseudoBinaryM_VX_VI; -defm PseudoVMSGT : VPseudoBinaryM_VX_VI; +defm PseudoVMSEQ : VPseudoVCMPM_VV_VX_VI; +defm PseudoVMSNE : VPseudoVCMPM_VV_VX_VI; +defm PseudoVMSLTU : VPseudoVCMPM_VV_VX; +defm PseudoVMSLT : VPseudoVCMPM_VV_VX; +defm PseudoVMSLEU : VPseudoVCMPM_VV_VX_VI; +defm PseudoVMSLE : VPseudoVCMPM_VV_VX_VI; +defm PseudoVMSGTU : VPseudoVCMPM_VX_VI; +defm PseudoVMSGT : VPseudoVCMPM_VX_VI; //===----------------------------------------------------------------------===// // 12.9. Vector Integer Min/Max Instructions //===----------------------------------------------------------------------===// -defm PseudoVMINU : VPseudoBinaryV_VV_VX; -defm PseudoVMIN : VPseudoBinaryV_VV_VX; -defm PseudoVMAXU : VPseudoBinaryV_VV_VX; -defm PseudoVMAX : VPseudoBinaryV_VV_VX; +defm PseudoVMINU : VPseudoVMINMAX_VV_VX; +defm PseudoVMIN : VPseudoVMINMAX_VV_VX; +defm PseudoVMAXU : VPseudoVMINMAX_VV_VX; +defm PseudoVMAX : VPseudoVMINMAX_VV_VX; //===----------------------------------------------------------------------===// // 12.10. Vector Single-Width Integer Multiply Instructions //===----------------------------------------------------------------------===// -defm PseudoVMUL : VPseudoBinaryV_VV_VX; -defm PseudoVMULH : VPseudoBinaryV_VV_VX; -defm PseudoVMULHU : VPseudoBinaryV_VV_VX; -defm PseudoVMULHSU : VPseudoBinaryV_VV_VX; +defm PseudoVMUL : VPseudoVMUL_VV_VX; +defm PseudoVMULH : VPseudoVMUL_VV_VX; +defm PseudoVMULHU : VPseudoVMUL_VV_VX; +defm PseudoVMULHSU : VPseudoVMUL_VV_VX; //===----------------------------------------------------------------------===// // 12.11. 
Vector Integer Divide Instructions //===----------------------------------------------------------------------===// -defm PseudoVDIVU : VPseudoBinaryV_VV_VX; -defm PseudoVDIV : VPseudoBinaryV_VV_VX; -defm PseudoVREMU : VPseudoBinaryV_VV_VX; -defm PseudoVREM : VPseudoBinaryV_VV_VX; +defm PseudoVDIVU : VPseudoVDIV_VV_VX; +defm PseudoVDIV : VPseudoVDIV_VV_VX; +defm PseudoVREMU : VPseudoVDIV_VV_VX; +defm PseudoVREM : VPseudoVDIV_VV_VX; //===----------------------------------------------------------------------===// // 12.12. Vector Widening Integer Multiply Instructions //===----------------------------------------------------------------------===// -defm PseudoVWMUL : VPseudoBinaryW_VV_VX; -defm PseudoVWMULU : VPseudoBinaryW_VV_VX; -defm PseudoVWMULSU : VPseudoBinaryW_VV_VX; +defm PseudoVWMUL : VPseudoVWMUL_VV_VX; +defm PseudoVWMULU : VPseudoVWMUL_VV_VX; +defm PseudoVWMULSU : VPseudoVWMUL_VV_VX; //===----------------------------------------------------------------------===// // 12.13. Vector Single-Width Integer Multiply-Add Instructions //===----------------------------------------------------------------------===// -defm PseudoVMACC : VPseudoTernaryV_VV_VX_AAXA; -defm PseudoVNMSAC : VPseudoTernaryV_VV_VX_AAXA; -defm PseudoVMADD : VPseudoTernaryV_VV_VX_AAXA; -defm PseudoVNMSUB : VPseudoTernaryV_VV_VX_AAXA; +defm PseudoVMACC : VPseudoVMAC_VV_VX_AAXA; +defm PseudoVNMSAC : VPseudoVMAC_VV_VX_AAXA; +defm PseudoVMADD : VPseudoVMAC_VV_VX_AAXA; +defm PseudoVNMSUB : VPseudoVMAC_VV_VX_AAXA; //===----------------------------------------------------------------------===// // 12.14. 
Vector Widening Integer Multiply-Add Instructions //===----------------------------------------------------------------------===// -defm PseudoVWMACCU : VPseudoTernaryW_VV_VX; -defm PseudoVWMACC : VPseudoTernaryW_VV_VX; -defm PseudoVWMACCSU : VPseudoTernaryW_VV_VX; -defm PseudoVWMACCUS : VPseudoTernaryW_VX; +defm PseudoVWMACCU : VPseudoVWMAC_VV_VX; +defm PseudoVWMACC : VPseudoVWMAC_VV_VX; +defm PseudoVWMACCSU : VPseudoVWMAC_VV_VX; +defm PseudoVWMACCUS : VPseudoVWMAC_VX; //===----------------------------------------------------------------------===// // 12.15. Vector Integer Merge Instructions //===----------------------------------------------------------------------===// -defm PseudoVMERGE : VPseudoBinaryV_VM_XM_IM; +defm PseudoVMERGE : VPseudoVMRG_VM_XM_IM; //===----------------------------------------------------------------------===// // 12.16. Vector Integer Move Instructions //===----------------------------------------------------------------------===// -defm PseudoVMV_V : VPseudoUnaryV_V_X_I_NoDummyMask; +defm PseudoVMV_V : VPseudoUnaryVMV_V_X_I; //===----------------------------------------------------------------------===// // 13.1. Vector Single-Width Saturating Add and Subtract //===----------------------------------------------------------------------===// let Defs = [VXSAT], hasSideEffects = 1 in { - defm PseudoVSADDU : VPseudoBinaryV_VV_VX_VI; - defm PseudoVSADD : VPseudoBinaryV_VV_VX_VI; - defm PseudoVSSUBU : VPseudoBinaryV_VV_VX; - defm PseudoVSSUB : VPseudoBinaryV_VV_VX; + defm PseudoVSADDU : VPseudoVSALU_VV_VX_VI; + defm PseudoVSADD : VPseudoVSALU_VV_VX_VI; + defm PseudoVSSUBU : VPseudoVSALU_VV_VX; + defm PseudoVSSUB : VPseudoVSALU_VV_VX; } //===----------------------------------------------------------------------===// // 13.2. 
Vector Single-Width Averaging Add and Subtract //===----------------------------------------------------------------------===// let Uses = [VXRM], hasSideEffects = 1 in { - defm PseudoVAADDU : VPseudoBinaryV_VV_VX; - defm PseudoVAADD : VPseudoBinaryV_VV_VX; - defm PseudoVASUBU : VPseudoBinaryV_VV_VX; - defm PseudoVASUB : VPseudoBinaryV_VV_VX; + defm PseudoVAADDU : VPseudoVAALU_VV_VX; + defm PseudoVAADD : VPseudoVAALU_VV_VX; + defm PseudoVASUBU : VPseudoVAALU_VV_VX; + defm PseudoVASUB : VPseudoVAALU_VV_VX; } //===----------------------------------------------------------------------===// // 13.3. Vector Single-Width Fractional Multiply with Rounding and Saturation //===----------------------------------------------------------------------===// let Uses = [VXRM], Defs = [VXSAT], hasSideEffects = 1 in { - defm PseudoVSMUL : VPseudoBinaryV_VV_VX; + defm PseudoVSMUL : VPseudoVSMUL_VV_VX; } //===----------------------------------------------------------------------===// // 13.4. Vector Single-Width Scaling Shift Instructions //===----------------------------------------------------------------------===// let Uses = [VXRM], hasSideEffects = 1 in { - defm PseudoVSSRL : VPseudoBinaryV_VV_VX_VI<uimm5>; - defm PseudoVSSRA : VPseudoBinaryV_VV_VX_VI<uimm5>; + defm PseudoVSSRL : VPseudoVSSHT_VV_VX_VI<uimm5>; + defm PseudoVSSRA : VPseudoVSSHT_VV_VX_VI<uimm5>; } //===----------------------------------------------------------------------===// // 13.5. 
Vector Narrowing Fixed-Point Clip Instructions //===----------------------------------------------------------------------===// let Uses = [VXRM], Defs = [VXSAT], hasSideEffects = 1 in { - defm PseudoVNCLIP : VPseudoBinaryV_WV_WX_WI; - defm PseudoVNCLIPU : VPseudoBinaryV_WV_WX_WI; + defm PseudoVNCLIP : VPseudoVNCLP_WV_WX_WI; + defm PseudoVNCLIPU : VPseudoVNCLP_WV_WX_WI; } } // Predicates = [HasVInstructions] @@ -3825,156 +4062,156 @@ let Predicates = [HasVInstructionsAnyF] in { //===----------------------------------------------------------------------===// // 14.2. Vector Single-Width Floating-Point Add/Subtract Instructions //===----------------------------------------------------------------------===// -defm PseudoVFADD : VPseudoBinaryV_VV_VF; -defm PseudoVFSUB : VPseudoBinaryV_VV_VF; -defm PseudoVFRSUB : VPseudoBinaryV_VF; +defm PseudoVFADD : VPseudoVALU_VV_VF; +defm PseudoVFSUB : VPseudoVALU_VV_VF; +defm PseudoVFRSUB : VPseudoVALU_VF; //===----------------------------------------------------------------------===// // 14.3. Vector Widening Floating-Point Add/Subtract Instructions //===----------------------------------------------------------------------===// -defm PseudoVFWADD : VPseudoBinaryW_VV_VF; -defm PseudoVFWSUB : VPseudoBinaryW_VV_VF; -defm PseudoVFWADD : VPseudoBinaryW_WV_WF; -defm PseudoVFWSUB : VPseudoBinaryW_WV_WF; +defm PseudoVFWADD : VPseudoVFWALU_VV_VF; +defm PseudoVFWSUB : VPseudoVFWALU_VV_VF; +defm PseudoVFWADD : VPseudoVFWALU_WV_WF; +defm PseudoVFWSUB : VPseudoVFWALU_WV_WF; //===----------------------------------------------------------------------===// // 14.4. 
Vector Single-Width Floating-Point Multiply/Divide Instructions //===----------------------------------------------------------------------===// -defm PseudoVFMUL : VPseudoBinaryV_VV_VF; -defm PseudoVFDIV : VPseudoBinaryV_VV_VF; -defm PseudoVFRDIV : VPseudoBinaryV_VF; +defm PseudoVFMUL : VPseudoVFMUL_VV_VF; +defm PseudoVFDIV : VPseudoVFDIV_VV_VF; +defm PseudoVFRDIV : VPseudoVFRDIV_VF; //===----------------------------------------------------------------------===// // 14.5. Vector Widening Floating-Point Multiply //===----------------------------------------------------------------------===// -defm PseudoVFWMUL : VPseudoBinaryW_VV_VF; +defm PseudoVFWMUL : VPseudoVWMUL_VV_VF; //===----------------------------------------------------------------------===// // 14.6. Vector Single-Width Floating-Point Fused Multiply-Add Instructions //===----------------------------------------------------------------------===// -defm PseudoVFMACC : VPseudoTernaryV_VV_VF_AAXA; -defm PseudoVFNMACC : VPseudoTernaryV_VV_VF_AAXA; -defm PseudoVFMSAC : VPseudoTernaryV_VV_VF_AAXA; -defm PseudoVFNMSAC : VPseudoTernaryV_VV_VF_AAXA; -defm PseudoVFMADD : VPseudoTernaryV_VV_VF_AAXA; -defm PseudoVFNMADD : VPseudoTernaryV_VV_VF_AAXA; -defm PseudoVFMSUB : VPseudoTernaryV_VV_VF_AAXA; -defm PseudoVFNMSUB : VPseudoTernaryV_VV_VF_AAXA; +defm PseudoVFMACC : VPseudoVMAC_VV_VF_AAXA; +defm PseudoVFNMACC : VPseudoVMAC_VV_VF_AAXA; +defm PseudoVFMSAC : VPseudoVMAC_VV_VF_AAXA; +defm PseudoVFNMSAC : VPseudoVMAC_VV_VF_AAXA; +defm PseudoVFMADD : VPseudoVMAC_VV_VF_AAXA; +defm PseudoVFNMADD : VPseudoVMAC_VV_VF_AAXA; +defm PseudoVFMSUB : VPseudoVMAC_VV_VF_AAXA; +defm PseudoVFNMSUB : VPseudoVMAC_VV_VF_AAXA; //===----------------------------------------------------------------------===// // 14.7. 
Vector Widening Floating-Point Fused Multiply-Add Instructions //===----------------------------------------------------------------------===// -defm PseudoVFWMACC : VPseudoTernaryW_VV_VF; -defm PseudoVFWNMACC : VPseudoTernaryW_VV_VF; -defm PseudoVFWMSAC : VPseudoTernaryW_VV_VF; -defm PseudoVFWNMSAC : VPseudoTernaryW_VV_VF; +defm PseudoVFWMACC : VPseudoVWMAC_VV_VF; +defm PseudoVFWNMACC : VPseudoVWMAC_VV_VF; +defm PseudoVFWMSAC : VPseudoVWMAC_VV_VF; +defm PseudoVFWNMSAC : VPseudoVWMAC_VV_VF; //===----------------------------------------------------------------------===// // 14.8. Vector Floating-Point Square-Root Instruction //===----------------------------------------------------------------------===// -defm PseudoVFSQRT : VPseudoUnaryTAV_V; +defm PseudoVFSQRT : VPseudoVSQR_V; //===----------------------------------------------------------------------===// // 14.9. Vector Floating-Point Reciprocal Square-Root Estimate Instruction //===----------------------------------------------------------------------===// -defm PseudoVFRSQRT7 : VPseudoUnaryTAV_V; +defm PseudoVFRSQRT7 : VPseudoVRCP_V; //===----------------------------------------------------------------------===// // 14.10. Vector Floating-Point Reciprocal Estimate Instruction //===----------------------------------------------------------------------===// -defm PseudoVFREC7 : VPseudoUnaryTAV_V; +defm PseudoVFREC7 : VPseudoVRCP_V; //===----------------------------------------------------------------------===// // 14.11. Vector Floating-Point Min/Max Instructions //===----------------------------------------------------------------------===// -defm PseudoVFMIN : VPseudoBinaryV_VV_VF; -defm PseudoVFMAX : VPseudoBinaryV_VV_VF; +defm PseudoVFMIN : VPseudoVMAX_VV_VF; +defm PseudoVFMAX : VPseudoVMAX_VV_VF; //===----------------------------------------------------------------------===// // 14.12. 
Vector Floating-Point Sign-Injection Instructions //===----------------------------------------------------------------------===// -defm PseudoVFSGNJ : VPseudoBinaryV_VV_VF; -defm PseudoVFSGNJN : VPseudoBinaryV_VV_VF; -defm PseudoVFSGNJX : VPseudoBinaryV_VV_VF; +defm PseudoVFSGNJ : VPseudoVSGNJ_VV_VF; +defm PseudoVFSGNJN : VPseudoVSGNJ_VV_VF; +defm PseudoVFSGNJX : VPseudoVSGNJ_VV_VF; //===----------------------------------------------------------------------===// // 14.13. Vector Floating-Point Compare Instructions //===----------------------------------------------------------------------===// -defm PseudoVMFEQ : VPseudoBinaryM_VV_VF; -defm PseudoVMFNE : VPseudoBinaryM_VV_VF; -defm PseudoVMFLT : VPseudoBinaryM_VV_VF; -defm PseudoVMFLE : VPseudoBinaryM_VV_VF; -defm PseudoVMFGT : VPseudoBinaryM_VF; -defm PseudoVMFGE : VPseudoBinaryM_VF; +defm PseudoVMFEQ : VPseudoVCMPM_VV_VF; +defm PseudoVMFNE : VPseudoVCMPM_VV_VF; +defm PseudoVMFLT : VPseudoVCMPM_VV_VF; +defm PseudoVMFLE : VPseudoVCMPM_VV_VF; +defm PseudoVMFGT : VPseudoVCMPM_VF; +defm PseudoVMFGE : VPseudoVCMPM_VF; //===----------------------------------------------------------------------===// // 14.14. Vector Floating-Point Classify Instruction //===----------------------------------------------------------------------===// -defm PseudoVFCLASS : VPseudoUnaryV_V; +defm PseudoVFCLASS : VPseudoVCLS_V; //===----------------------------------------------------------------------===// // 14.15. Vector Floating-Point Merge Instruction //===----------------------------------------------------------------------===// -defm PseudoVFMERGE : VPseudoBinaryV_FM; +defm PseudoVFMERGE : VPseudoVMRG_FM; //===----------------------------------------------------------------------===// // 14.16. 
Vector Floating-Point Move Instruction //===----------------------------------------------------------------------===// -defm PseudoVFMV_V : VPseudoUnaryV_F_NoDummyMask; +defm PseudoVFMV_V : VPseudoVMV_F; //===----------------------------------------------------------------------===// // 14.17. Single-Width Floating-Point/Integer Type-Convert Instructions //===----------------------------------------------------------------------===// -defm PseudoVFCVT_XU_F : VPseudoConversionV_V; -defm PseudoVFCVT_X_F : VPseudoConversionV_V; -defm PseudoVFCVT_RTZ_XU_F : VPseudoConversionV_V; -defm PseudoVFCVT_RTZ_X_F : VPseudoConversionV_V; -defm PseudoVFCVT_F_XU : VPseudoConversionV_V; -defm PseudoVFCVT_F_X : VPseudoConversionV_V; +defm PseudoVFCVT_XU_F : VPseudoVCVTI_V; +defm PseudoVFCVT_X_F : VPseudoVCVTI_V; +defm PseudoVFCVT_RTZ_XU_F : VPseudoVCVTI_V; +defm PseudoVFCVT_RTZ_X_F : VPseudoVCVTI_V; +defm PseudoVFCVT_F_XU : VPseudoVCVTF_V; +defm PseudoVFCVT_F_X : VPseudoVCVTF_V; //===----------------------------------------------------------------------===// // 14.18. Widening Floating-Point/Integer Type-Convert Instructions //===----------------------------------------------------------------------===// -defm PseudoVFWCVT_XU_F : VPseudoConversionW_V; -defm PseudoVFWCVT_X_F : VPseudoConversionW_V; -defm PseudoVFWCVT_RTZ_XU_F : VPseudoConversionW_V; -defm PseudoVFWCVT_RTZ_X_F : VPseudoConversionW_V; -defm PseudoVFWCVT_F_XU : VPseudoConversionW_V; -defm PseudoVFWCVT_F_X : VPseudoConversionW_V; -defm PseudoVFWCVT_F_F : VPseudoConversionW_V; +defm PseudoVFWCVT_XU_F : VPseudoVWCVTI_V; +defm PseudoVFWCVT_X_F : VPseudoVWCVTI_V; +defm PseudoVFWCVT_RTZ_XU_F : VPseudoVWCVTI_V; +defm PseudoVFWCVT_RTZ_X_F : VPseudoVWCVTI_V; +defm PseudoVFWCVT_F_XU : VPseudoVWCVTF_V; +defm PseudoVFWCVT_F_X : VPseudoVWCVTF_V; +defm PseudoVFWCVT_F_F : VPseudoVWCVTD_V; //===----------------------------------------------------------------------===// // 14.19. 
Narrowing Floating-Point/Integer Type-Convert Instructions //===----------------------------------------------------------------------===// -defm PseudoVFNCVT_XU_F : VPseudoConversionV_W; -defm PseudoVFNCVT_X_F : VPseudoConversionV_W; -defm PseudoVFNCVT_RTZ_XU_F : VPseudoConversionV_W; -defm PseudoVFNCVT_RTZ_X_F : VPseudoConversionV_W; -defm PseudoVFNCVT_F_XU : VPseudoConversionV_W; -defm PseudoVFNCVT_F_X : VPseudoConversionV_W; -defm PseudoVFNCVT_F_F : VPseudoConversionV_W; -defm PseudoVFNCVT_ROD_F_F : VPseudoConversionV_W; +defm PseudoVFNCVT_XU_F : VPseudoVNCVTI_W; +defm PseudoVFNCVT_X_F : VPseudoVNCVTI_W; +defm PseudoVFNCVT_RTZ_XU_F : VPseudoVNCVTI_W; +defm PseudoVFNCVT_RTZ_X_F : VPseudoVNCVTI_W; +defm PseudoVFNCVT_F_XU : VPseudoVNCVTF_W; +defm PseudoVFNCVT_F_X : VPseudoVNCVTF_W; +defm PseudoVFNCVT_F_F : VPseudoVNCVTD_W; +defm PseudoVFNCVT_ROD_F_F : VPseudoVNCVTD_W; } // Predicates = [HasVInstructionsAnyF] let Predicates = [HasVInstructions] in { //===----------------------------------------------------------------------===// // 15.1. Vector Single-Width Integer Reduction Instructions //===----------------------------------------------------------------------===// -defm PseudoVREDSUM : VPseudoReductionV_VS; -defm PseudoVREDAND : VPseudoReductionV_VS; -defm PseudoVREDOR : VPseudoReductionV_VS; -defm PseudoVREDXOR : VPseudoReductionV_VS; -defm PseudoVREDMINU : VPseudoReductionV_VS; -defm PseudoVREDMIN : VPseudoReductionV_VS; -defm PseudoVREDMAXU : VPseudoReductionV_VS; -defm PseudoVREDMAX : VPseudoReductionV_VS; +defm PseudoVREDSUM : VPseudoVRED_VS; +defm PseudoVREDAND : VPseudoVRED_VS; +defm PseudoVREDOR : VPseudoVRED_VS; +defm PseudoVREDXOR : VPseudoVRED_VS; +defm PseudoVREDMINU : VPseudoVRED_VS; +defm PseudoVREDMIN : VPseudoVRED_VS; +defm PseudoVREDMAXU : VPseudoVRED_VS; +defm PseudoVREDMAX : VPseudoVRED_VS; //===----------------------------------------------------------------------===// // 15.2. 
Vector Widening Integer Reduction Instructions //===----------------------------------------------------------------------===// let IsRVVWideningReduction = 1 in { -defm PseudoVWREDSUMU : VPseudoReductionV_VS; -defm PseudoVWREDSUM : VPseudoReductionV_VS; +defm PseudoVWREDSUMU : VPseudoVWRED_VS; +defm PseudoVWREDSUM : VPseudoVWRED_VS; } } // Predicates = [HasVInstructions] @@ -3982,17 +4219,17 @@ let Predicates = [HasVInstructionsAnyF] in { //===----------------------------------------------------------------------===// // 15.3. Vector Single-Width Floating-Point Reduction Instructions //===----------------------------------------------------------------------===// -defm PseudoVFREDOSUM : VPseudoReductionV_VS; -defm PseudoVFREDUSUM : VPseudoReductionV_VS; -defm PseudoVFREDMIN : VPseudoReductionV_VS; -defm PseudoVFREDMAX : VPseudoReductionV_VS; +defm PseudoVFREDOSUM : VPseudoVFREDO_VS; +defm PseudoVFREDUSUM : VPseudoVFRED_VS; +defm PseudoVFREDMIN : VPseudoVFRED_VS; +defm PseudoVFREDMAX : VPseudoVFRED_VS; //===----------------------------------------------------------------------===// // 15.4. 
Vector Widening Floating-Point Reduction Instructions //===----------------------------------------------------------------------===// let IsRVVWideningReduction = 1 in { -defm PseudoVFWREDUSUM : VPseudoReductionV_VS; -defm PseudoVFWREDOSUM : VPseudoReductionV_VS; +defm PseudoVFWREDUSUM : VPseudoVFWRED_VS; +defm PseudoVFWREDOSUM : VPseudoVFWRED_VS; } } // Predicates = [HasVInstructionsAnyF] @@ -4005,55 +4242,57 @@ defm PseudoVFWREDOSUM : VPseudoReductionV_VS; // 16.1 Vector Mask-Register Logical Instructions //===----------------------------------------------------------------------===// -defm PseudoVMAND: VPseudoBinaryM_MM; -defm PseudoVMNAND: VPseudoBinaryM_MM; -defm PseudoVMANDN: VPseudoBinaryM_MM; -defm PseudoVMXOR: VPseudoBinaryM_MM; -defm PseudoVMOR: VPseudoBinaryM_MM; -defm PseudoVMNOR: VPseudoBinaryM_MM; -defm PseudoVMORN: VPseudoBinaryM_MM; -defm PseudoVMXNOR: VPseudoBinaryM_MM; +defm PseudoVMAND: VPseudoVALU_MM; +defm PseudoVMNAND: VPseudoVALU_MM; +defm PseudoVMANDN: VPseudoVALU_MM; +defm PseudoVMXOR: VPseudoVALU_MM; +defm PseudoVMOR: VPseudoVALU_MM; +defm PseudoVMNOR: VPseudoVALU_MM; +defm PseudoVMORN: VPseudoVALU_MM; +defm PseudoVMXNOR: VPseudoVALU_MM; // Pseudo instructions -defm PseudoVMCLR : VPseudoNullaryPseudoM<"VMXOR">; -defm PseudoVMSET : VPseudoNullaryPseudoM<"VMXNOR">; +defm PseudoVMCLR : VPseudoNullaryPseudoM<"VMXOR">, + Sched<[WriteVMALUV, ReadVMALUV, ReadVMALUV]>; +defm PseudoVMSET : VPseudoNullaryPseudoM<"VMXNOR">, + Sched<[WriteVMALUV, ReadVMALUV, ReadVMALUV]>; //===----------------------------------------------------------------------===// // 16.2. Vector mask population count vcpop //===----------------------------------------------------------------------===// -defm PseudoVCPOP: VPseudoUnaryS_M; +defm PseudoVCPOP: VPseudoVPOP_M; //===----------------------------------------------------------------------===// // 16.3. 
vfirst find-first-set mask bit //===----------------------------------------------------------------------===// -defm PseudoVFIRST: VPseudoUnaryS_M; +defm PseudoVFIRST: VPseudoV1ST_M; //===----------------------------------------------------------------------===// // 16.4. vmsbf.m set-before-first mask bit //===----------------------------------------------------------------------===// -defm PseudoVMSBF: VPseudoUnaryM_M; +defm PseudoVMSBF: VPseudoVSFS_M; //===----------------------------------------------------------------------===// // 16.5. vmsif.m set-including-first mask bit //===----------------------------------------------------------------------===// -defm PseudoVMSIF: VPseudoUnaryM_M; +defm PseudoVMSIF: VPseudoVSFS_M; //===----------------------------------------------------------------------===// // 16.6. vmsof.m set-only-first mask bit //===----------------------------------------------------------------------===// -defm PseudoVMSOF: VPseudoUnaryM_M; +defm PseudoVMSOF: VPseudoVSFS_M; //===----------------------------------------------------------------------===// // 16.8. Vector Iota Instruction //===----------------------------------------------------------------------===// -defm PseudoVIOTA_M: VPseudoUnaryV_M; +defm PseudoVIOTA_M: VPseudoVIOT_M; //===----------------------------------------------------------------------===// // 16.9. Vector Element Index Instruction //===----------------------------------------------------------------------===// -defm PseudoVID : VPseudoMaskNullaryV; +defm PseudoVID : VPseudoVID_V; //===----------------------------------------------------------------------===// // 17. 
Vector Permutation Instructions @@ -4068,15 +4307,18 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { foreach m = MxList.m in { let VLMul = m.value in { let HasSEWOp = 1, BaseInstr = VMV_X_S in - def PseudoVMV_X_S # "_" # m.MX: Pseudo<(outs GPR:$rd), - (ins m.vrclass:$rs2, ixlenimm:$sew), - []>, RISCVVPseudo; + def PseudoVMV_X_S # "_" # m.MX: + Pseudo<(outs GPR:$rd), (ins m.vrclass:$rs2, ixlenimm:$sew), []>, + Sched<[WriteVIMovVX, ReadVIMovVX]>, + RISCVVPseudo; let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VMV_S_X, Constraints = "$rd = $rs1" in def PseudoVMV_S_X # "_" # m.MX: Pseudo<(outs m.vrclass:$rd), (ins m.vrclass:$rs1, GPR:$rs2, AVL:$vl, ixlenimm:$sew), - []>, RISCVVPseudo; + []>, + Sched<[WriteVIMovXV, ReadVIMovXV, ReadVIMovXX]>, + RISCVVPseudo; } } } @@ -4093,17 +4335,19 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { let VLMul = m.value in { let HasSEWOp = 1, BaseInstr = VFMV_F_S in def "PseudoVFMV_" # f.FX # "_S_" # m.MX : - Pseudo<(outs f.fprclass:$rd), - (ins m.vrclass:$rs2, - ixlenimm:$sew), - []>, RISCVVPseudo; + Pseudo<(outs f.fprclass:$rd), + (ins m.vrclass:$rs2, ixlenimm:$sew), []>, + Sched<[WriteVFMovVF, ReadVFMovVF]>, + RISCVVPseudo; let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VFMV_S_F, Constraints = "$rd = $rs1" in def "PseudoVFMV_S_" # f.FX # "_" # m.MX : Pseudo<(outs m.vrclass:$rd), (ins m.vrclass:$rs1, f.fprclass:$rs2, AVL:$vl, ixlenimm:$sew), - []>, RISCVVPseudo; + []>, + Sched<[WriteVFMovFV, ReadVFMovFV, ReadVFMovFX]>, + RISCVVPseudo; } } } @@ -4114,52 +4358,33 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { // 17.3. 
Vector Slide Instructions //===----------------------------------------------------------------------===// let Predicates = [HasVInstructions] in { - defm PseudoVSLIDEUP : VPseudoTernaryV_VX_VI<uimm5, "@earlyclobber $rd">; - defm PseudoVSLIDEDOWN : VPseudoTernaryV_VX_VI<uimm5>; - defm PseudoVSLIDE1UP : VPseudoBinaryV_VX<"@earlyclobber $rd">; - defm PseudoVSLIDE1DOWN : VPseudoBinaryV_VX; + defm PseudoVSLIDEUP : VPseudoVSLD_VX_VI<uimm5, "@earlyclobber $rd">; + defm PseudoVSLIDEDOWN : VPseudoVSLD_VX_VI<uimm5>; + defm PseudoVSLIDE1UP : VPseudoVSLD1_VX<"@earlyclobber $rd">; + defm PseudoVSLIDE1DOWN : VPseudoVSLD1_VX; } // Predicates = [HasVInstructions] let Predicates = [HasVInstructionsAnyF] in { - defm PseudoVFSLIDE1UP : VPseudoBinaryV_VF<"@earlyclobber $rd">; - defm PseudoVFSLIDE1DOWN : VPseudoBinaryV_VF; + defm PseudoVFSLIDE1UP : VPseudoVSLD1_VF<"@earlyclobber $rd">; + defm PseudoVFSLIDE1DOWN : VPseudoVSLD1_VF; } // Predicates = [HasVInstructionsAnyF] //===----------------------------------------------------------------------===// // 17.4. Vector Register Gather Instructions //===----------------------------------------------------------------------===// -defm PseudoVRGATHER : VPseudoBinaryV_VV_VX_VI<uimm5, "@earlyclobber $rd">; -defm PseudoVRGATHEREI16 : VPseudoBinaryV_VV_EEW</* eew */ 16, "@earlyclobber $rd">; +defm PseudoVRGATHER : VPseudoVGTR_VV_VX_VI<uimm5, "@earlyclobber $rd">; +defm PseudoVRGATHEREI16 : VPseudoVGTR_VV_EEW</* eew */ 16, "@earlyclobber $rd">; //===----------------------------------------------------------------------===// // 17.5. Vector Compress Instruction //===----------------------------------------------------------------------===// -defm PseudoVCOMPRESS : VPseudoUnaryV_V_AnyMask; +defm PseudoVCOMPRESS : VPseudoVCPR_V; //===----------------------------------------------------------------------===// // Patterns. 
//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// -// 8. Vector AMO Operations -//===----------------------------------------------------------------------===// -let Predicates = [HasStdExtZvamo] in { - defm : VPatAMOV_WD<"int_riscv_vamoswap", "PseudoVAMOSWAP", AllIntegerVectors>; - defm : VPatAMOV_WD<"int_riscv_vamoadd", "PseudoVAMOADD", AllIntegerVectors>; - defm : VPatAMOV_WD<"int_riscv_vamoxor", "PseudoVAMOXOR", AllIntegerVectors>; - defm : VPatAMOV_WD<"int_riscv_vamoand", "PseudoVAMOAND", AllIntegerVectors>; - defm : VPatAMOV_WD<"int_riscv_vamoor", "PseudoVAMOOR", AllIntegerVectors>; - defm : VPatAMOV_WD<"int_riscv_vamomin", "PseudoVAMOMIN", AllIntegerVectors>; - defm : VPatAMOV_WD<"int_riscv_vamomax", "PseudoVAMOMAX", AllIntegerVectors>; - defm : VPatAMOV_WD<"int_riscv_vamominu", "PseudoVAMOMINU", AllIntegerVectors>; - defm : VPatAMOV_WD<"int_riscv_vamomaxu", "PseudoVAMOMAXU", AllIntegerVectors>; -} // Predicates = [HasStdExtZvamo] - -let Predicates = [HasStdExtZvamo, HasVInstructionsAnyF] in { - defm : VPatAMOV_WD<"int_riscv_vamoswap", "PseudoVAMOSWAP", AllFloatVectors>; -} // Predicates = [HasStdExtZvamo, HasVInstructionsAnyF] - -//===----------------------------------------------------------------------===// // 12. 
Vector Integer Arithmetic Instructions //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td index 461bdd348934..7eb8ae7d4193 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td @@ -382,50 +382,50 @@ def FSRI : RVBTernaryImm6<0b101, OPC_OP_IMM, "fsri", } // Predicates = [HasStdExtZbt] let Predicates = [HasStdExtZbb] in { -def CLZ : RVBUnary<0b0110000, 0b00000, 0b001, RISCVOpcode<0b0010011>, "clz">, +def CLZ : RVBUnary<0b0110000, 0b00000, 0b001, OPC_OP_IMM, "clz">, Sched<[WriteCLZ, ReadCLZ]>; -def CTZ : RVBUnary<0b0110000, 0b00001, 0b001, RISCVOpcode<0b0010011>, "ctz">, +def CTZ : RVBUnary<0b0110000, 0b00001, 0b001, OPC_OP_IMM, "ctz">, Sched<[WriteCTZ, ReadCTZ]>; -def CPOP : RVBUnary<0b0110000, 0b00010, 0b001, RISCVOpcode<0b0010011>, "cpop">, +def CPOP : RVBUnary<0b0110000, 0b00010, 0b001, OPC_OP_IMM, "cpop">, Sched<[WriteCPOP, ReadCPOP]>; } // Predicates = [HasStdExtZbb] let Predicates = [HasStdExtZbm, IsRV64] in -def BMATFLIP : RVBUnary<0b0110000, 0b00011, 0b001, RISCVOpcode<0b0010011>, - "bmatflip">, Sched<[]>; +def BMATFLIP : RVBUnary<0b0110000, 0b00011, 0b001, OPC_OP_IMM, "bmatflip">, + Sched<[]>; let Predicates = [HasStdExtZbb] in { -def SEXTB : RVBUnary<0b0110000, 0b00100, 0b001, RISCVOpcode<0b0010011>, - "sext.b">, Sched<[WriteIALU, ReadIALU]>; -def SEXTH : RVBUnary<0b0110000, 0b00101, 0b001, RISCVOpcode<0b0010011>, - "sext.h">, Sched<[WriteIALU, ReadIALU]>; +def SEXTB : RVBUnary<0b0110000, 0b00100, 0b001, OPC_OP_IMM, "sext.b">, + Sched<[WriteIALU, ReadIALU]>; +def SEXTH : RVBUnary<0b0110000, 0b00101, 0b001, OPC_OP_IMM, "sext.h">, + Sched<[WriteIALU, ReadIALU]>; } // Predicates = [HasStdExtZbb] let Predicates = [HasStdExtZbr] in { -def CRC32B : RVBUnary<0b0110000, 0b10000, 0b001, RISCVOpcode<0b0010011>, - "crc32.b">, Sched<[]>; -def CRC32H : RVBUnary<0b0110000, 0b10001, 0b001, 
RISCVOpcode<0b0010011>, - "crc32.h">, Sched<[]>; -def CRC32W : RVBUnary<0b0110000, 0b10010, 0b001, RISCVOpcode<0b0010011>, - "crc32.w">, Sched<[]>; +def CRC32B : RVBUnary<0b0110000, 0b10000, 0b001, OPC_OP_IMM, "crc32.b">, + Sched<[]>; +def CRC32H : RVBUnary<0b0110000, 0b10001, 0b001, OPC_OP_IMM, "crc32.h">, + Sched<[]>; +def CRC32W : RVBUnary<0b0110000, 0b10010, 0b001, OPC_OP_IMM, "crc32.w">, + Sched<[]>; } // Predicates = [HasStdExtZbr] let Predicates = [HasStdExtZbr, IsRV64] in -def CRC32D : RVBUnary<0b0110000, 0b10011, 0b001, RISCVOpcode<0b0010011>, - "crc32.d">, Sched<[]>; +def CRC32D : RVBUnary<0b0110000, 0b10011, 0b001, OPC_OP_IMM, "crc32.d">, + Sched<[]>; let Predicates = [HasStdExtZbr] in { -def CRC32CB : RVBUnary<0b0110000, 0b11000, 0b001, RISCVOpcode<0b0010011>, - "crc32c.b">, Sched<[]>; -def CRC32CH : RVBUnary<0b0110000, 0b11001, 0b001, RISCVOpcode<0b0010011>, - "crc32c.h">, Sched<[]>; -def CRC32CW : RVBUnary<0b0110000, 0b11010, 0b001, RISCVOpcode<0b0010011>, - "crc32c.w">, Sched<[]>; +def CRC32CB : RVBUnary<0b0110000, 0b11000, 0b001, OPC_OP_IMM, "crc32c.b">, + Sched<[]>; +def CRC32CH : RVBUnary<0b0110000, 0b11001, 0b001, OPC_OP_IMM, "crc32c.h">, + Sched<[]>; +def CRC32CW : RVBUnary<0b0110000, 0b11010, 0b001, OPC_OP_IMM, "crc32c.w">, + Sched<[]>; } // Predicates = [HasStdExtZbr] let Predicates = [HasStdExtZbr, IsRV64] in -def CRC32CD : RVBUnary<0b0110000, 0b11011, 0b001, RISCVOpcode<0b0010011>, - "crc32c.d">, Sched<[]>; +def CRC32CD : RVBUnary<0b0110000, 0b11011, 0b001, OPC_OP_IMM, "crc32c.d">, + Sched<[]>; let Predicates = [HasStdExtZbc] in { def CLMUL : ALU_rr<0b0000101, 0b001, "clmul">, Sched<[]>; @@ -523,12 +523,12 @@ def FSRIW : RVBTernaryImm5<0b10, 0b101, OPC_OP_IMM_32, } // Predicates = [HasStdExtZbt, IsRV64] let Predicates = [HasStdExtZbb, IsRV64] in { -def CLZW : RVBUnary<0b0110000, 0b00000, 0b001, RISCVOpcode<0b0011011>, - "clzw">, Sched<[WriteCLZ32, ReadCLZ32]>; -def CTZW : RVBUnary<0b0110000, 0b00001, 0b001, RISCVOpcode<0b0011011>, - "ctzw">, 
Sched<[WriteCTZ32, ReadCTZ32]>; -def CPOPW : RVBUnary<0b0110000, 0b00010, 0b001, RISCVOpcode<0b0011011>, - "cpopw">, Sched<[WriteCPOP32, ReadCPOP32]>; +def CLZW : RVBUnary<0b0110000, 0b00000, 0b001, OPC_OP_IMM_32, "clzw">, + Sched<[WriteCLZ32, ReadCLZ32]>; +def CTZW : RVBUnary<0b0110000, 0b00001, 0b001, OPC_OP_IMM_32, "ctzw">, + Sched<[WriteCTZ32, ReadCTZ32]>; +def CPOPW : RVBUnary<0b0110000, 0b00010, 0b001, OPC_OP_IMM_32, "cpopw">, + Sched<[WriteCPOP32, ReadCPOP32]>; } // Predicates = [HasStdExtZbb, IsRV64] let Predicates = [HasStdExtZbp, IsRV64] in { @@ -791,6 +791,9 @@ def : Pat<(xor GPR:$rs1, BSETINVMask:$mask), def : Pat<(and (srl GPR:$rs1, uimmlog2xlen:$shamt), (XLenVT 1)), (BEXTI GPR:$rs1, uimmlog2xlen:$shamt)>; +def : Pat<(and (not (srl GPR:$rs1, uimmlog2xlen:$shamt)), (XLenVT 1)), + (XORI (BEXTI GPR:$rs1, uimmlog2xlen:$shamt), (XLenVT 1))>; + def : Pat<(or GPR:$r, BSETINVTwoBitsMask:$i), (BSETI (BSETI GPR:$r, (TrailingZerosXForm BSETINVTwoBitsMask:$i)), (BSETINVTwoBitsMaskHigh BSETINVTwoBitsMask:$i))>; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td index a33494461869..663e44813899 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td @@ -28,41 +28,6 @@ def riscv_fmv_x_anyexth : SDNode<"RISCVISD::FMV_X_ANYEXTH", SDT_RISCVFMV_X_ANYEXTH>; //===----------------------------------------------------------------------===// -// Instruction class templates -//===----------------------------------------------------------------------===// - -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class FPFMAH_rrr_frm<RISCVOpcode opcode, string opcodestr> - : RVInstR4Frm<0b10, opcode, (outs FPR16:$rd), - (ins FPR16:$rs1, FPR16:$rs2, FPR16:$rs3, frmarg:$funct3), - opcodestr, "$rd, $rs1, $rs2, $rs3, $funct3">; - -class FPFMAHDynFrmAlias<FPFMAH_rrr_frm Inst, string OpcodeStr> - : InstAlias<OpcodeStr#" $rd, $rs1, $rs2, $rs3", - (Inst FPR16:$rd, FPR16:$rs1, FPR16:$rs2, 
FPR16:$rs3, 0b111)>; - -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class FPALUH_rr<bits<7> funct7, bits<3> funct3, string opcodestr> - : RVInstR<funct7, funct3, OPC_OP_FP, (outs FPR16:$rd), - (ins FPR16:$rs1, FPR16:$rs2), opcodestr, "$rd, $rs1, $rs2">; - -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class FPALUH_rr_frm<bits<7> funct7, string opcodestr> - : RVInstRFrm<funct7, OPC_OP_FP, (outs FPR16:$rd), - (ins FPR16:$rs1, FPR16:$rs2, frmarg:$funct3), opcodestr, - "$rd, $rs1, $rs2, $funct3">; - -class FPALUHDynFrmAlias<FPALUH_rr_frm Inst, string OpcodeStr> - : InstAlias<OpcodeStr#" $rd, $rs1, $rs2", - (Inst FPR16:$rd, FPR16:$rs1, FPR16:$rs2, 0b111)>; - -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class FPCmpH_rr<bits<3> funct3, string opcodestr> - : RVInstR<0b1010010, funct3, OPC_OP_FP, (outs GPR:$rd), - (ins FPR16:$rs1, FPR16:$rs2), opcodestr, "$rd, $rs1, $rs2">, - Sched<[WriteFCmp16, ReadFCmp16, ReadFCmp16]>; - -//===----------------------------------------------------------------------===// // Instructions //===----------------------------------------------------------------------===// @@ -84,145 +49,120 @@ def FSH : RVInstS<0b001, OPC_STORE_FP, (outs), } // Predicates = [HasStdExtZfhmin] let Predicates = [HasStdExtZfh] in { -def FMADD_H : FPFMAH_rrr_frm<OPC_MADD, "fmadd.h">, - Sched<[WriteFMA16, ReadFMA16, ReadFMA16, ReadFMA16]>; -def : FPFMAHDynFrmAlias<FMADD_H, "fmadd.h">; -def FMSUB_H : FPFMAH_rrr_frm<OPC_MSUB, "fmsub.h">, - Sched<[WriteFMA16, ReadFMA16, ReadFMA16, ReadFMA16]>; -def : FPFMAHDynFrmAlias<FMSUB_H, "fmsub.h">; -def FNMSUB_H : FPFMAH_rrr_frm<OPC_NMSUB, "fnmsub.h">, - Sched<[WriteFMA16, ReadFMA16, ReadFMA16, ReadFMA16]>; -def : FPFMAHDynFrmAlias<FNMSUB_H, "fnmsub.h">; -def FNMADD_H : FPFMAH_rrr_frm<OPC_NMADD, "fnmadd.h">, - Sched<[WriteFMA16, ReadFMA16, ReadFMA16, ReadFMA16]>; -def : FPFMAHDynFrmAlias<FNMADD_H, "fnmadd.h">; +let SchedRW = [WriteFMA16, ReadFMA16, ReadFMA16, ReadFMA16] in { +def FMADD_H : 
FPFMA_rrr_frm<OPC_MADD, 0b10, "fmadd.h", FPR16>; +def FMSUB_H : FPFMA_rrr_frm<OPC_MSUB, 0b10, "fmsub.h", FPR16>; +def FNMSUB_H : FPFMA_rrr_frm<OPC_NMSUB, 0b10, "fnmsub.h", FPR16>; +def FNMADD_H : FPFMA_rrr_frm<OPC_NMADD, 0b10, "fnmadd.h", FPR16>; +} -def FADD_H : FPALUH_rr_frm<0b0000010, "fadd.h">, +def : FPFMADynFrmAlias<FMADD_H, "fmadd.h", FPR16>; +def : FPFMADynFrmAlias<FMSUB_H, "fmsub.h", FPR16>; +def : FPFMADynFrmAlias<FNMSUB_H, "fnmsub.h", FPR16>; +def : FPFMADynFrmAlias<FNMADD_H, "fnmadd.h", FPR16>; + +def FADD_H : FPALU_rr_frm<0b0000010, "fadd.h", FPR16>, Sched<[WriteFALU16, ReadFALU16, ReadFALU16]>; -def : FPALUHDynFrmAlias<FADD_H, "fadd.h">; -def FSUB_H : FPALUH_rr_frm<0b0000110, "fsub.h">, +def FSUB_H : FPALU_rr_frm<0b0000110, "fsub.h", FPR16>, Sched<[WriteFALU16, ReadFALU16, ReadFALU16]>; -def : FPALUHDynFrmAlias<FSUB_H, "fsub.h">; -def FMUL_H : FPALUH_rr_frm<0b0001010, "fmul.h">, +def FMUL_H : FPALU_rr_frm<0b0001010, "fmul.h", FPR16>, Sched<[WriteFMul16, ReadFMul16, ReadFMul16]>; -def : FPALUHDynFrmAlias<FMUL_H, "fmul.h">; -def FDIV_H : FPALUH_rr_frm<0b0001110, "fdiv.h">, +def FDIV_H : FPALU_rr_frm<0b0001110, "fdiv.h", FPR16>, Sched<[WriteFDiv16, ReadFDiv16, ReadFDiv16]>; -def : FPALUHDynFrmAlias<FDIV_H, "fdiv.h">; -def FSQRT_H : FPUnaryOp_r_frm<0b0101110, FPR16, FPR16, "fsqrt.h">, - Sched<[WriteFSqrt16, ReadFSqrt16]> { - let rs2 = 0b00000; -} +def : FPALUDynFrmAlias<FADD_H, "fadd.h", FPR16>; +def : FPALUDynFrmAlias<FSUB_H, "fsub.h", FPR16>; +def : FPALUDynFrmAlias<FMUL_H, "fmul.h", FPR16>; +def : FPALUDynFrmAlias<FDIV_H, "fdiv.h", FPR16>; + +def FSQRT_H : FPUnaryOp_r_frm<0b0101110, 0b00000, FPR16, FPR16, "fsqrt.h">, + Sched<[WriteFSqrt16, ReadFSqrt16]>; def : FPUnaryOpDynFrmAlias<FSQRT_H, "fsqrt.h", FPR16, FPR16>; -def FSGNJ_H : FPALUH_rr<0b0010010, 0b000, "fsgnj.h">, - Sched<[WriteFSGNJ16, ReadFSGNJ16, ReadFSGNJ16]>; -def FSGNJN_H : FPALUH_rr<0b0010010, 0b001, "fsgnjn.h">, - Sched<[WriteFSGNJ16, ReadFSGNJ16, ReadFSGNJ16]>; -def FSGNJX_H : 
FPALUH_rr<0b0010010, 0b010, "fsgnjx.h">, - Sched<[WriteFSGNJ16, ReadFSGNJ16, ReadFSGNJ16]>; -def FMIN_H : FPALUH_rr<0b0010110, 0b000, "fmin.h">, - Sched<[WriteFMinMax16, ReadFMinMax16, ReadFMinMax16]>; -def FMAX_H : FPALUH_rr<0b0010110, 0b001, "fmax.h">, - Sched<[WriteFMinMax16, ReadFMinMax16, ReadFMinMax16]>; +let SchedRW = [WriteFSGNJ16, ReadFSGNJ16, ReadFSGNJ16], + mayRaiseFPException = 0 in { +def FSGNJ_H : FPALU_rr<0b0010010, 0b000, "fsgnj.h", FPR16>; +def FSGNJN_H : FPALU_rr<0b0010010, 0b001, "fsgnjn.h", FPR16>; +def FSGNJX_H : FPALU_rr<0b0010010, 0b010, "fsgnjx.h", FPR16>; +} -def FCVT_W_H : FPUnaryOp_r_frm<0b1100010, GPR, FPR16, "fcvt.w.h">, - Sched<[WriteFCvtF16ToI32, ReadFCvtF16ToI32]> { - let rs2 = 0b00000; +let SchedRW = [WriteFMinMax16, ReadFMinMax16, ReadFMinMax16] in { +def FMIN_H : FPALU_rr<0b0010110, 0b000, "fmin.h", FPR16>; +def FMAX_H : FPALU_rr<0b0010110, 0b001, "fmax.h", FPR16>; } + +def FCVT_W_H : FPUnaryOp_r_frm<0b1100010, 0b00000, GPR, FPR16, "fcvt.w.h">, + Sched<[WriteFCvtF16ToI32, ReadFCvtF16ToI32]>; def : FPUnaryOpDynFrmAlias<FCVT_W_H, "fcvt.w.h", GPR, FPR16>; -def FCVT_WU_H : FPUnaryOp_r_frm<0b1100010, GPR, FPR16, "fcvt.wu.h">, - Sched<[WriteFCvtF16ToI32, ReadFCvtF16ToI32]> { - let rs2 = 0b00001; -} +def FCVT_WU_H : FPUnaryOp_r_frm<0b1100010, 0b00001, GPR, FPR16, "fcvt.wu.h">, + Sched<[WriteFCvtF16ToI32, ReadFCvtF16ToI32]>; def : FPUnaryOpDynFrmAlias<FCVT_WU_H, "fcvt.wu.h", GPR, FPR16>; -def FCVT_H_W : FPUnaryOp_r_frm<0b1101010, FPR16, GPR, "fcvt.h.w">, - Sched<[WriteFCvtI32ToF16, ReadFCvtI32ToF16]> { - let rs2 = 0b00000; -} +def FCVT_H_W : FPUnaryOp_r_frm<0b1101010, 0b00000, FPR16, GPR, "fcvt.h.w">, + Sched<[WriteFCvtI32ToF16, ReadFCvtI32ToF16]>; def : FPUnaryOpDynFrmAlias<FCVT_H_W, "fcvt.h.w", FPR16, GPR>; -def FCVT_H_WU : FPUnaryOp_r_frm<0b1101010, FPR16, GPR, "fcvt.h.wu">, - Sched<[WriteFCvtI32ToF16, ReadFCvtI32ToF16]> { - let rs2 = 0b00001; -} +def FCVT_H_WU : FPUnaryOp_r_frm<0b1101010, 0b00001, FPR16, GPR, "fcvt.h.wu">, + 
Sched<[WriteFCvtI32ToF16, ReadFCvtI32ToF16]>; def : FPUnaryOpDynFrmAlias<FCVT_H_WU, "fcvt.h.wu", FPR16, GPR>; } // Predicates = [HasStdExtZfh] let Predicates = [HasStdExtZfhmin] in { -def FCVT_H_S : FPUnaryOp_r_frm<0b0100010, FPR16, FPR32, "fcvt.h.s">, - Sched<[WriteFCvtF32ToF16, ReadFCvtF32ToF16]> { - let rs2 = 0b00000; -} +def FCVT_H_S : FPUnaryOp_r_frm<0b0100010, 0b00000, FPR16, FPR32, "fcvt.h.s">, + Sched<[WriteFCvtF32ToF16, ReadFCvtF32ToF16]>; def : FPUnaryOpDynFrmAlias<FCVT_H_S, "fcvt.h.s", FPR16, FPR32>; -def FCVT_S_H : FPUnaryOp_r<0b0100000, 0b000, FPR32, FPR16, "fcvt.s.h">, - Sched<[WriteFCvtF16ToF32, ReadFCvtF16ToF32]> { - let rs2 = 0b00010; -} +def FCVT_S_H : FPUnaryOp_r<0b0100000, 0b00010, 0b000, FPR32, FPR16, "fcvt.s.h">, + Sched<[WriteFCvtF16ToF32, ReadFCvtF16ToF32]>; -def FMV_X_H : FPUnaryOp_r<0b1110010, 0b000, GPR, FPR16, "fmv.x.h">, - Sched<[WriteFMovF16ToI16, ReadFMovF16ToI16]> { - let rs2 = 0b00000; -} +let mayRaiseFPException = 0 in +def FMV_X_H : FPUnaryOp_r<0b1110010, 0b00000, 0b000, GPR, FPR16, "fmv.x.h">, + Sched<[WriteFMovF16ToI16, ReadFMovF16ToI16]>; -def FMV_H_X : FPUnaryOp_r<0b1111010, 0b000, FPR16, GPR, "fmv.h.x">, - Sched<[WriteFMovI16ToF16, ReadFMovI16ToF16]> { - let rs2 = 0b00000; -} +let mayRaiseFPException = 0 in +def FMV_H_X : FPUnaryOp_r<0b1111010, 0b00000, 0b000, FPR16, GPR, "fmv.h.x">, + Sched<[WriteFMovI16ToF16, ReadFMovI16ToF16]>; } // Predicates = [HasStdExtZfhmin] let Predicates = [HasStdExtZfh] in { -def FEQ_H : FPCmpH_rr<0b010, "feq.h">; -def FLT_H : FPCmpH_rr<0b001, "flt.h">; -def FLE_H : FPCmpH_rr<0b000, "fle.h">; -def FCLASS_H : FPUnaryOp_r<0b1110010, 0b001, GPR, FPR16, "fclass.h">, - Sched<[WriteFClass16, ReadFClass16]> { - let rs2 = 0b00000; +let SchedRW = [WriteFCmp16, ReadFCmp16, ReadFCmp16] in { +def FEQ_H : FPCmp_rr<0b1010010, 0b010, "feq.h", FPR16>; +def FLT_H : FPCmp_rr<0b1010010, 0b001, "flt.h", FPR16>; +def FLE_H : FPCmp_rr<0b1010010, 0b000, "fle.h", FPR16>; } + +let mayRaiseFPException = 0 in +def FCLASS_H : 
FPUnaryOp_r<0b1110010, 0b00000, 0b001, GPR, FPR16, "fclass.h">, + Sched<[WriteFClass16, ReadFClass16]>; } // Predicates = [HasStdExtZfh] let Predicates = [HasStdExtZfh, IsRV64] in { -def FCVT_L_H : FPUnaryOp_r_frm<0b1100010, GPR, FPR16, "fcvt.l.h">, - Sched<[WriteFCvtF16ToI64, ReadFCvtF16ToI64]> { - let rs2 = 0b00010; -} +def FCVT_L_H : FPUnaryOp_r_frm<0b1100010, 0b00010, GPR, FPR16, "fcvt.l.h">, + Sched<[WriteFCvtF16ToI64, ReadFCvtF16ToI64]>; def : FPUnaryOpDynFrmAlias<FCVT_L_H, "fcvt.l.h", GPR, FPR16>; -def FCVT_LU_H : FPUnaryOp_r_frm<0b1100010, GPR, FPR16, "fcvt.lu.h">, - Sched<[WriteFCvtF16ToI64, ReadFCvtF16ToI64]> { - let rs2 = 0b00011; -} +def FCVT_LU_H : FPUnaryOp_r_frm<0b1100010, 0b00011, GPR, FPR16, "fcvt.lu.h">, + Sched<[WriteFCvtF16ToI64, ReadFCvtF16ToI64]>; def : FPUnaryOpDynFrmAlias<FCVT_LU_H, "fcvt.lu.h", GPR, FPR16>; -def FCVT_H_L : FPUnaryOp_r_frm<0b1101010, FPR16, GPR, "fcvt.h.l">, - Sched<[WriteFCvtI64ToF16, ReadFCvtI64ToF16]> { - let rs2 = 0b00010; -} +def FCVT_H_L : FPUnaryOp_r_frm<0b1101010, 0b00010, FPR16, GPR, "fcvt.h.l">, + Sched<[WriteFCvtI64ToF16, ReadFCvtI64ToF16]>; def : FPUnaryOpDynFrmAlias<FCVT_H_L, "fcvt.h.l", FPR16, GPR>; -def FCVT_H_LU : FPUnaryOp_r_frm<0b1101010, FPR16, GPR, "fcvt.h.lu">, - Sched<[WriteFCvtI64ToF16, ReadFCvtI64ToF16]> { - let rs2 = 0b00011; -} +def FCVT_H_LU : FPUnaryOp_r_frm<0b1101010, 0b00011, FPR16, GPR, "fcvt.h.lu">, + Sched<[WriteFCvtI64ToF16, ReadFCvtI64ToF16]>; def : FPUnaryOpDynFrmAlias<FCVT_H_LU, "fcvt.h.lu", FPR16, GPR>; } // Predicates = [HasStdExtZfh, IsRV64] let Predicates = [HasStdExtZfhmin, HasStdExtD] in { -def FCVT_H_D : FPUnaryOp_r_frm<0b0100010, FPR16, FPR64, "fcvt.h.d">, - Sched<[WriteFCvtF64ToF16, ReadFCvtF64ToF16]> { - let rs2 = 0b00001; -} +def FCVT_H_D : FPUnaryOp_r_frm<0b0100010, 0b00001, FPR16, FPR64, "fcvt.h.d">, + Sched<[WriteFCvtF64ToF16, ReadFCvtF64ToF16]>; def : FPUnaryOpDynFrmAlias<FCVT_H_D, "fcvt.h.d", FPR16, FPR64>; -def FCVT_D_H : FPUnaryOp_r<0b0100001, 0b000, FPR64, FPR16, 
"fcvt.d.h">, - Sched<[WriteFCvtF16ToF64, ReadFCvtF16ToF64]> { - let rs2 = 0b00010; -} +def FCVT_D_H : FPUnaryOp_r<0b0100001, 0b00010, 0b000, FPR64, FPR16, "fcvt.d.h">, + Sched<[WriteFCvtF16ToF64, ReadFCvtF16ToF64]>; } // Predicates = [HasStdExtZfhmin, HasStdExtD] //===----------------------------------------------------------------------===// @@ -275,12 +215,12 @@ def : Pat<(f16 (fpimm0)), (FMV_H_X X0)>; /// Float arithmetic operations -def : PatFpr16Fpr16DynFrm<fadd, FADD_H>; -def : PatFpr16Fpr16DynFrm<fsub, FSUB_H>; -def : PatFpr16Fpr16DynFrm<fmul, FMUL_H>; -def : PatFpr16Fpr16DynFrm<fdiv, FDIV_H>; +def : PatFpr16Fpr16DynFrm<any_fadd, FADD_H>; +def : PatFpr16Fpr16DynFrm<any_fsub, FSUB_H>; +def : PatFpr16Fpr16DynFrm<any_fmul, FMUL_H>; +def : PatFpr16Fpr16DynFrm<any_fdiv, FDIV_H>; -def : Pat<(fsqrt FPR16:$rs1), (FSQRT_H FPR16:$rs1, 0b111)>; +def : Pat<(any_fsqrt FPR16:$rs1), (FSQRT_H FPR16:$rs1, 0b111)>; def : Pat<(fneg FPR16:$rs1), (FSGNJN_H $rs1, $rs1)>; def : Pat<(fabs FPR16:$rs1), (FSGNJX_H $rs1, $rs1)>; @@ -292,19 +232,19 @@ def : Pat<(fcopysign FPR16:$rs1, FPR32:$rs2), def : Pat<(fcopysign FPR32:$rs1, FPR16:$rs2), (FSGNJ_S $rs1, (FCVT_S_H $rs2))>; // fmadd: rs1 * rs2 + rs3 -def : Pat<(fma FPR16:$rs1, FPR16:$rs2, FPR16:$rs3), +def : Pat<(any_fma FPR16:$rs1, FPR16:$rs2, FPR16:$rs3), (FMADD_H $rs1, $rs2, $rs3, 0b111)>; // fmsub: rs1 * rs2 - rs3 -def : Pat<(fma FPR16:$rs1, FPR16:$rs2, (fneg FPR16:$rs3)), +def : Pat<(any_fma FPR16:$rs1, FPR16:$rs2, (fneg FPR16:$rs3)), (FMSUB_H FPR16:$rs1, FPR16:$rs2, FPR16:$rs3, 0b111)>; // fnmsub: -rs1 * rs2 + rs3 -def : Pat<(fma (fneg FPR16:$rs1), FPR16:$rs2, FPR16:$rs3), +def : Pat<(any_fma (fneg FPR16:$rs1), FPR16:$rs2, FPR16:$rs3), (FNMSUB_H FPR16:$rs1, FPR16:$rs2, FPR16:$rs3, 0b111)>; // fnmadd: -rs1 * rs2 - rs3 -def : Pat<(fma (fneg FPR16:$rs1), FPR16:$rs2, (fneg FPR16:$rs3)), +def : Pat<(any_fma (fneg FPR16:$rs1), FPR16:$rs2, (fneg FPR16:$rs3)), (FNMADD_H FPR16:$rs1, FPR16:$rs2, FPR16:$rs3, 0b111)>; // The ratified 
20191213 ISA spec defines fmin and fmax in a way that matches @@ -337,8 +277,8 @@ defm : StPat<store, FSH, FPR16, f16>; /// Float conversion operations // f32 -> f16, f16 -> f32 -def : Pat<(fpround FPR32:$rs1), (FCVT_H_S FPR32:$rs1, 0b111)>; -def : Pat<(fpextend FPR16:$rs1), (FCVT_S_H FPR16:$rs1)>; +def : Pat<(any_fpround FPR32:$rs1), (FCVT_H_S FPR32:$rs1, 0b111)>; +def : Pat<(any_fpextend FPR16:$rs1), (FCVT_S_H FPR16:$rs1)>; // Moves (no conversion) def : Pat<(riscv_fmv_h_x GPR:$src), (FMV_H_X GPR:$src)>; @@ -347,8 +287,8 @@ def : Pat<(riscv_fmv_x_anyexth FPR16:$src), (FMV_X_H FPR16:$src)>; let Predicates = [HasStdExtZfh, IsRV32] in { // half->[u]int. Round-to-zero must be used. -def : Pat<(i32 (fp_to_sint FPR16:$rs1)), (FCVT_W_H $rs1, 0b001)>; -def : Pat<(i32 (fp_to_uint FPR16:$rs1)), (FCVT_WU_H $rs1, 0b001)>; +def : Pat<(i32 (any_fp_to_sint FPR16:$rs1)), (FCVT_W_H $rs1, 0b001)>; +def : Pat<(i32 (any_fp_to_uint FPR16:$rs1)), (FCVT_WU_H $rs1, 0b001)>; // Saturating float->[u]int32. def : Pat<(i32 (riscv_fcvt_x_rtz FPR16:$rs1)), (FCVT_W_H $rs1, 0b001)>; @@ -361,20 +301,20 @@ def : Pat<(i32 (lrint FPR16:$rs1)), (FCVT_W_H $rs1, 0b111)>; def : Pat<(i32 (lround FPR16:$rs1)), (FCVT_W_H $rs1, 0b100)>; // [u]int->half. Match GCC and default to using dynamic rounding mode. -def : Pat<(sint_to_fp (i32 GPR:$rs1)), (FCVT_H_W $rs1, 0b111)>; -def : Pat<(uint_to_fp (i32 GPR:$rs1)), (FCVT_H_WU $rs1, 0b111)>; +def : Pat<(any_sint_to_fp (i32 GPR:$rs1)), (FCVT_H_W $rs1, 0b111)>; +def : Pat<(any_uint_to_fp (i32 GPR:$rs1)), (FCVT_H_WU $rs1, 0b111)>; } // Predicates = [HasStdExtZfh, IsRV32] let Predicates = [HasStdExtZfh, IsRV64] in { // Use target specific isd nodes to help us remember the result is sign // extended. Matching sext_inreg+fptoui/fptosi may cause the conversion to be // duplicated if it has another user that didn't need the sign_extend. 
-def : Pat<(riscv_fcvt_w_rtz_rv64 FPR16:$rs1), (FCVT_W_H $rs1, 0b001)>; -def : Pat<(riscv_fcvt_wu_rtz_rv64 FPR16:$rs1), (FCVT_WU_H $rs1, 0b001)>; +def : Pat<(riscv_any_fcvt_w_rtz_rv64 FPR16:$rs1), (FCVT_W_H $rs1, 0b001)>; +def : Pat<(riscv_any_fcvt_wu_rtz_rv64 FPR16:$rs1), (FCVT_WU_H $rs1, 0b001)>; // half->[u]int64. Round-to-zero must be used. -def : Pat<(i64 (fp_to_sint FPR16:$rs1)), (FCVT_L_H $rs1, 0b001)>; -def : Pat<(i64 (fp_to_uint FPR16:$rs1)), (FCVT_LU_H $rs1, 0b001)>; +def : Pat<(i64 (any_fp_to_sint FPR16:$rs1)), (FCVT_L_H $rs1, 0b001)>; +def : Pat<(i64 (any_fp_to_uint FPR16:$rs1)), (FCVT_LU_H $rs1, 0b001)>; // Saturating float->[u]int64. def : Pat<(i64 (riscv_fcvt_x_rtz FPR16:$rs1)), (FCVT_L_H $rs1, 0b001)>; @@ -389,17 +329,17 @@ def : Pat<(i64 (lround FPR16:$rs1)), (FCVT_L_H $rs1, 0b100)>; def : Pat<(i64 (llround FPR16:$rs1)), (FCVT_L_H $rs1, 0b100)>; // [u]int->fp. Match GCC and default to using dynamic rounding mode. -def : Pat<(sint_to_fp (i64 (sexti32 (i64 GPR:$rs1)))), (FCVT_H_W $rs1, 0b111)>; -def : Pat<(uint_to_fp (i64 (zexti32 (i64 GPR:$rs1)))), (FCVT_H_WU $rs1, 0b111)>; -def : Pat<(sint_to_fp (i64 GPR:$rs1)), (FCVT_H_L $rs1, 0b111)>; -def : Pat<(uint_to_fp (i64 GPR:$rs1)), (FCVT_H_LU $rs1, 0b111)>; +def : Pat<(any_sint_to_fp (i64 (sexti32 (i64 GPR:$rs1)))), (FCVT_H_W $rs1, 0b111)>; +def : Pat<(any_uint_to_fp (i64 (zexti32 (i64 GPR:$rs1)))), (FCVT_H_WU $rs1, 0b111)>; +def : Pat<(any_sint_to_fp (i64 GPR:$rs1)), (FCVT_H_L $rs1, 0b111)>; +def : Pat<(any_uint_to_fp (i64 GPR:$rs1)), (FCVT_H_LU $rs1, 0b111)>; } // Predicates = [HasStdExtZfh, IsRV64] let Predicates = [HasStdExtZfhmin, HasStdExtD] in { /// Float conversion operations // f64 -> f16, f16 -> f64 -def : Pat<(fpround FPR64:$rs1), (FCVT_H_D FPR64:$rs1, 0b111)>; -def : Pat<(fpextend FPR16:$rs1), (FCVT_D_H FPR16:$rs1)>; +def : Pat<(any_fpround FPR64:$rs1), (FCVT_H_D FPR64:$rs1, 0b111)>; +def : Pat<(any_fpextend FPR16:$rs1), (FCVT_D_H FPR16:$rs1)>; /// Float arithmetic operations def : 
Pat<(fcopysign FPR16:$rs1, FPR64:$rs2), diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp index 798532d5bc44..9094dff1dda1 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp @@ -105,7 +105,6 @@ BitVector RISCVRegisterInfo::getReservedRegs(const MachineFunction &MF) const { // Floating point environment registers. markSuperRegs(Reserved, RISCV::FRM); markSuperRegs(Reserved, RISCV::FFLAGS); - markSuperRegs(Reserved, RISCV::FCSR); assert(checkAllSuperRegsMarked(Reserved)); return Reserved; diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td index a56f992d320e..20903b317180 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td @@ -550,16 +550,15 @@ def VRM8NoV0 : VReg<[vint8m8_t, vint16m8_t, vint32m8_t, vint64m8_t, vfloat16m8_t, vfloat32m8_t, vfloat64m8_t], (add V8M8, V16M8, V24M8), 8>; -defvar VMaskVTs = [vbool64_t, vbool32_t, vbool16_t, vbool8_t, - vbool4_t, vbool2_t, vbool1_t]; +defvar VMaskVTs = [vbool1_t, vbool2_t, vbool4_t, vbool8_t, vbool16_t, + vbool32_t, vbool64_t]; def VMV0 : RegisterClass<"RISCV", VMaskVTs, 64, (add V0)> { let Size = 64; } // The register class is added for inline assembly for vector mask types. -def VM : VReg<[vbool1_t, vbool2_t, vbool4_t, vbool8_t, vbool16_t, - vbool32_t, vbool64_t], +def VM : VReg<VMaskVTs, (add (sequence "V%u", 8, 31), (sequence "V%u", 0, 7)), 1>; @@ -578,7 +577,6 @@ foreach m = LMULList.m in { // Special registers def FFLAGS : RISCVReg<0, "fflags">; def FRM : RISCVReg<0, "frm">; -def FCSR : RISCVReg<0, "fcsr">; // Any type register. Used for .insn directives when we don't know what the // register types could be. 
diff --git a/llvm/lib/Target/RISCV/RISCVSchedRocket.td b/llvm/lib/Target/RISCV/RISCVSchedRocket.td index 14f59152ed42..d5a0932c8778 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedRocket.td +++ b/llvm/lib/Target/RISCV/RISCVSchedRocket.td @@ -16,7 +16,8 @@ def RocketModel : SchedMachineModel { let IssueWidth = 1; // 1 micro-op is dispatched per cycle. let LoadLatency = 3; let MispredictPenalty = 3; - let UnsupportedFeatures = [HasStdExtV, HasStdExtZvamo, HasStdExtZvlsseg]; + let CompleteModel = false; + let UnsupportedFeatures = [HasStdExtV, HasStdExtZvlsseg]; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td index 5b435fcb16a2..7f9d0aabc4ed 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td @@ -15,7 +15,7 @@ def SiFive7Model : SchedMachineModel { let LoadLatency = 3; let MispredictPenalty = 3; let CompleteModel = 0; - let UnsupportedFeatures = [HasStdExtV, HasStdExtZvamo, HasStdExtZvlsseg]; + let UnsupportedFeatures = [HasStdExtV, HasStdExtZvlsseg]; } // The SiFive7 microarchitecture has two pipelines: A and B. 
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h index deb2a11f98f1..d0330e6984a5 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.h +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h @@ -51,7 +51,6 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo { bool HasStdExtZbt = false; bool HasStdExtV = false; bool HasStdExtZvlsseg = false; - bool HasStdExtZvamo = false; bool HasStdExtZfhmin = false; bool HasStdExtZfh = false; bool HasRV64 = false; @@ -118,7 +117,6 @@ public: bool hasStdExtZbt() const { return HasStdExtZbt; } bool hasStdExtV() const { return HasStdExtV; } bool hasStdExtZvlsseg() const { return HasStdExtZvlsseg; } - bool hasStdExtZvamo() const { return HasStdExtZvamo; } bool hasStdExtZfhmin() const { return HasStdExtZfhmin; } bool hasStdExtZfh() const { return HasStdExtZfh; } bool is64Bit() const { return HasRV64; } diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 56f0952fafc9..c435430a1288 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -162,3 +162,94 @@ InstructionCost RISCVTTIImpl::getGatherScatterOpCost( getMemoryOpCost(Opcode, VTy->getElementType(), Alignment, 0, CostKind, I); return NumLoads * MemOpCost; } + +void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, + TTI::UnrollingPreferences &UP, + OptimizationRemarkEmitter *ORE) { + // TODO: More tuning on benchmarks and metrics with changes as needed + // would apply to all settings below to enable performance. 
+ + // Support explicit targets enabled for SiFive with the unrolling preferences + // below + bool UseDefaultPreferences = true; + if (ST->getTuneCPU().contains("sifive-e76") || + ST->getTuneCPU().contains("sifive-s76") || + ST->getTuneCPU().contains("sifive-u74") || + ST->getTuneCPU().contains("sifive-7")) + UseDefaultPreferences = false; + + if (UseDefaultPreferences) + return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE); + + // Enable Upper bound unrolling universally, not dependant upon the conditions + // below. + UP.UpperBound = true; + + // Disable loop unrolling for Oz and Os. + UP.OptSizeThreshold = 0; + UP.PartialOptSizeThreshold = 0; + if (L->getHeader()->getParent()->hasOptSize()) + return; + + SmallVector<BasicBlock *, 4> ExitingBlocks; + L->getExitingBlocks(ExitingBlocks); + LLVM_DEBUG(dbgs() << "Loop has:\n" + << "Blocks: " << L->getNumBlocks() << "\n" + << "Exit blocks: " << ExitingBlocks.size() << "\n"); + + // Only allow another exit other than the latch. This acts as an early exit + // as it mirrors the profitability calculation of the runtime unroller. + if (ExitingBlocks.size() > 2) + return; + + // Limit the CFG of the loop body for targets with a branch predictor. + // Allowing 4 blocks permits if-then-else diamonds in the body. + if (L->getNumBlocks() > 4) + return; + + // Don't unroll vectorized loops, including the remainder loop + if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized")) + return; + + // Scan the loop: don't unroll loops with calls as this could prevent + // inlining. + InstructionCost Cost = 0; + for (auto *BB : L->getBlocks()) { + for (auto &I : *BB) { + // Initial setting - Don't unroll loops containing vectorized + // instructions. 
+ if (I.getType()->isVectorTy()) + return; + + if (isa<CallInst>(I) || isa<InvokeInst>(I)) { + if (const Function *F = cast<CallBase>(I).getCalledFunction()) { + if (!isLoweredToCall(F)) + continue; + } + return; + } + + SmallVector<const Value *> Operands(I.operand_values()); + Cost += + getUserCost(&I, Operands, TargetTransformInfo::TCK_SizeAndLatency); + } + } + + LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n"); + + UP.Partial = true; + UP.Runtime = true; + UP.UnrollRemainder = true; + UP.UnrollAndJam = true; + UP.UnrollAndJamInnerLoopThreshold = 60; + + // Force unrolling small loops can be very useful because of the branch + // taken cost of the backedge. + if (Cost < 12) + UP.Force = true; +} + +void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, + TTI::PeelingPreferences &PP) { + BaseT::getPeelingPreferences(L, SE, PP); +} diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 675681616d6e..7353496f4684 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -73,6 +73,13 @@ public: llvm_unreachable("Unsupported register kind"); } + void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, + TTI::UnrollingPreferences &UP, + OptimizationRemarkEmitter *ORE); + + void getPeelingPreferences(Loop *L, ScalarEvolution &SE, + TTI::PeelingPreferences &PP); + unsigned getMinVectorRegisterBitWidth() const { return ST->hasVInstructions() ? ST->getMinRVVVectorSizeInBits() : 0; } @@ -178,7 +185,9 @@ public: } unsigned getMaxInterleaveFactor(unsigned VF) { - return ST->getMaxInterleaveFactor(); + // If the loop will not be vectorized, don't interleave the loop. + // Let regular unroll to unroll the loop. + return VF == 1 ? 
1 : ST->getMaxInterleaveFactor(); } }; diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp index 0f5e0b9672a9..538380263c3c 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp @@ -28,25 +28,43 @@ static uint64_t extractBitsForFixup(MCFixupKind Kind, uint64_t Value, if (Kind < FirstTargetFixupKind) return Value; + auto checkFixupInRange = [&](int64_t Min, int64_t Max) -> bool { + int64_t SVal = int64_t(Value); + if (SVal < Min || SVal > Max) { + Ctx.reportError(Fixup.getLoc(), "operand out of range (" + Twine(SVal) + + " not between " + Twine(Min) + + " and " + Twine(Max) + ")"); + return false; + } + return true; + }; + + auto handlePCRelFixupValue = [&](unsigned W) -> uint64_t { + if (Value % 2 != 0) + Ctx.reportError(Fixup.getLoc(), "Non-even PC relative offset."); + if (!checkFixupInRange(minIntN(W) * 2, maxIntN(W) * 2)) + return 0; + return (int64_t)Value / 2; + }; + switch (unsigned(Kind)) { case SystemZ::FK_390_PC12DBL: + return handlePCRelFixupValue(12); case SystemZ::FK_390_PC16DBL: + return handlePCRelFixupValue(16); case SystemZ::FK_390_PC24DBL: + return handlePCRelFixupValue(24); case SystemZ::FK_390_PC32DBL: - return (int64_t)Value / 2; + return handlePCRelFixupValue(32); case SystemZ::FK_390_12: - if (!isUInt<12>(Value)) { - Ctx.reportError(Fixup.getLoc(), "displacement exceeds uint12"); + if (!checkFixupInRange(0, maxUIntN(12))) return 0; - } return Value; case SystemZ::FK_390_20: { - if (!isInt<20>(Value)) { - Ctx.reportError(Fixup.getLoc(), "displacement exceeds int20"); + if (!checkFixupInRange(minIntN(20), maxIntN(20))) return 0; - } // The high byte of a 20 bit displacement value comes first. 
uint64_t DLo = Value & 0xfff; uint64_t DHi = (Value >> 12) & 0xff; diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp index e280e4aaf3d8..c83796b8579b 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp @@ -197,7 +197,8 @@ getDispOpValue(const MCInst &MI, unsigned OpNum, // All instructions follow the pattern where the first displacement has a // 2 bytes offset, and the second one 4 bytes. unsigned ByteOffs = Fixups.size() == 0 ? 2 : 4; - Fixups.push_back(MCFixup::create(ByteOffs, MO.getExpr(), (MCFixupKind)Kind)); + Fixups.push_back(MCFixup::create(ByteOffs, MO.getExpr(), (MCFixupKind)Kind, + MI.getLoc())); assert(Fixups.size() <= 2 && "More than two memory operands in MI?"); return 0; } @@ -296,6 +297,7 @@ SystemZMCCodeEmitter::getPCRelEncoding(const MCInst &MI, unsigned OpNum, SmallVectorImpl<MCFixup> &Fixups, unsigned Kind, int64_t Offset, bool AllowTLS) const { + SMLoc Loc = MI.getLoc(); const MCOperand &MO = MI.getOperand(OpNum); const MCExpr *Expr; if (MO.isImm()) @@ -311,13 +313,13 @@ SystemZMCCodeEmitter::getPCRelEncoding(const MCInst &MI, unsigned OpNum, Expr = MCBinaryExpr::createAdd(Expr, OffsetExpr, Ctx); } } - Fixups.push_back(MCFixup::create(Offset, Expr, (MCFixupKind)Kind)); + Fixups.push_back(MCFixup::create(Offset, Expr, (MCFixupKind)Kind, Loc)); // Output the fixup for the TLS marker if present. 
if (AllowTLS && OpNum + 1 < MI.getNumOperands()) { const MCOperand &MOTLS = MI.getOperand(OpNum + 1); - Fixups.push_back(MCFixup::create(0, MOTLS.getExpr(), - (MCFixupKind)SystemZ::FK_390_TLS_CALL)); + Fixups.push_back(MCFixup::create( + 0, MOTLS.getExpr(), (MCFixupKind)SystemZ::FK_390_TLS_CALL, Loc)); } return 0; } diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td index 373023effb4a..a7ea5e1e4bf8 100644 --- a/llvm/lib/Target/SystemZ/SystemZCallingConv.td +++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.td @@ -166,6 +166,7 @@ def CSR_SystemZ_NoRegs : CalleeSavedRegs<(add)>; // any non-leaf function and restored in the epilogue for use by the // return instruction so it functions exactly like a callee-saved register. def CSR_SystemZ_XPLINK64 : CalleeSavedRegs<(add (sequence "R%dD", 7, 15), + (sequence "R%dD", 4, 4), (sequence "F%dD", 15, 8))>; def CSR_SystemZ_XPLINK64_Vector : CalleeSavedRegs<(add CSR_SystemZ_XPLINK64, diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp index 2f7cdfcf7bde..99ab4c5455d6 100644 --- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -818,7 +818,7 @@ bool SystemZELFFrameLowering::usePackedStack(MachineFunction &MF) const { } SystemZXPLINKFrameLowering::SystemZXPLINKFrameLowering() - : SystemZFrameLowering(TargetFrameLowering::StackGrowsUp, Align(32), 128, + : SystemZFrameLowering(TargetFrameLowering::StackGrowsDown, Align(32), 0, Align(32), /* StackRealignable */ false), RegSpillOffsets(-1) { @@ -990,12 +990,184 @@ bool SystemZXPLINKFrameLowering::spillCalleeSavedRegisters( return true; } +bool SystemZXPLINKFrameLowering::restoreCalleeSavedRegisters( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const { + + if (CSI.empty()) + return false; + + MachineFunction &MF = 
*MBB.getParent(); + SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>(); + const SystemZSubtarget &Subtarget = MF.getSubtarget<SystemZSubtarget>(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + auto &Regs = Subtarget.getSpecialRegisters<SystemZXPLINK64Registers>(); + + DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + + // Restore FPRs in the normal TargetInstrInfo way. + for (unsigned I = 0, E = CSI.size(); I != E; ++I) { + unsigned Reg = CSI[I].getReg(); + if (SystemZ::FP64BitRegClass.contains(Reg)) + TII->loadRegFromStackSlot(MBB, MBBI, Reg, CSI[I].getFrameIdx(), + &SystemZ::FP64BitRegClass, TRI); + if (SystemZ::VR128BitRegClass.contains(Reg)) + TII->loadRegFromStackSlot(MBB, MBBI, Reg, CSI[I].getFrameIdx(), + &SystemZ::VR128BitRegClass, TRI); + } + + // Restore call-saved GPRs (but not call-clobbered varargs, which at + // this point might hold return values). + SystemZ::GPRRegs RestoreGPRs = ZFI->getRestoreGPRRegs(); + if (RestoreGPRs.LowGPR) { + assert(isInt<20>(Regs.getStackPointerBias() + RestoreGPRs.GPROffset)); + if (RestoreGPRs.LowGPR == RestoreGPRs.HighGPR) + // Build an LG/L instruction. + BuildMI(MBB, MBBI, DL, TII->get(SystemZ::LG), RestoreGPRs.LowGPR) + .addReg(Regs.getStackPointerRegister()) + .addImm(Regs.getStackPointerBias() + RestoreGPRs.GPROffset) + .addReg(0); + else { + // Build an LMG/LM instruction. + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(SystemZ::LMG)); + + // Add the explicit register operands. + MIB.addReg(RestoreGPRs.LowGPR, RegState::Define); + MIB.addReg(RestoreGPRs.HighGPR, RegState::Define); + + // Add the address. 
+ MIB.addReg(Regs.getStackPointerRegister()); + MIB.addImm(Regs.getStackPointerBias() + RestoreGPRs.GPROffset); + + // Do a second scan adding regs as being defined by instruction + for (unsigned I = 0, E = CSI.size(); I != E; ++I) { + unsigned Reg = CSI[I].getReg(); + if (Reg > RestoreGPRs.LowGPR && Reg < RestoreGPRs.HighGPR) + MIB.addReg(Reg, RegState::ImplicitDefine); + } + } + } + + return true; +} + void SystemZXPLINKFrameLowering::emitPrologue(MachineFunction &MF, - MachineBasicBlock &MBB) const {} + MachineBasicBlock &MBB) const { + assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported"); + const SystemZSubtarget &Subtarget = MF.getSubtarget<SystemZSubtarget>(); + SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>(); + MachineBasicBlock::iterator MBBI = MBB.begin(); + auto *ZII = static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo()); + auto &Regs = Subtarget.getSpecialRegisters<SystemZXPLINK64Registers>(); + MachineFrameInfo &MFFrame = MF.getFrameInfo(); + MachineInstr *StoreInstr = nullptr; + bool HasFP = hasFP(MF); + // Debug location must be unknown since the first debug location is used + // to determine the end of the prologue. + DebugLoc DL; + uint64_t Offset = 0; + + // TODO: Support leaf functions; only add size of save+reserved area when + // function is non-leaf. + MFFrame.setStackSize(MFFrame.getStackSize() + Regs.getCallFrameSize()); + uint64_t StackSize = MFFrame.getStackSize(); + + // FIXME: Implement support for large stack sizes, when the stack extension + // routine needs to be called. + if (StackSize > 1024 * 1024) { + llvm_unreachable("Huge Stack Frame not yet supported on z/OS"); + } + + if (ZFI->getSpillGPRRegs().LowGPR) { + // Skip over the GPR saves. + if ((MBBI != MBB.end()) && ((MBBI->getOpcode() == SystemZ::STMG))) { + const int Operand = 3; + // Now we can set the offset for the operation, since now the Stack + // has been finalized. 
+ Offset = Regs.getStackPointerBias() + MBBI->getOperand(Operand).getImm(); + // Maximum displacement for STMG instruction. + if (isInt<20>(Offset - StackSize)) + Offset -= StackSize; + else + StoreInstr = &*MBBI; + MBBI->getOperand(Operand).setImm(Offset); + ++MBBI; + } else + llvm_unreachable("Couldn't skip over GPR saves"); + } + + if (StackSize) { + MachineBasicBlock::iterator InsertPt = StoreInstr ? StoreInstr : MBBI; + // Allocate StackSize bytes. + int64_t Delta = -int64_t(StackSize); + + // In case the STM(G) instruction also stores SP (R4), but the displacement + // is too large, the SP register is manipulated first before storing, + // resulting in the wrong value stored and retrieved later. In this case, we + // need to temporarily save the value of SP, and store it later to memory. + if (StoreInstr && HasFP) { + // Insert LR r0,r4 before STMG instruction. + BuildMI(MBB, InsertPt, DL, ZII->get(SystemZ::LGR)) + .addReg(SystemZ::R0D, RegState::Define) + .addReg(SystemZ::R4D); + // Insert ST r0,xxx(,r4) after STMG instruction. + BuildMI(MBB, MBBI, DL, ZII->get(SystemZ::STG)) + .addReg(SystemZ::R0D, RegState::Kill) + .addReg(SystemZ::R4D) + .addImm(Offset) + .addReg(0); + } + + emitIncrement(MBB, InsertPt, DL, Regs.getStackPointerRegister(), Delta, + ZII); + } + + if (HasFP) { + // Copy the base of the frame to Frame Pointer Register. + BuildMI(MBB, MBBI, DL, ZII->get(SystemZ::LGR), + Regs.getFramePointerRegister()) + .addReg(Regs.getStackPointerRegister()); + + // Mark the FramePtr as live at the beginning of every block except + // the entry block. (We'll have marked R8 as live on entry when + // saving the GPRs.) 
+ for (auto I = std::next(MF.begin()), E = MF.end(); I != E; ++I) + I->addLiveIn(Regs.getFramePointerRegister()); + } +} void SystemZXPLINKFrameLowering::emitEpilogue(MachineFunction &MF, - MachineBasicBlock &MBB) const {} + MachineBasicBlock &MBB) const { + const SystemZSubtarget &Subtarget = MF.getSubtarget<SystemZSubtarget>(); + MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); + SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>(); + MachineFrameInfo &MFFrame = MF.getFrameInfo(); + auto *ZII = static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo()); + auto &Regs = Subtarget.getSpecialRegisters<SystemZXPLINK64Registers>(); + + // Skip the return instruction. + assert(MBBI->isReturn() && "Can only insert epilogue into returning blocks"); + + uint64_t StackSize = MFFrame.getStackSize(); + if (StackSize) { + unsigned SPReg = Regs.getStackPointerRegister(); + if (ZFI->getRestoreGPRRegs().LowGPR != SPReg) { + DebugLoc DL = MBBI->getDebugLoc(); + emitIncrement(MBB, MBBI, DL, SPReg, StackSize, ZII); + } + } +} bool SystemZXPLINKFrameLowering::hasFP(const MachineFunction &MF) const { - return false; + return (MF.getFrameInfo().hasVarSizedObjects()); +} + +void SystemZXPLINKFrameLowering::processFunctionBeforeFrameFinalized( + MachineFunction &MF, RegScavenger *RS) const { + MachineFrameInfo &MFFrame = MF.getFrameInfo(); + const SystemZSubtarget &Subtarget = MF.getSubtarget<SystemZSubtarget>(); + auto &Regs = Subtarget.getSpecialRegisters<SystemZXPLINK64Registers>(); + + // Setup stack frame offset + MFFrame.setOffsetAdjustment(Regs.getStackPointerBias()); } diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.h b/llvm/lib/Target/SystemZ/SystemZFrameLowering.h index af219da79c32..106b9e8ebe06 100644 --- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.h +++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.h @@ -115,11 +115,20 @@ public: ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const override; + bool + 
restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBII, + MutableArrayRef<CalleeSavedInfo> CSI, + const TargetRegisterInfo *TRI) const override; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; bool hasFP(const MachineFunction &MF) const override; + + void processFunctionBeforeFrameFinalized(MachineFunction &MF, + RegScavenger *RS) const override; }; } // end namespace llvm diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 71432218068e..24de52850771 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -1500,8 +1500,16 @@ SDValue SystemZTargetLowering::LowerFormalArguments( assert(VA.isMemLoc() && "Argument not register or memory"); // Create the frame index object for this incoming parameter. - int FI = MFI.CreateFixedObject(LocVT.getSizeInBits() / 8, - VA.getLocMemOffset(), true); + // FIXME: Pre-include call frame size in the offset, should not + // need to manually add it here. + int64_t ArgSPOffset = VA.getLocMemOffset(); + if (Subtarget.isTargetXPLINK64()) { + auto &XPRegs = + Subtarget.getSpecialRegisters<SystemZXPLINK64Registers>(); + ArgSPOffset += XPRegs.getCallFrameSize(); + } + int FI = + MFI.CreateFixedObject(LocVT.getSizeInBits() / 8, ArgSPOffset, true); // Create the SelectionDAG nodes corresponding to a load // from this parameter. 
Unpromoted ints and floats are @@ -5714,6 +5722,7 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const { OPCODE(OC); OPCODE(XC); OPCODE(CLC); + OPCODE(MEMSET_MVC); OPCODE(STPCPY); OPCODE(STRCMP); OPCODE(SEARCH_STRING); @@ -7860,8 +7869,10 @@ MachineBasicBlock *SystemZTargetLowering::emitExt128(MachineInstr &MI, return MBB; } -MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper( - MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode) const { +MachineBasicBlock * +SystemZTargetLowering::emitMemMemWrapper(MachineInstr &MI, + MachineBasicBlock *MBB, + unsigned Opcode, bool IsMemset) const { MachineFunction &MF = *MBB->getParent(); const SystemZInstrInfo *TII = static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo()); @@ -7870,18 +7881,64 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper( MachineOperand DestBase = earlyUseOperand(MI.getOperand(0)); uint64_t DestDisp = MI.getOperand(1).getImm(); - MachineOperand SrcBase = earlyUseOperand(MI.getOperand(2)); - uint64_t SrcDisp = MI.getOperand(3).getImm(); - MachineOperand &LengthMO = MI.getOperand(4); + MachineOperand SrcBase = MachineOperand::CreateReg(0U, false); + uint64_t SrcDisp; + + // Fold the displacement Disp if it is out of range. + auto foldDisplIfNeeded = [&](MachineOperand &Base, uint64_t &Disp) -> void { + if (!isUInt<12>(Disp)) { + Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass); + unsigned Opcode = TII->getOpcodeForOffset(SystemZ::LA, Disp); + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(Opcode), Reg) + .add(Base).addImm(Disp).addReg(0); + Base = MachineOperand::CreateReg(Reg, false); + Disp = 0; + } + }; + + if (!IsMemset) { + SrcBase = earlyUseOperand(MI.getOperand(2)); + SrcDisp = MI.getOperand(3).getImm(); + } else { + SrcBase = DestBase; + SrcDisp = DestDisp++; + foldDisplIfNeeded(DestBase, DestDisp); + } + + MachineOperand &LengthMO = MI.getOperand(IsMemset ? 
2 : 4); bool IsImmForm = LengthMO.isImm(); bool IsRegForm = !IsImmForm; + // Build and insert one Opcode of Length, with special treatment for memset. + auto insertMemMemOp = [&](MachineBasicBlock *InsMBB, + MachineBasicBlock::iterator InsPos, + MachineOperand DBase, uint64_t DDisp, + MachineOperand SBase, uint64_t SDisp, + unsigned Length) -> void { + assert(Length > 0 && Length <= 256 && "Building memory op with bad length."); + if (IsMemset) { + MachineOperand ByteMO = earlyUseOperand(MI.getOperand(3)); + if (ByteMO.isImm()) + BuildMI(*InsMBB, InsPos, DL, TII->get(SystemZ::MVI)) + .add(SBase).addImm(SDisp).add(ByteMO); + else + BuildMI(*InsMBB, InsPos, DL, TII->get(SystemZ::STC)) + .add(ByteMO).add(SBase).addImm(SDisp).addReg(0); + if (--Length == 0) + return; + } + BuildMI(*MBB, InsPos, DL, TII->get(Opcode)) + .add(DBase).addImm(DDisp).addImm(Length) + .add(SBase).addImm(SDisp) + .setMemRefs(MI.memoperands()); + }; + bool NeedsLoop = false; uint64_t ImmLength = 0; - Register LenMinus1Reg = SystemZ::NoRegister; + Register LenAdjReg = SystemZ::NoRegister; if (IsImmForm) { ImmLength = LengthMO.getImm(); - ImmLength++; // Add back the '1' subtracted originally. + ImmLength += IsMemset ? 2 : 1; // Add back the subtracted adjustment. 
if (ImmLength == 0) { MI.eraseFromParent(); return MBB; @@ -7905,7 +7962,7 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper( NeedsLoop = true; } else { NeedsLoop = true; - LenMinus1Reg = LengthMO.getReg(); + LenAdjReg = LengthMO.getReg(); } // When generating more than one CLC, all but the last will need to @@ -7923,17 +7980,17 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper( ImmLength &= 255; } else { BuildMI(*MBB, MI, DL, TII->get(SystemZ::SRLG), StartCountReg) - .addReg(LenMinus1Reg) + .addReg(LenAdjReg) .addReg(0) .addImm(8); } + bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase); auto loadZeroAddress = [&]() -> MachineOperand { Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass); BuildMI(*MBB, MI, DL, TII->get(SystemZ::LGHI), Reg).addImm(0); return MachineOperand::CreateReg(Reg, false); }; - bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase); if (DestBase.isReg() && DestBase.getReg() == SystemZ::NoRegister) DestBase = loadZeroAddress(); if (SrcBase.isReg() && SrcBase.getReg() == SystemZ::NoRegister) @@ -7968,14 +8025,41 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper( DoneMBB = SystemZ::emitBlockAfter(NextMBB); // MBB: - // # Jump to AllDoneMBB if LenMinus1Reg is -1, or fall thru to StartMBB. + // # Jump to AllDoneMBB if LenAdjReg means 0, or fall thru to StartMBB. BuildMI(MBB, DL, TII->get(SystemZ::CGHI)) - .addReg(LenMinus1Reg).addImm(-1); + .addReg(LenAdjReg).addImm(IsMemset ? -2 : -1); BuildMI(MBB, DL, TII->get(SystemZ::BRC)) .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_EQ) .addMBB(AllDoneMBB); MBB->addSuccessor(AllDoneMBB); - MBB->addSuccessor(StartMBB); + if (!IsMemset) + MBB->addSuccessor(StartMBB); + else { + // MemsetOneCheckMBB: + // # Jump to MemsetOneMBB for a memset of length 1, or + // # fall thru to StartMBB. 
+ MachineBasicBlock *MemsetOneCheckMBB = SystemZ::emitBlockAfter(MBB); + MachineBasicBlock *MemsetOneMBB = SystemZ::emitBlockAfter(&*MF.rbegin()); + MBB->addSuccessor(MemsetOneCheckMBB); + MBB = MemsetOneCheckMBB; + BuildMI(MBB, DL, TII->get(SystemZ::CGHI)) + .addReg(LenAdjReg).addImm(-1); + BuildMI(MBB, DL, TII->get(SystemZ::BRC)) + .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_EQ) + .addMBB(MemsetOneMBB); + MBB->addSuccessor(MemsetOneMBB, {10, 100}); + MBB->addSuccessor(StartMBB, {90, 100}); + + // MemsetOneMBB: + // # Jump back to AllDoneMBB after a single MVI or STC. + MBB = MemsetOneMBB; + insertMemMemOp(MBB, MBB->end(), + MachineOperand::CreateReg(StartDestReg, false), DestDisp, + MachineOperand::CreateReg(StartSrcReg, false), SrcDisp, + 1); + BuildMI(MBB, DL, TII->get(SystemZ::J)).addMBB(AllDoneMBB); + MBB->addSuccessor(AllDoneMBB); + } // StartMBB: // # Jump to DoneMBB if %StartCountReg is zero, or fall through to LoopMBB. @@ -8032,10 +8116,10 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper( if (Opcode == SystemZ::MVC) BuildMI(MBB, DL, TII->get(SystemZ::PFD)) .addImm(SystemZ::PFD_WRITE) - .addReg(ThisDestReg).addImm(DestDisp + 768).addReg(0); - BuildMI(MBB, DL, TII->get(Opcode)) - .addReg(ThisDestReg).addImm(DestDisp).addImm(256) - .addReg(ThisSrcReg).addImm(SrcDisp); + .addReg(ThisDestReg).addImm(DestDisp - IsMemset + 768).addReg(0); + insertMemMemOp(MBB, MBB->end(), + MachineOperand::CreateReg(ThisDestReg, false), DestDisp, + MachineOperand::CreateReg(ThisSrcReg, false), SrcDisp, 256); if (EndMBB) { BuildMI(MBB, DL, TII->get(SystemZ::BRC)) .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE) @@ -8075,7 +8159,7 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper( // # Make PHIs for RemDestReg/RemSrcReg as the loop may or may not run. // # Use EXecute Relative Long for the remainder of the bytes. The target // instruction of the EXRL will have a length field of 1 since 0 is an - // illegal value. 
The number of bytes processed becomes (%LenMinus1Reg & + // illegal value. The number of bytes processed becomes (%LenAdjReg & // 0xff) + 1. // # Fall through to AllDoneMBB. Register RemSrcReg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass); @@ -8088,10 +8172,14 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper( BuildMI(MBB, DL, TII->get(SystemZ::PHI), RemSrcReg) .addReg(StartSrcReg).addMBB(StartMBB) .addReg(NextSrcReg).addMBB(NextMBB); + if (IsMemset) + insertMemMemOp(MBB, MBB->end(), + MachineOperand::CreateReg(RemDestReg, false), DestDisp, + MachineOperand::CreateReg(RemSrcReg, false), SrcDisp, 1); MachineInstrBuilder EXRL_MIB = BuildMI(MBB, DL, TII->get(SystemZ::EXRL_Pseudo)) .addImm(Opcode) - .addReg(LenMinus1Reg) + .addReg(LenAdjReg) .addReg(RemDestReg).addImm(DestDisp) .addReg(RemSrcReg).addImm(SrcDisp); MBB->addSuccessor(AllDoneMBB); @@ -8107,32 +8195,10 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper( while (ImmLength > 0) { uint64_t ThisLength = std::min(ImmLength, uint64_t(256)); // The previous iteration might have created out-of-range displacements. - // Apply them using LAY if so. - if (!isUInt<12>(DestDisp)) { - Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass); - BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg) - .add(DestBase) - .addImm(DestDisp) - .addReg(0); - DestBase = MachineOperand::CreateReg(Reg, false); - DestDisp = 0; - } - if (!isUInt<12>(SrcDisp)) { - Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass); - BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg) - .add(SrcBase) - .addImm(SrcDisp) - .addReg(0); - SrcBase = MachineOperand::CreateReg(Reg, false); - SrcDisp = 0; - } - BuildMI(*MBB, MI, DL, TII->get(Opcode)) - .add(DestBase) - .addImm(DestDisp) - .addImm(ThisLength) - .add(SrcBase) - .addImm(SrcDisp) - .setMemRefs(MI.memoperands()); + // Apply them using LA/LAY if so. 
+ foldDisplIfNeeded(DestBase, DestDisp); + foldDisplIfNeeded(SrcBase, SrcDisp); + insertMemMemOp(MBB, MI, DestBase, DestDisp, SrcBase, SrcDisp, ThisLength); DestDisp += ThisLength; SrcDisp += ThisLength; ImmLength -= ThisLength; @@ -8630,6 +8696,11 @@ MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter( case SystemZ::CLCImm: case SystemZ::CLCReg: return emitMemMemWrapper(MI, MBB, SystemZ::CLC); + case SystemZ::MemsetImmImm: + case SystemZ::MemsetImmReg: + case SystemZ::MemsetRegImm: + case SystemZ::MemsetRegReg: + return emitMemMemWrapper(MI, MBB, SystemZ::MVC, true/*IsMemset*/); case SystemZ::CLSTLoop: return emitStringWrapper(MI, MBB, SystemZ::CLST); case SystemZ::MVSTLoop: diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h index 461f804ca55e..940c0a857ea4 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h @@ -126,6 +126,9 @@ enum NodeType : unsigned { // as for MVC. CLC, + // Use MVC to set a block of memory after storing the first byte. + MEMSET_MVC, + // Use an MVST-based sequence to implement stpcpy(). 
STPCPY, @@ -709,7 +712,8 @@ private: MachineBasicBlock *emitAtomicCmpSwapW(MachineInstr &MI, MachineBasicBlock *BB) const; MachineBasicBlock *emitMemMemWrapper(MachineInstr &MI, MachineBasicBlock *BB, - unsigned Opcode) const; + unsigned Opcode, + bool IsMemset = false) const; MachineBasicBlock *emitStringWrapper(MachineInstr &MI, MachineBasicBlock *BB, unsigned Opcode) const; MachineBasicBlock *emitTransactionBegin(MachineInstr &MI, diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td index cd60fff1ab11..e513befd0d6f 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td +++ b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td @@ -5256,6 +5256,16 @@ class RotateSelectAliasRIEf<RegisterOperand cls1, RegisterOperand cls2> let Constraints = "$R1 = $R1src"; } +class MemsetPseudo<DAGOperand lenop, DAGOperand byteop> + : Pseudo<(outs), (ins bdaddr12only:$dest, lenop:$length, byteop:$B), + [(z_memset_mvc bdaddr12only:$dest, lenop:$length, byteop:$B)]> { + let Defs = [CC]; + let mayLoad = 1; + let mayStore = 1; + let usesCustomInserter = 1; + let hasNoSchedulingInfo = 1; +} + //===----------------------------------------------------------------------===// // Multiclasses that emit both real and pseudo instructions //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td index e4760229fd6b..84f1e0fb428c 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td @@ -510,6 +510,12 @@ let mayLoad = 1, mayStore = 1, Defs = [CC] in { def MVCLU : SideEffectTernaryMemMemRSY<"mvclu", 0xEB8E, GR128, GR128>; } +// Memset[Length][Byte] pseudos. 
+def MemsetImmImm : MemsetPseudo<imm64, imm32zx8trunc>; +def MemsetImmReg : MemsetPseudo<imm64, GR32>; +def MemsetRegImm : MemsetPseudo<ADDR64, imm32zx8trunc>; +def MemsetRegReg : MemsetPseudo<ADDR64, GR32>; + // Move right. let Predicates = [FeatureMiscellaneousExtensions3], mayLoad = 1, mayStore = 1, Uses = [R0L] in diff --git a/llvm/lib/Target/SystemZ/SystemZOperators.td b/llvm/lib/Target/SystemZ/SystemZOperators.td index 927d97233286..9935416559bc 100644 --- a/llvm/lib/Target/SystemZ/SystemZOperators.td +++ b/llvm/lib/Target/SystemZ/SystemZOperators.td @@ -102,6 +102,10 @@ def SDT_ZMemMemLengthCC : SDTypeProfile<1, 3, SDTCisPtrTy<1>, SDTCisPtrTy<2>, SDTCisVT<3, i64>]>; +def SDT_ZMemsetMVC : SDTypeProfile<0, 3, + [SDTCisPtrTy<0>, + SDTCisVT<1, i64>, + SDTCisVT<2, i32>]>; def SDT_ZString : SDTypeProfile<1, 3, [SDTCisPtrTy<0>, SDTCisPtrTy<1>, @@ -413,6 +417,8 @@ def z_xc : SDNode<"SystemZISD::XC", SDT_ZMemMemLength, [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>; def z_clc : SDNode<"SystemZISD::CLC", SDT_ZMemMemLengthCC, [SDNPHasChain, SDNPMayLoad]>; +def z_memset_mvc : SDNode<"SystemZISD::MEMSET_MVC", SDT_ZMemsetMVC, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>; def z_strcmp : SDNode<"SystemZISD::STRCMP", SDT_ZStringCC, [SDNPHasChain, SDNPMayLoad]>; def z_stpcpy : SDNode<"SystemZISD::STPCPY", SDT_ZString, diff --git a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp index f38e93109967..db4b4879b33a 100644 --- a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp @@ -17,29 +17,44 @@ using namespace llvm; #define DEBUG_TYPE "systemz-selectiondag-info" -static SDVTList getMemMemVTs(unsigned Op, SelectionDAG &DAG) { - return Op == SystemZISD::CLC ? DAG.getVTList(MVT::i32, MVT::Other) - : DAG.getVTList(MVT::Other); +static unsigned getMemMemLenAdj(unsigned Op) { + return Op == SystemZISD::MEMSET_MVC ? 
2 : 1; } -// Emit a mem-mem operation after subtracting one from size, which will be -// added back during pseudo expansion. As the Reg case emitted here may be -// converted by DAGCombiner into having an Imm length, they are both emitted -// the same way. +static SDValue createMemMemNode(SelectionDAG &DAG, const SDLoc &DL, unsigned Op, + SDValue Chain, SDValue Dst, SDValue Src, + SDValue LenAdj, SDValue Byte) { + SDVTList VTs = Op == SystemZISD::CLC ? DAG.getVTList(MVT::i32, MVT::Other) + : DAG.getVTList(MVT::Other); + SmallVector<SDValue, 6> Ops; + if (Op == SystemZISD::MEMSET_MVC) + Ops = { Chain, Dst, LenAdj, Byte }; + else + Ops = { Chain, Dst, Src, LenAdj }; + return DAG.getNode(Op, DL, VTs, Ops); +} + +// Emit a mem-mem operation after subtracting one (or two for memset) from +// size, which will be added back during pseudo expansion. As the Reg case +// emitted here may be converted by DAGCombiner into having an Imm length, +// they are both emitted the same way. static SDValue emitMemMemImm(SelectionDAG &DAG, const SDLoc &DL, unsigned Op, SDValue Chain, SDValue Dst, SDValue Src, - uint64_t Size) { - return DAG.getNode(Op, DL, getMemMemVTs(Op, DAG), Chain, Dst, Src, - DAG.getConstant(Size - 1, DL, Src.getValueType())); + uint64_t Size, SDValue Byte = SDValue()) { + unsigned Adj = getMemMemLenAdj(Op); + assert(Size >= Adj && "Adjusted length overflow."); + SDValue LenAdj = DAG.getConstant(Size - Adj, DL, Dst.getValueType()); + return createMemMemNode(DAG, DL, Op, Chain, Dst, Src, LenAdj, Byte); } static SDValue emitMemMemReg(SelectionDAG &DAG, const SDLoc &DL, unsigned Op, SDValue Chain, SDValue Dst, SDValue Src, - SDValue Size) { - SDValue LenMinus1 = DAG.getNode(ISD::ADD, DL, MVT::i64, - DAG.getZExtOrTrunc(Size, DL, MVT::i64), - DAG.getConstant(-1, DL, MVT::i64)); - return DAG.getNode(Op, DL, getMemMemVTs(Op, DAG), Chain, Dst, Src, LenMinus1); + SDValue Size, SDValue Byte = SDValue()) { + int64_t Adj = getMemMemLenAdj(Op); + SDValue LenAdj = 
DAG.getNode(ISD::ADD, DL, MVT::i64, + DAG.getZExtOrTrunc(Size, DL, MVT::i64), + DAG.getConstant(0 - Adj, DL, MVT::i64)); + return createMemMemNode(DAG, DL, Op, Chain, Dst, Src, LenAdj, Byte); } SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemcpy( @@ -127,13 +142,8 @@ SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemset( if (CByte && CByte->getZExtValue() == 0) return emitMemMemImm(DAG, DL, SystemZISD::XC, Chain, Dst, Dst, Bytes); - // Copy the byte to the first location and then use MVC to copy - // it to the rest. - Chain = DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo, Alignment); - SDValue DstPlus1 = DAG.getNode(ISD::ADD, DL, PtrVT, Dst, - DAG.getConstant(1, DL, PtrVT)); - return emitMemMemImm(DAG, DL, SystemZISD::MVC, Chain, DstPlus1, Dst, - Bytes - 1); + return emitMemMemImm(DAG, DL, SystemZISD::MEMSET_MVC, Chain, Dst, SDValue(), + Bytes, DAG.getAnyExtOrTrunc(Byte, DL, MVT::i32)); } // Variable length @@ -141,7 +151,8 @@ SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemset( // Handle the special case of a variable length memset of 0 with XC. 
return emitMemMemReg(DAG, DL, SystemZISD::XC, Chain, Dst, Dst, Size); - return SDValue(); + return emitMemMemReg(DAG, DL, SystemZISD::MEMSET_MVC, Chain, Dst, SDValue(), + Size, DAG.getAnyExtOrTrunc(Byte, DL, MVT::i32)); } // Convert the current CC value into an integer that is 0 if CC == 0, diff --git a/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp b/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp index 7e92e4b33812..fd9dc32b04f5 100644 --- a/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp +++ b/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp @@ -84,6 +84,8 @@ class VEAsmParser : public MCTargetAsmParser { StringRef splitMnemonic(StringRef Name, SMLoc NameLoc, OperandVector *Operands); + bool parseLiteralValues(unsigned Size, SMLoc L); + public: VEAsmParser(const MCSubtargetInfo &sti, MCAsmParser &parser, const MCInstrInfo &MII, const MCTargetOptions &Options) @@ -994,10 +996,43 @@ bool VEAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, } bool VEAsmParser::ParseDirective(AsmToken DirectiveID) { + std::string IDVal = DirectiveID.getIdentifier().lower(); + + // Defines VE specific directives. Reference is "Vector Engine Assembly + // Language Reference Manual": + // https://www.hpc.nec/documents/sdk/pdfs/VectorEngine-as-manual-v1.3.pdf + + // The .word is 4 bytes long on VE. + if (IDVal == ".word") + return parseLiteralValues(4, DirectiveID.getLoc()); + + // The .long is 8 bytes long on VE. + if (IDVal == ".long") + return parseLiteralValues(8, DirectiveID.getLoc()); + + // The .llong is 8 bytes long on VE. + if (IDVal == ".llong") + return parseLiteralValues(8, DirectiveID.getLoc()); + // Let the MC layer to handle other directives. 
return true; } +/// parseLiteralValues +/// ::= .word expression [, expression]* +/// ::= .long expression [, expression]* +/// ::= .llong expression [, expression]* +bool VEAsmParser::parseLiteralValues(unsigned Size, SMLoc L) { + auto parseOne = [&]() -> bool { + const MCExpr *Value; + if (getParser().parseExpression(Value)) + return true; + getParser().getStreamer().emitValue(Value, Size, L); + return false; + }; + return (parseMany(parseOne)); +} + /// Extract \code @lo32/@hi32/etc \endcode modifier from expression. /// Recursively scan the expression and check for VK_VE_HI32/LO32/etc /// symbol variants. If all symbols with modifier use the same diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp index 29c209934680..38d163b37080 100644 --- a/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp +++ b/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp @@ -42,6 +42,7 @@ static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) { case VE::fixup_ve_tpoff_hi32: return (Value >> 32) & 0xffffffff; case VE::fixup_ve_reflong: + case VE::fixup_ve_srel32: case VE::fixup_ve_lo32: case VE::fixup_ve_pc_lo32: case VE::fixup_ve_got_lo32: @@ -68,6 +69,7 @@ static unsigned getFixupKindNumBytes(unsigned Kind) { case FK_Data_4: case FK_PCRel_4: case VE::fixup_ve_reflong: + case VE::fixup_ve_srel32: case VE::fixup_ve_hi32: case VE::fixup_ve_lo32: case VE::fixup_ve_pc_hi32: @@ -103,6 +105,7 @@ public: const static MCFixupKindInfo Infos[VE::NumTargetFixupKinds] = { // name, offset, bits, flags {"fixup_ve_reflong", 0, 32, 0}, + {"fixup_ve_srel32", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, {"fixup_ve_hi32", 0, 32, 0}, {"fixup_ve_lo32", 0, 32, 0}, {"fixup_ve_pc_hi32", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp index 741e8320a941..ae065407409a 100644 --- a/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp +++ 
b/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp @@ -9,6 +9,7 @@ #include "VEFixupKinds.h" #include "VEMCExpr.h" #include "VEMCTargetDesc.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCObjectWriter.h" @@ -46,16 +47,29 @@ unsigned VEELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, if (IsPCRel) { switch (Fixup.getTargetKind()) { default: - llvm_unreachable("Unimplemented fixup -> relocation"); + Ctx.reportError(Fixup.getLoc(), "Unsupported pc-relative fixup kind"); + return ELF::R_VE_NONE; + case FK_Data_1: case FK_PCRel_1: - llvm_unreachable("Unimplemented fixup fk_data_1 -> relocation"); + Ctx.reportError(Fixup.getLoc(), + "1-byte pc-relative data relocation is not supported"); + return ELF::R_VE_NONE; + case FK_Data_2: case FK_PCRel_2: - llvm_unreachable("Unimplemented fixup fk_data_2 -> relocation"); - // FIXME: relative kind? + Ctx.reportError(Fixup.getLoc(), + "2-byte pc-relative data relocation is not supported"); + return ELF::R_VE_NONE; + case FK_Data_4: case FK_PCRel_4: - return ELF::R_VE_REFLONG; + return ELF::R_VE_SREL32; + case FK_Data_8: case FK_PCRel_8: - return ELF::R_VE_REFQUAD; + Ctx.reportError(Fixup.getLoc(), + "8-byte pc-relative data relocation is not supported"); + return ELF::R_VE_NONE; + case VE::fixup_ve_reflong: + case VE::fixup_ve_srel32: + return ELF::R_VE_SREL32; case VE::fixup_ve_pc_hi32: return ELF::R_VE_PC_HI32; case VE::fixup_ve_pc_lo32: @@ -65,25 +79,36 @@ unsigned VEELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, switch (Fixup.getTargetKind()) { default: - llvm_unreachable("Unimplemented fixup -> relocation"); + Ctx.reportError(Fixup.getLoc(), "Unknown ELF relocation type"); + return ELF::R_VE_NONE; case FK_Data_1: - llvm_unreachable("Unimplemented fixup fk_data_1 -> relocation"); + Ctx.reportError(Fixup.getLoc(), "1-byte data relocation is not supported"); + return ELF::R_VE_NONE; case FK_Data_2: - 
llvm_unreachable("Unimplemented fixup fk_data_2 -> relocation"); + Ctx.reportError(Fixup.getLoc(), "2-byte data relocation is not supported"); + return ELF::R_VE_NONE; case FK_Data_4: return ELF::R_VE_REFLONG; case FK_Data_8: return ELF::R_VE_REFQUAD; case VE::fixup_ve_reflong: return ELF::R_VE_REFLONG; + case VE::fixup_ve_srel32: + Ctx.reportError(Fixup.getLoc(), + "A non pc-relative srel32 relocation is not supported"); + return ELF::R_VE_NONE; case VE::fixup_ve_hi32: return ELF::R_VE_HI32; case VE::fixup_ve_lo32: return ELF::R_VE_LO32; case VE::fixup_ve_pc_hi32: - llvm_unreachable("Unimplemented fixup pc_hi32 -> relocation"); + Ctx.reportError(Fixup.getLoc(), + "A non pc-relative pc_hi32 relocation is not supported"); + return ELF::R_VE_NONE; case VE::fixup_ve_pc_lo32: - llvm_unreachable("Unimplemented fixup pc_lo32 -> relocation"); + Ctx.reportError(Fixup.getLoc(), + "A non pc-relative pc_lo32 relocation is not supported"); + return ELF::R_VE_NONE; case VE::fixup_ve_got_hi32: return ELF::R_VE_GOT_HI32; case VE::fixup_ve_got_lo32: diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEFixupKinds.h b/llvm/lib/Target/VE/MCTargetDesc/VEFixupKinds.h index 5d5dc1c5c891..46b995cee840 100644 --- a/llvm/lib/Target/VE/MCTargetDesc/VEFixupKinds.h +++ b/llvm/lib/Target/VE/MCTargetDesc/VEFixupKinds.h @@ -17,6 +17,9 @@ enum Fixups { /// fixup_ve_reflong - 32-bit fixup corresponding to foo fixup_ve_reflong = FirstTargetFixupKind, + /// fixup_ve_srel32 - 32-bit fixup corresponding to foo for relative branch + fixup_ve_srel32, + /// fixup_ve_hi32 - 32-bit fixup corresponding to foo@hi fixup_ve_hi32, diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp index d50d8fcae9da..65bb0cf8b0d7 100644 --- a/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp +++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp @@ -102,11 +102,11 @@ unsigned VEMCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCSubtargetInfo &STI) const 
{ if (MO.isReg()) return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()); - if (MO.isImm()) - return MO.getImm(); + return static_cast<unsigned>(MO.getImm()); assert(MO.isExpr()); + const MCExpr *Expr = MO.getExpr(); if (const VEMCExpr *SExpr = dyn_cast<VEMCExpr>(Expr)) { MCFixupKind Kind = (MCFixupKind)SExpr->getFixupKind(); @@ -131,7 +131,7 @@ VEMCCodeEmitter::getBranchTargetOpValue(const MCInst &MI, unsigned OpNo, return getMachineOpValue(MI, MO, Fixups, STI); Fixups.push_back( - MCFixup::create(0, MO.getExpr(), (MCFixupKind)VE::fixup_ve_pc_lo32)); + MCFixup::create(0, MO.getExpr(), (MCFixupKind)VE::fixup_ve_srel32)); return 0; } diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp index a3ce3b3309be..4d45918ad0aa 100644 --- a/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp +++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp @@ -12,11 +12,12 @@ //===----------------------------------------------------------------------===// #include "VEMCExpr.h" +#include "llvm/BinaryFormat/ELF.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCObjectStreamer.h" #include "llvm/MC/MCSymbolELF.h" -#include "llvm/BinaryFormat/ELF.h" +#include "llvm/MC/MCValue.h" using namespace llvm; @@ -174,7 +175,13 @@ VE::Fixups VEMCExpr::getFixupKind(VEMCExpr::VariantKind Kind) { bool VEMCExpr::evaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout, const MCFixup *Fixup) const { - return getSubExpr()->evaluateAsRelocatable(Res, Layout, Fixup); + if (!getSubExpr()->evaluateAsRelocatable(Res, Layout, Fixup)) + return false; + + Res = + MCValue::get(Res.getSymA(), Res.getSymB(), Res.getConstant(), getKind()); + + return true; } static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) { diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp index 32315543826a..5ef223d6030b 100644 --- a/llvm/lib/Target/VE/VEISelLowering.cpp +++ 
b/llvm/lib/Target/VE/VEISelLowering.cpp @@ -1720,7 +1720,7 @@ SDValue VETargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::EXTRACT_VECTOR_ELT: return lowerEXTRACT_VECTOR_ELT(Op, DAG); -#define ADD_BINARY_VVP_OP(VVP_NAME, ISD_NAME) case ISD::ISD_NAME: +#define ADD_BINARY_VVP_OP(VVP_NAME, VP_NAME, ISD_NAME) case ISD::ISD_NAME: #include "VVPNodes.def" return lowerToVVP(Op, DAG); } diff --git a/llvm/lib/Target/VE/VVPInstrInfo.td b/llvm/lib/Target/VE/VVPInstrInfo.td index 2c88d5099a7b..99566e91ec11 100644 --- a/llvm/lib/Target/VE/VVPInstrInfo.td +++ b/llvm/lib/Target/VE/VVPInstrInfo.td @@ -29,6 +29,16 @@ def SDTIntBinOpVVP : SDTypeProfile<1, 4, [ // vp_add, vp_and, etc. IsVLVT<4> ]>; +// BinaryFPOp(x,y,mask,vl) +def SDTFPBinOpVVP : SDTypeProfile<1, 4, [ // vvp_fadd, etc. + SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisFP<0>, + SDTCisInt<3>, + SDTCisSameNumEltsAs<0, 3>, + IsVLVT<4> +]>; + // Binary operator commutative pattern. class vvp_commutative<SDNode RootOp> : PatFrags< @@ -40,7 +50,32 @@ class vvp_commutative<SDNode RootOp> : def vvp_add : SDNode<"VEISD::VVP_ADD", SDTIntBinOpVVP>; def c_vvp_add : vvp_commutative<vvp_add>; +def vvp_sub : SDNode<"VEISD::VVP_SUB", SDTIntBinOpVVP>; + +def vvp_mul : SDNode<"VEISD::VVP_MUL", SDTIntBinOpVVP>; +def c_vvp_mul : vvp_commutative<vvp_mul>; + +def vvp_sdiv : SDNode<"VEISD::VVP_SDIV", SDTIntBinOpVVP>; +def vvp_udiv : SDNode<"VEISD::VVP_UDIV", SDTIntBinOpVVP>; + def vvp_and : SDNode<"VEISD::VVP_AND", SDTIntBinOpVVP>; def c_vvp_and : vvp_commutative<vvp_and>; +def vvp_or : SDNode<"VEISD::VVP_OR", SDTIntBinOpVVP>; +def c_vvp_or : vvp_commutative<vvp_or>; + +def vvp_xor : SDNode<"VEISD::VVP_XOR", SDTIntBinOpVVP>; +def c_vvp_xor : vvp_commutative<vvp_xor>; + +def vvp_srl : SDNode<"VEISD::VVP_SRL", SDTIntBinOpVVP>; +def vvp_sra : SDNode<"VEISD::VVP_SRA", SDTIntBinOpVVP>; +def vvp_shl : SDNode<"VEISD::VVP_SHL", SDTIntBinOpVVP>; + +def vvp_fadd : SDNode<"VEISD::VVP_FADD", SDTFPBinOpVVP>; +def c_vvp_fadd 
: vvp_commutative<vvp_fadd>; +def vvp_fsub : SDNode<"VEISD::VVP_FSUB", SDTFPBinOpVVP>; +def vvp_fmul : SDNode<"VEISD::VVP_FMUL", SDTFPBinOpVVP>; +def c_vvp_fmul : vvp_commutative<vvp_fmul>; +def vvp_fdiv : SDNode<"VEISD::VVP_FDIV", SDTFPBinOpVVP>; + // } Binary Operators diff --git a/llvm/lib/Target/VE/VVPInstrPatternsVec.td b/llvm/lib/Target/VE/VVPInstrPatternsVec.td index ac03e0bf627e..8d5d9d103547 100644 --- a/llvm/lib/Target/VE/VVPInstrPatternsVec.td +++ b/llvm/lib/Target/VE/VVPInstrPatternsVec.td @@ -17,54 +17,177 @@ //===----------------------------------------------------------------------===// include "VVPInstrInfo.td" -multiclass VectorBinaryArith< - SDPatternOperator OpNode, - ValueType ScalarVT, ValueType DataVT, ValueType MaskVT, - string OpBaseName> { - // No mask. +multiclass Binary_rv<SDPatternOperator OpNode, + ValueType ScalarVT, ValueType DataVT, + ValueType MaskVT, string OpBaseName> { + // Masked with select, broadcast. + // TODO + + // Unmasked, broadcast. def : Pat<(OpNode - (any_broadcast ScalarVT:$sx), - DataVT:$vy, (MaskVT true_mask), i32:$avl), + (any_broadcast ScalarVT:$sx), DataVT:$vy, + (MaskVT true_mask), + i32:$avl), (!cast<Instruction>(OpBaseName#"rvl") ScalarVT:$sx, $vy, $avl)>; - def : Pat<(OpNode DataVT:$vx, DataVT:$vy, (MaskVT true_mask), i32:$avl), + // Masked, broadcast. + def : Pat<(OpNode + (any_broadcast ScalarVT:$sx), DataVT:$vy, + MaskVT:$mask, + i32:$avl), + (!cast<Instruction>(OpBaseName#"rvml") + ScalarVT:$sx, $vy, $mask, $avl)>; +} + +multiclass Binary_vr<SDPatternOperator OpNode, + ValueType ScalarVT, ValueType DataVT, + ValueType MaskVT, string OpBaseName> { + // Masked with select, broadcast. + // TODO + + // Unmasked, broadcast. + def : Pat<(OpNode + DataVT:$vx, (any_broadcast ScalarVT:$sy), + (MaskVT true_mask), + i32:$avl), + (!cast<Instruction>(OpBaseName#"vrl") + $vx, ScalarVT:$sy, $avl)>; + // Masked, broadcast. 
+ def : Pat<(OpNode + DataVT:$vx, (any_broadcast ScalarVT:$sy), + MaskVT:$mask, + i32:$avl), + (!cast<Instruction>(OpBaseName#"vrml") + $vx, ScalarVT:$sy, $mask, $avl)>; +} + +multiclass Binary_vv<SDPatternOperator OpNode, + ValueType DataVT, + ValueType MaskVT, string OpBaseName> { + // Masked with select. + // TODO + + // Unmasked. + def : Pat<(OpNode + DataVT:$vx, DataVT:$vy, + (MaskVT true_mask), + i32:$avl), (!cast<Instruction>(OpBaseName#"vvl") $vx, $vy, $avl)>; - // Mask. + // Masked. def : Pat<(OpNode - (any_broadcast ScalarVT:$sx), - DataVT:$vy, MaskVT:$mask, i32:$avl), - (!cast<Instruction>(OpBaseName#"rvml") - ScalarVT:$sx, $vy, $mask, $avl)>; - def : Pat<(OpNode DataVT:$vx, DataVT:$vy, MaskVT:$mask, i32:$avl), + DataVT:$vx, DataVT:$vy, + MaskVT:$mask, + i32:$avl), (!cast<Instruction>(OpBaseName#"vvml") $vx, $vy, $mask, $avl)>; +} - // TODO We do not specify patterns for the immediate variants here. There - // will be an immediate folding pass that takes care of switching to the - // immediate variant where applicable. +multiclass Binary_rv_vv< + SDPatternOperator OpNode, + ValueType ScalarVT, ValueType DataVT, ValueType MaskVT, + string OpBaseName> { + defm : Binary_rv<OpNode, ScalarVT, DataVT, MaskVT, OpBaseName>; + defm : Binary_vv<OpNode, DataVT, MaskVT, OpBaseName>; +} + +multiclass Binary_vr_vv< + SDPatternOperator OpNode, + ValueType ScalarVT, ValueType DataVT, ValueType MaskVT, + string OpBaseName> { + defm : Binary_vr<OpNode, ScalarVT, DataVT, MaskVT, OpBaseName>; + defm : Binary_vv<OpNode, DataVT, MaskVT, OpBaseName>; +} - // TODO Fold vvp_select into passthru. 
+multiclass Binary_rv_vr_vv< + SDPatternOperator OpNode, + ValueType ScalarVT, ValueType DataVT, ValueType MaskVT, + string OpBaseName> { + defm : Binary_rv<OpNode, ScalarVT, DataVT, MaskVT, OpBaseName>; + defm : Binary_vr_vv<OpNode, ScalarVT, DataVT, MaskVT, OpBaseName>; } // Expand both 64bit and 32 bit variant (256 elements) -multiclass VectorBinaryArith_ShortLong< +multiclass Binary_rv_vv_ShortLong< + SDPatternOperator OpNode, + ValueType LongScalarVT, ValueType LongDataVT, string LongOpBaseName, + ValueType ShortScalarVT, ValueType ShortDataVT, string ShortOpBaseName> { + defm : Binary_rv_vv<OpNode, + LongScalarVT, LongDataVT, v256i1, + LongOpBaseName>; + defm : Binary_rv_vv<OpNode, + ShortScalarVT, ShortDataVT, v256i1, + ShortOpBaseName>; +} + +multiclass Binary_vr_vv_ShortLong< + SDPatternOperator OpNode, + ValueType LongScalarVT, ValueType LongDataVT, string LongOpBaseName, + ValueType ShortScalarVT, ValueType ShortDataVT, string ShortOpBaseName> { + defm : Binary_vr_vv<OpNode, + LongScalarVT, LongDataVT, v256i1, + LongOpBaseName>; + defm : Binary_vr_vv<OpNode, + ShortScalarVT, ShortDataVT, v256i1, + ShortOpBaseName>; +} + +multiclass Binary_rv_vr_vv_ShortLong< SDPatternOperator OpNode, ValueType LongScalarVT, ValueType LongDataVT, string LongOpBaseName, ValueType ShortScalarVT, ValueType ShortDataVT, string ShortOpBaseName> { - defm : VectorBinaryArith<OpNode, - LongScalarVT, LongDataVT, v256i1, - LongOpBaseName>; - defm : VectorBinaryArith<OpNode, - ShortScalarVT, ShortDataVT, v256i1, - ShortOpBaseName>; + defm : Binary_rv_vr_vv<OpNode, + LongScalarVT, LongDataVT, v256i1, + LongOpBaseName>; + defm : Binary_rv_vr_vv<OpNode, + ShortScalarVT, ShortDataVT, v256i1, + ShortOpBaseName>; } +defm : Binary_rv_vv_ShortLong<c_vvp_add, + i64, v256i64, "VADDSL", + i32, v256i32, "VADDSWSX">; +defm : Binary_rv_vv_ShortLong<vvp_sub, + i64, v256i64, "VSUBSL", + i32, v256i32, "VSUBSWSX">; +defm : Binary_rv_vv_ShortLong<c_vvp_mul, + i64, v256i64, "VMULSL", + i32, v256i32, 
"VMULSWSX">; +defm : Binary_rv_vr_vv_ShortLong<vvp_sdiv, + i64, v256i64, "VDIVSL", + i32, v256i32, "VDIVSWSX">; +defm : Binary_rv_vr_vv_ShortLong<vvp_udiv, + i64, v256i64, "VDIVUL", + i32, v256i32, "VDIVUW">; +defm : Binary_rv_vv_ShortLong<c_vvp_and, + i64, v256i64, "VAND", + i32, v256i32, "PVANDLO">; +defm : Binary_rv_vv_ShortLong<c_vvp_or, + i64, v256i64, "VOR", + i32, v256i32, "PVORLO">; +defm : Binary_rv_vv_ShortLong<c_vvp_xor, + i64, v256i64, "VXOR", + i32, v256i32, "PVXORLO">; +defm : Binary_vr_vv_ShortLong<vvp_shl, + i64, v256i64, "VSLL", + i32, v256i32, "PVSLLLO">; +defm : Binary_vr_vv_ShortLong<vvp_sra, + i64, v256i64, "VSRAL", + i32, v256i32, "PVSRALO">; +defm : Binary_vr_vv_ShortLong<vvp_srl, + i64, v256i64, "VSRL", + i32, v256i32, "PVSRLLO">; -defm : VectorBinaryArith_ShortLong<c_vvp_add, - i64, v256i64, "VADDSL", - i32, v256i32, "VADDSWSX">; -defm : VectorBinaryArith_ShortLong<c_vvp_and, - i64, v256i64, "VAND", - i32, v256i32, "PVANDLO">; +defm : Binary_rv_vv_ShortLong<c_vvp_fadd, + f64, v256f64, "VFADDD", + f32, v256f32, "PVFADDUP">; +defm : Binary_rv_vv_ShortLong<c_vvp_fmul, + f64, v256f64, "VFMULD", + f32, v256f32, "PVFMULUP">; +defm : Binary_rv_vv_ShortLong<vvp_fsub, + f64, v256f64, "VFSUBD", + f32, v256f32, "PVFSUBUP">; +defm : Binary_rv_vr_vv_ShortLong<vvp_fdiv, + f64, v256f64, "VFDIVD", + f32, v256f32, "VFDIVS">; diff --git a/llvm/lib/Target/VE/VVPNodes.def b/llvm/lib/Target/VE/VVPNodes.def index a68402e9ea10..8a9231f7d3e6 100644 --- a/llvm/lib/Target/VE/VVPNodes.def +++ b/llvm/lib/Target/VE/VVPNodes.def @@ -28,14 +28,38 @@ /// \p VVPName is a VVP Binary operator. /// \p SDNAME is the generic SD opcode corresponding to \p VVPName. 
#ifndef ADD_BINARY_VVP_OP -#define ADD_BINARY_VVP_OP(X,Y) ADD_VVP_OP(X,Y) HANDLE_VP_TO_VVP(VP_##Y, X) +#define ADD_BINARY_VVP_OP(VVPNAME,VPNAME,SDNAME) \ + ADD_VVP_OP(VVPNAME,SDNAME) \ + HANDLE_VP_TO_VVP(VPNAME, VVPNAME) +#endif + +#ifndef ADD_BINARY_VVP_OP_COMPACT +#define ADD_BINARY_VVP_OP_COMPACT(NAME) \ + ADD_BINARY_VVP_OP(VVP_##NAME,VP_##NAME,NAME) #endif // Integer arithmetic. -ADD_BINARY_VVP_OP(VVP_ADD,ADD) +ADD_BINARY_VVP_OP_COMPACT(ADD) +ADD_BINARY_VVP_OP_COMPACT(SUB) +ADD_BINARY_VVP_OP_COMPACT(MUL) +ADD_BINARY_VVP_OP_COMPACT(UDIV) +ADD_BINARY_VVP_OP_COMPACT(SDIV) -ADD_BINARY_VVP_OP(VVP_AND,AND) +ADD_BINARY_VVP_OP(VVP_SRA,VP_ASHR,SRA) +ADD_BINARY_VVP_OP(VVP_SRL,VP_LSHR,SRL) +ADD_BINARY_VVP_OP_COMPACT(SHL) + +ADD_BINARY_VVP_OP_COMPACT(AND) +ADD_BINARY_VVP_OP_COMPACT(OR) +ADD_BINARY_VVP_OP_COMPACT(XOR) + +// FP arithmetic. +ADD_BINARY_VVP_OP_COMPACT(FADD) +ADD_BINARY_VVP_OP_COMPACT(FSUB) +ADD_BINARY_VVP_OP_COMPACT(FMUL) +ADD_BINARY_VVP_OP_COMPACT(FDIV) -#undef HANDLE_VP_TO_VVP #undef ADD_BINARY_VVP_OP +#undef ADD_BINARY_VVP_OP_COMPACT #undef ADD_VVP_OP +#undef HANDLE_VP_TO_VVP diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp index 7d1e6c553f81..56689d3ee06b 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp +++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp @@ -571,7 +571,6 @@ public: // proper nesting. 
bool ExpectBlockType = false; bool ExpectFuncType = false; - bool ExpectHeapType = false; std::unique_ptr<WebAssemblyOperand> FunctionTable; if (Name == "block") { push(Block); @@ -624,8 +623,6 @@ public: if (parseFunctionTableOperand(&FunctionTable)) return true; ExpectFuncType = true; - } else if (Name == "ref.null") { - ExpectHeapType = true; } if (ExpectFuncType || (ExpectBlockType && Lexer.is(AsmToken::LParen))) { @@ -670,15 +667,6 @@ public: return error("Unknown block type: ", Id); addBlockTypeOperand(Operands, NameLoc, BT); Parser.Lex(); - } else if (ExpectHeapType) { - auto HeapType = WebAssembly::parseHeapType(Id.getString()); - if (HeapType == WebAssembly::HeapType::Invalid) { - return error("Expected a heap type: ", Id); - } - Operands.push_back(std::make_unique<WebAssemblyOperand>( - WebAssemblyOperand::Integer, Id.getLoc(), Id.getEndLoc(), - WebAssemblyOperand::IntOp{static_cast<int64_t>(HeapType)})); - Parser.Lex(); } else { // Assume this identifier is a label. const MCExpr *Val; diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp index a6b5d4252f2f..128ce5c4fec0 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp +++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp @@ -112,9 +112,18 @@ bool WebAssemblyAsmTypeCheck::getLocal(SMLoc ErrorLoc, const MCInst &Inst, return false; } -bool WebAssemblyAsmTypeCheck::checkEnd(SMLoc ErrorLoc) { +bool WebAssemblyAsmTypeCheck::checkEnd(SMLoc ErrorLoc, bool PopVals) { if (LastSig.Returns.size() > Stack.size()) return typeError(ErrorLoc, "end: insufficient values on the type stack"); + + if (PopVals) { + for (auto VT : llvm::reverse(LastSig.Returns)) { + if (popType(ErrorLoc, VT)) + return true; + } + return false; + } + for (size_t i = 0; i < LastSig.Returns.size(); i++) { auto EVT = LastSig.Returns[i]; auto PVT = Stack[Stack.size() - LastSig.Returns.size() + i]; @@ -221,7 
+230,7 @@ bool WebAssemblyAsmTypeCheck::typeCheck(SMLoc ErrorLoc, const MCInst &Inst) { return true; } else if (Name == "end_block" || Name == "end_loop" || Name == "end_if" || Name == "else" || Name == "end_try") { - if (checkEnd(ErrorLoc)) + if (checkEnd(ErrorLoc, Name == "else")) return true; if (Name == "end_block") Unreachable = false; diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.h b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.h index aa35213ccca3..2b07faf67a18 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.h +++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.h @@ -39,7 +39,7 @@ class WebAssemblyAsmTypeCheck final { bool typeError(SMLoc ErrorLoc, const Twine &Msg); bool popType(SMLoc ErrorLoc, Optional<wasm::ValType> EVT); bool getLocal(SMLoc ErrorLoc, const MCInst &Inst, wasm::ValType &Type); - bool checkEnd(SMLoc ErrorLoc); + bool checkEnd(SMLoc ErrorLoc, bool PopVals = false); bool checkSig(SMLoc ErrorLoc, const wasm::WasmSignature &Sig); bool getSymRef(SMLoc ErrorLoc, const MCInst &Inst, const MCSymbolRefExpr *&SymRef); diff --git a/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp b/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp index 2e1e4f061219..5d38145559da 100644 --- a/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp +++ b/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp @@ -241,28 +241,6 @@ MCDisassembler::DecodeStatus WebAssemblyDisassembler::getInstruction( } break; } - // heap_type operands, for e.g. ref.null: - case WebAssembly::OPERAND_HEAPTYPE: { - int64_t Val; - uint64_t PrevSize = Size; - if (!nextLEB(Val, Bytes, Size, true)) - return MCDisassembler::Fail; - if (Val < 0 && Size == PrevSize + 1) { - // The HeapType encoding is like BlockType, in that encodings that - // decode as negative values indicate ValTypes. 
In practice we expect - // either wasm::ValType::EXTERNREF or wasm::ValType::FUNCREF here. - // - // The positive SLEB values are reserved for future expansion and are - // expected to be type indices in the typed function references - // proposal, and should disassemble as MCSymbolRefExpr as in BlockType - // above. - MI.addOperand(MCOperand::createImm(Val & 0x7f)); - } else { - MI.addOperand( - MCOperand::createImm(int64_t(WebAssembly::HeapType::Invalid))); - } - break; - } // FP operands. case WebAssembly::OPERAND_F32IMM: { if (!parseImmediate<float>(MI, Size, Bytes)) diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp index 2967aaa00ad4..d72bfdbbfb99 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp @@ -366,26 +366,3 @@ void WebAssemblyInstPrinter::printWebAssemblySignatureOperand(const MCInst *MI, } } } - -void WebAssemblyInstPrinter::printWebAssemblyHeapTypeOperand(const MCInst *MI, - unsigned OpNo, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.isImm()) { - switch (Op.getImm()) { - case long(wasm::ValType::EXTERNREF): - O << "extern"; - break; - case long(wasm::ValType::FUNCREF): - O << "func"; - break; - default: - O << "unsupported_heap_type_value"; - break; - } - } else { - // Typed function references and other subtypes of funcref and externref - // currently unimplemented. 
- O << "unsupported_heap_type_operand"; - } -} diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h index 7d980c78c3c9..fe104cbca12e 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h @@ -47,8 +47,6 @@ public: raw_ostream &O); void printWebAssemblySignatureOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printWebAssemblyHeapTypeOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O); // Autogenerated by tblgen. std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override; diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp index c3d259e6ff20..d8122950e061 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "WebAssemblyMCAsmInfo.h" +#include "Utils/WebAssemblyUtilities.h" #include "llvm/ADT/Triple.h" using namespace llvm; @@ -44,5 +45,13 @@ WebAssemblyMCAsmInfo::WebAssemblyMCAsmInfo(const Triple &T, SupportsDebugInformation = true; + // When compilation is done on a cpp file by clang, the exception model info + // is stored in LangOptions, which is later used to set the info in + // TargetOptions and then MCAsmInfo in LLVMTargetMachine::initAsmInfo(). But + // this process does not happen when compiling bitcode directly with clang, so + // we make sure this info is set correctly. + if (WebAssembly::WasmEnableEH || WebAssembly::WasmEnableSjLj) + ExceptionsType = ExceptionHandling::Wasm; + // TODO: UseIntegratedAssembler? 
} diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp index 4961c2ef9529..6e494b9430f7 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp @@ -106,9 +106,6 @@ void WebAssemblyMCCodeEmitter::encodeInstruction( encodeSLEB128(int64_t(MO.getImm()), OS); break; case WebAssembly::OPERAND_SIGNATURE: - case WebAssembly::OPERAND_HEAPTYPE: - OS << uint8_t(MO.getImm()); - break; case WebAssembly::OPERAND_VEC_I8IMM: support::endian::write<uint8_t>(OS, MO.getImm(), support::little); break; diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h index d07bfce9abc1..b2f10ca93a4f 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h @@ -78,8 +78,6 @@ enum OperandType { OPERAND_BRLIST, /// 32-bit unsigned table number. OPERAND_TABLE, - /// heap type immediate for ref.null. 
- OPERAND_HEAPTYPE, }; } // end namespace WebAssembly diff --git a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp index 6f81431bba2d..0412e524f800 100644 --- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp +++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp @@ -41,13 +41,6 @@ Optional<wasm::ValType> WebAssembly::parseType(StringRef Type) { return Optional<wasm::ValType>(); } -WebAssembly::HeapType WebAssembly::parseHeapType(StringRef Type) { - return StringSwitch<WebAssembly::HeapType>(Type) - .Case("extern", WebAssembly::HeapType::Externref) - .Case("func", WebAssembly::HeapType::Funcref) - .Default(WebAssembly::HeapType::Invalid); -} - WebAssembly::BlockType WebAssembly::parseBlockType(StringRef Type) { // Multivalue block types are handled separately in parseSignature return StringSwitch<WebAssembly::BlockType>(Type) diff --git a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h index 8d757df27b34..042d51c7d6cb 100644 --- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h +++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h @@ -41,17 +41,9 @@ enum class BlockType : unsigned { Multivalue = 0xffff, }; -/// Used as immediate MachineOperands for heap types, e.g. for ref.null. 
-enum class HeapType : unsigned { - Invalid = 0x00, - Externref = unsigned(wasm::ValType::EXTERNREF), - Funcref = unsigned(wasm::ValType::FUNCREF), -}; - // Convert StringRef to ValType / HealType / BlockType Optional<wasm::ValType> parseType(StringRef Type); -HeapType parseHeapType(StringRef Type); BlockType parseBlockType(StringRef Type); MVT parseMVT(StringRef Type); diff --git a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.cpp b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.cpp index 3da80f4fc875..b87c884c9e4a 100644 --- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.cpp +++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.cpp @@ -18,6 +18,31 @@ #include "llvm/MC/MCContext.h" using namespace llvm; +// Exception handling & setjmp-longjmp handling related options. These are +// defined here to be shared between WebAssembly and its subdirectories. + +// Emscripten's asm.js-style exception handling +cl::opt<bool> WebAssembly::WasmEnableEmEH( + "enable-emscripten-cxx-exceptions", + cl::desc("WebAssembly Emscripten-style exception handling"), + cl::init(false)); +// Emscripten's asm.js-style setjmp/longjmp handling +cl::opt<bool> WebAssembly::WasmEnableEmSjLj( + "enable-emscripten-sjlj", + cl::desc("WebAssembly Emscripten-style setjmp/longjmp handling"), + cl::init(false)); +// Exception handling using wasm EH instructions +cl::opt<bool> + WebAssembly::WasmEnableEH("wasm-enable-eh", + cl::desc("WebAssembly exception handling"), + cl::init(false)); +// setjmp/longjmp handling using wasm EH instrutions +cl::opt<bool> + WebAssembly::WasmEnableSjLj("wasm-enable-sjlj", + cl::desc("WebAssembly setjmp/longjmp handling"), + cl::init(false)); + +// Function names in libc++abi and libunwind const char *const WebAssembly::CxaBeginCatchFn = "__cxa_begin_catch"; const char *const WebAssembly::CxaRethrowFn = "__cxa_rethrow"; const char *const WebAssembly::StdTerminateFn = "_ZSt9terminatev"; diff --git 
a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h index f6e96d9b2877..d024185defb4 100644 --- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h +++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h @@ -16,6 +16,7 @@ #define LLVM_LIB_TARGET_WEBASSEMBLY_UTILS_WEBASSEMBLYUTILITIES_H #include "llvm/IR/DerivedTypes.h" +#include "llvm/Support/CommandLine.h" namespace llvm { @@ -70,6 +71,12 @@ inline bool isRefType(const Type *Ty) { bool isChild(const MachineInstr &MI, const WebAssemblyFunctionInfo &MFI); bool mayThrow(const MachineInstr &MI); +// Exception handling / setjmp-longjmp handling command-line options +extern cl::opt<bool> WasmEnableEmEH; // asm.js-style EH +extern cl::opt<bool> WasmEnableEmSjLj; // asm.js-style SjLJ +extern cl::opt<bool> WasmEnableEH; // EH using Wasm EH instructions +extern cl::opt<bool> WasmEnableSjLj; // SjLj using Wasm EH instructions + // Exception-related function names extern const char *const ClangCallTerminateFn; extern const char *const CxaBeginCatchFn; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp index 0d3f51693261..e3af6b2662ef 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp @@ -51,8 +51,6 @@ using namespace llvm; #define DEBUG_TYPE "asm-printer" extern cl::opt<bool> WasmKeepRegisters; -extern cl::opt<bool> WasmEnableEmEH; -extern cl::opt<bool> WasmEnableEmSjLj; //===----------------------------------------------------------------------===// // Helpers. 
@@ -196,6 +194,13 @@ void WebAssemblyAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { Sym->setGlobalType(wasm::WasmGlobalType{uint8_t(Type), Mutable}); } + // If the GlobalVariable refers to a table, we handle it here instead of + // in emitExternalDecls + if (Sym->isTable()) { + getTargetStreamer()->emitTableType(Sym); + return; + } + emitVisibility(Sym, GV->getVisibility(), !GV->isDeclaration()); if (GV->hasInitializer()) { assert(getSymbolPreferLocal(*GV) == Sym); @@ -315,8 +320,9 @@ void WebAssemblyAsmPrinter::emitExternalDecls(const Module &M) { // will discard it later if it turns out not to be necessary. auto Signature = signatureFromMVTs(Results, Params); bool InvokeDetected = false; - auto *Sym = getMCSymbolForFunction(&F, WasmEnableEmEH || WasmEnableEmSjLj, - Signature.get(), InvokeDetected); + auto *Sym = getMCSymbolForFunction( + &F, WebAssembly::WasmEnableEmEH || WebAssembly::WasmEnableEmSjLj, + Signature.get(), InvokeDetected); // Multiple functions can be mapped to the same invoke symbol. 
For // example, two IR functions '__invoke_void_i8*' and '__invoke_void_i32' diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp index 7832f199a2cc..17e867e4c7d8 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp @@ -1741,7 +1741,7 @@ void WebAssemblyCFGStackify::rewriteDepthImmediates(MachineFunction &MF) { void WebAssemblyCFGStackify::cleanupFunctionData(MachineFunction &MF) { if (FakeCallerBB) - MF.DeleteMachineBasicBlock(FakeCallerBB); + MF.deleteMachineBasicBlock(FakeCallerBB); AppendixBB = FakeCallerBB = nullptr; } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def index 1fa0ea3867c7..a3a33f4a5b3a 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def @@ -31,6 +31,7 @@ HANDLE_NODETYPE(SWIZZLE) HANDLE_NODETYPE(VEC_SHL) HANDLE_NODETYPE(VEC_SHR_S) HANDLE_NODETYPE(VEC_SHR_U) +HANDLE_NODETYPE(NARROW_U) HANDLE_NODETYPE(EXTEND_LOW_S) HANDLE_NODETYPE(EXTEND_LOW_U) HANDLE_NODETYPE(EXTEND_HIGH_S) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 0df8f3e0e09c..38ed4c73fb93 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -176,6 +176,8 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( setTargetDAGCombine(ISD::FP_ROUND); setTargetDAGCombine(ISD::CONCAT_VECTORS); + setTargetDAGCombine(ISD::TRUNCATE); + // Support saturating add for i8x16 and i16x8 for (auto Op : {ISD::SADDSAT, ISD::UADDSAT}) for (auto T : {MVT::v16i8, MVT::v8i16}) @@ -644,8 +646,7 @@ LowerCallResults(MachineInstr &CallResults, DebugLoc DL, MachineBasicBlock *BB, Register RegFuncref = MF.getRegInfo().createVirtualRegister(&WebAssembly::FUNCREFRegClass); MachineInstr 
*RefNull = - BuildMI(MF, DL, TII.get(WebAssembly::REF_NULL_FUNCREF), RegFuncref) - .addImm(static_cast<int32_t>(WebAssembly::HeapType::Funcref)); + BuildMI(MF, DL, TII.get(WebAssembly::REF_NULL_FUNCREF), RegFuncref); BB->insertAfter(Const0->getIterator(), RefNull); MachineInstr *TableSet = @@ -2610,6 +2611,114 @@ performVectorTruncZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { return DAG.getNode(Op, SDLoc(N), ResVT, Source); } +// Helper to extract VectorWidth bits from Vec, starting from IdxVal. +static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, + const SDLoc &DL, unsigned VectorWidth) { + EVT VT = Vec.getValueType(); + EVT ElVT = VT.getVectorElementType(); + unsigned Factor = VT.getSizeInBits() / VectorWidth; + EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, + VT.getVectorNumElements() / Factor); + + // Extract the relevant VectorWidth bits. Generate an EXTRACT_SUBVECTOR + unsigned ElemsPerChunk = VectorWidth / ElVT.getSizeInBits(); + assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); + + // This is the index of the first element of the VectorWidth-bit chunk + // we want. Since ElemsPerChunk is a power of 2 just need to clear bits. + IdxVal &= ~(ElemsPerChunk - 1); + + // If the input is a buildvector just emit a smaller one. + if (Vec.getOpcode() == ISD::BUILD_VECTOR) + return DAG.getBuildVector(ResultVT, DL, + Vec->ops().slice(IdxVal, ElemsPerChunk)); + + SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, DL); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResultVT, Vec, VecIdx); +} + +// Helper to recursively truncate vector elements in half with NARROW_U. DstVT +// is the expected destination value type after recursion. In is the initial +// input. Note that the input should have enough leading zero bits to prevent +// NARROW_U from saturating results. 
+static SDValue truncateVectorWithNARROW(EVT DstVT, SDValue In, const SDLoc &DL, + SelectionDAG &DAG) { + EVT SrcVT = In.getValueType(); + + // No truncation required, we might get here due to recursive calls. + if (SrcVT == DstVT) + return In; + + unsigned SrcSizeInBits = SrcVT.getSizeInBits(); + unsigned NumElems = SrcVT.getVectorNumElements(); + if (!isPowerOf2_32(NumElems)) + return SDValue(); + assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation"); + assert(SrcSizeInBits > DstVT.getSizeInBits() && "Illegal truncation"); + + LLVMContext &Ctx = *DAG.getContext(); + EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2); + + // Narrow to the largest type possible: + // vXi64/vXi32 -> i16x8.narrow_i32x4_u and vXi16 -> i8x16.narrow_i16x8_u. + EVT InVT = MVT::i16, OutVT = MVT::i8; + if (SrcVT.getScalarSizeInBits() > 16) { + InVT = MVT::i32; + OutVT = MVT::i16; + } + unsigned SubSizeInBits = SrcSizeInBits / 2; + InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits()); + OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits()); + + // Split lower/upper subvectors. + SDValue Lo = extractSubVector(In, 0, DAG, DL, SubSizeInBits); + SDValue Hi = extractSubVector(In, NumElems / 2, DAG, DL, SubSizeInBits); + + // 256bit -> 128bit truncate - Narrow lower/upper 128-bit subvectors. + if (SrcVT.is256BitVector() && DstVT.is128BitVector()) { + Lo = DAG.getBitcast(InVT, Lo); + Hi = DAG.getBitcast(InVT, Hi); + SDValue Res = DAG.getNode(WebAssemblyISD::NARROW_U, DL, OutVT, Lo, Hi); + return DAG.getBitcast(DstVT, Res); + } + + // Recursively narrow lower/upper subvectors, concat result and narrow again. 
+ EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2); + Lo = truncateVectorWithNARROW(PackedVT, Lo, DL, DAG); + Hi = truncateVectorWithNARROW(PackedVT, Hi, DL, DAG); + + PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems); + SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi); + return truncateVectorWithNARROW(DstVT, Res, DL, DAG); +} + +static SDValue performTruncateCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + auto &DAG = DCI.DAG; + + SDValue In = N->getOperand(0); + EVT InVT = In.getValueType(); + if (!InVT.isSimple()) + return SDValue(); + + EVT OutVT = N->getValueType(0); + if (!OutVT.isVector()) + return SDValue(); + + EVT OutSVT = OutVT.getVectorElementType(); + EVT InSVT = InVT.getVectorElementType(); + // Currently only cover truncate to v16i8 or v8i16. + if (!((InSVT == MVT::i16 || InSVT == MVT::i32 || InSVT == MVT::i64) && + (OutSVT == MVT::i8 || OutSVT == MVT::i16) && OutVT.is128BitVector())) + return SDValue(); + + SDLoc DL(N); + APInt Mask = APInt::getLowBitsSet(InVT.getScalarSizeInBits(), + OutVT.getScalarSizeInBits()); + In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(Mask, DL, InVT)); + return truncateVectorWithNARROW(OutVT, In, DL, DAG); +} + SDValue WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { @@ -2626,5 +2735,7 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N, case ISD::FP_ROUND: case ISD::CONCAT_VECTORS: return performVectorTruncZeroCombine(N, DCI); + case ISD::TRUNCATE: + return performTruncateCombine(N, DCI); } } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td index ee9247a8bef9..3fb0af1d47a0 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td @@ -202,11 +202,6 @@ def Signature : Operand<i32> { let PrintMethod = "printWebAssemblySignatureOperand"; } -let OperandType = "OPERAND_HEAPTYPE" in -def 
HeapType : Operand<i32> { - let PrintMethod = "printWebAssemblyHeapTypeOperand"; -} - let OperandType = "OPERAND_TYPEINDEX" in def TypeIndex : Operand<i32>; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td index ef9bd35d004a..76a88caafc47 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td @@ -11,13 +11,14 @@ /// //===----------------------------------------------------------------------===// -multiclass REF_I<WebAssemblyRegClass rc, ValueType vt> { - defm REF_NULL_#rc : I<(outs rc:$res), (ins HeapType:$heaptype), - (outs), (ins HeapType:$heaptype), - [], - "ref.null\t$res, $heaptype", - "ref.null\t$heaptype", - 0xd0>, +multiclass REF_I<WebAssemblyRegClass rc, ValueType vt, string ht> { + defm REF_NULL_#rc : I<(outs rc:$dst), (ins), + (outs), (ins), + [(set rc:$dst, (!cast<Intrinsic>("int_wasm_ref_null_" # ht)))], + "ref.null_" # ht # "$dst", + "ref.null_" # ht, + !cond(!eq(ht, "func") : 0xd070, + !eq(ht, "extern") : 0xd06f)>, Requires<[HasReferenceTypes]>; defm SELECT_#rc: I<(outs rc:$dst), (ins rc:$lhs, rc:$rhs, I32:$cond), (outs), (ins), @@ -28,8 +29,8 @@ multiclass REF_I<WebAssemblyRegClass rc, ValueType vt> { Requires<[HasReferenceTypes]>; } -defm "" : REF_I<FUNCREF, funcref>; -defm "" : REF_I<EXTERNREF, externref>; +defm "" : REF_I<FUNCREF, funcref, "func">; +defm "" : REF_I<EXTERNREF, externref, "extern">; foreach rc = [FUNCREF, EXTERNREF] in { def : Pat<(select (i32 (setne I32:$cond, 0)), rc:$lhs, rc:$rhs), diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index 30b99c3a69a9..5bb12c7fbdc7 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -1278,6 +1278,14 @@ multiclass SIMDNarrow<Vec vec, bits<32> baseInst> { defm "" : SIMDNarrow<I16x8, 101>; defm "" : SIMDNarrow<I32x4, 133>; +// 
WebAssemblyISD::NARROW_U +def wasm_narrow_t : SDTypeProfile<1, 2, []>; +def wasm_narrow_u : SDNode<"WebAssemblyISD::NARROW_U", wasm_narrow_t>; +def : Pat<(v16i8 (wasm_narrow_u (v8i16 V128:$left), (v8i16 V128:$right))), + (NARROW_U_I8x16 $left, $right)>; +def : Pat<(v8i16 (wasm_narrow_u (v4i32 V128:$left), (v4i32 V128:$right))), + (NARROW_U_I16x8 $left, $right)>; + // Bitcasts are nops // Matching bitcast t1 to t1 causes strange errors, so avoid repeating types foreach t1 = AllVecs in diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrTable.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrTable.td index e44c2073eaeb..1fd00bf1cbc8 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrTable.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrTable.td @@ -20,7 +20,7 @@ def WebAssemblyTableGet : SDNode<"WebAssemblyISD::TABLE_GET", WebAssemblyTableGe [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; -multiclass TABLE<WebAssemblyRegClass rc> { +multiclass TABLE<WebAssemblyRegClass rc, string suffix> { let mayLoad = 1 in defm TABLE_GET_#rc : I<(outs rc:$res), (ins table32_op:$table, I32:$i), (outs), (ins table32_op:$table), @@ -39,14 +39,14 @@ multiclass TABLE<WebAssemblyRegClass rc> { defm TABLE_GROW_#rc : I<(outs I32:$sz), (ins table32_op:$table, rc:$val, I32:$n), (outs), (ins table32_op:$table), - [], + [(set I32:$sz, (!cast<Intrinsic>("int_wasm_table_grow_" # suffix) (WebAssemblyWrapper tglobaladdr:$table), rc:$val, I32:$n))], "table.grow\t$sz, $table, $val, $n", "table.grow\t$table", 0xfc0f>; defm TABLE_FILL_#rc : I<(outs), (ins table32_op:$table, I32:$i, rc:$val, I32:$n), (outs), (ins table32_op:$table), - [], + [(!cast<Intrinsic>("int_wasm_table_fill_" # suffix) (WebAssemblyWrapper tglobaladdr:$table), I32:$i, rc:$val, I32:$n)], "table.fill\t$table, $i, $val, $n", "table.fill\t$table", 0xfc11>; @@ -62,8 +62,8 @@ multiclass TABLE<WebAssemblyRegClass rc> { } } -defm "" : TABLE<FUNCREF>, Requires<[HasReferenceTypes]>; -defm "" : TABLE<EXTERNREF>, 
Requires<[HasReferenceTypes]>; +defm "" : TABLE<FUNCREF, "funcref">, Requires<[HasReferenceTypes]>; +defm "" : TABLE<EXTERNREF, "externref">, Requires<[HasReferenceTypes]>; def : Pat<(WebAssemblyTableSet mcsym:$table, i32:$idx, funcref:$r), (TABLE_SET_FUNCREF mcsym:$table, i32:$idx, funcref:$r)>, @@ -71,7 +71,7 @@ def : Pat<(WebAssemblyTableSet mcsym:$table, i32:$idx, funcref:$r), defm TABLE_SIZE : I<(outs I32:$sz), (ins table32_op:$table), (outs), (ins table32_op:$table), - [], + [(set I32:$sz, (int_wasm_table_size (WebAssemblyWrapper tglobaladdr:$table)))], "table.size\t$sz, $table", "table.size\t$table", 0xfc10>, @@ -80,7 +80,9 @@ defm TABLE_SIZE : I<(outs I32:$sz), (ins table32_op:$table), defm TABLE_COPY : I<(outs), (ins table32_op:$table1, table32_op:$table2, I32:$d, I32:$s, I32:$n), (outs), (ins table32_op:$table1, table32_op:$table2), - [], + [(int_wasm_table_copy (WebAssemblyWrapper tglobaladdr:$table1), + (WebAssemblyWrapper tglobaladdr:$table2), + I32:$d, I32:$s, I32:$n)], "table.copy\t$table1, $table2, $d, $s, $n", "table.copy\t$table1, $table2", 0xfc0e>, diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp index 4eacc921b6cd..23aaa5160abd 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp @@ -267,6 +267,7 @@ /// ///===----------------------------------------------------------------------===// +#include "Utils/WebAssemblyUtilities.h" #include "WebAssembly.h" #include "WebAssemblyTargetMachine.h" #include "llvm/ADT/StringExtras.h" @@ -285,13 +286,6 @@ using namespace llvm; #define DEBUG_TYPE "wasm-lower-em-ehsjlj" -// Emscripten's asm.js-style exception handling -extern cl::opt<bool> WasmEnableEmEH; -// Emscripten's asm.js-style setjmp/longjmp handling -extern cl::opt<bool> WasmEnableEmSjLj; -// Wasm setjmp/longjmp handling using wasm EH instructions -extern 
cl::opt<bool> WasmEnableSjLj; - static cl::list<std::string> EHAllowlist("emscripten-cxx-exceptions-allowed", cl::desc("The list of function names in which Emscripten-style " @@ -370,8 +364,9 @@ public: static char ID; WebAssemblyLowerEmscriptenEHSjLj() - : ModulePass(ID), EnableEmEH(WasmEnableEmEH), - EnableEmSjLj(WasmEnableEmSjLj), EnableWasmSjLj(WasmEnableSjLj) { + : ModulePass(ID), EnableEmEH(WebAssembly::WasmEnableEmEH), + EnableEmSjLj(WebAssembly::WasmEnableEmSjLj), + EnableWasmSjLj(WebAssembly::WasmEnableSjLj) { assert(!(EnableEmSjLj && EnableWasmSjLj) && "Two SjLj modes cannot be turned on at the same time"); assert(!(EnableEmEH && EnableWasmSjLj) && diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp index 0b953a90aeab..09bccef17ab0 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp @@ -40,9 +40,6 @@ cl::opt<bool> " instruction output for test purposes only."), cl::init(false)); -extern cl::opt<bool> WasmEnableEmEH; -extern cl::opt<bool> WasmEnableEmSjLj; - static void removeRegisterOperands(const MachineInstr *MI, MCInst &OutMI); MCSymbol * @@ -66,9 +63,11 @@ WebAssemblyMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const { // they reach this point as aggregate Array types with an element type // that is a reference type. 
wasm::ValType Type; + bool IsTable = false; if (GlobalVT->isArrayTy() && WebAssembly::isRefType(GlobalVT->getArrayElementType())) { MVT VT; + IsTable = true; switch (GlobalVT->getArrayElementType()->getPointerAddressSpace()) { case WebAssembly::WasmAddressSpace::WASM_ADDRESS_SPACE_FUNCREF: VT = MVT::funcref; @@ -85,9 +84,14 @@ WebAssemblyMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const { } else report_fatal_error("Aggregate globals not yet implemented"); - WasmSym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL); - WasmSym->setGlobalType( - wasm::WasmGlobalType{uint8_t(Type), /*Mutable=*/true}); + if (IsTable) { + WasmSym->setType(wasm::WASM_SYMBOL_TYPE_TABLE); + WasmSym->setTableType(Type); + } else { + WasmSym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL); + WasmSym->setGlobalType( + wasm::WasmGlobalType{uint8_t(Type), /*Mutable=*/true}); + } } return WasmSym; } @@ -105,7 +109,8 @@ WebAssemblyMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const { bool InvokeDetected = false; auto *WasmSym = Printer.getMCSymbolForFunction( - F, WasmEnableEmEH || WasmEnableEmSjLj, Signature.get(), InvokeDetected); + F, WebAssembly::WasmEnableEmEH || WebAssembly::WasmEnableEmSjLj, + Signature.get(), InvokeDetected); WasmSym->setSignature(Signature.get()); Printer.addSignature(std::move(Signature)); WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION); @@ -275,11 +280,6 @@ void WebAssemblyMCInstLower::lower(const MachineInstr *MI, SmallVector<wasm::ValType, 4>()); break; } - } else if (Info.OperandType == WebAssembly::OPERAND_HEAPTYPE) { - assert(static_cast<WebAssembly::HeapType>(MO.getImm()) != - WebAssembly::HeapType::Invalid); - // With typed function references, this will need a case for type - // index operands. Otherwise, fall through. 
} } MCOp = MCOperand::createImm(MO.getImm()); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index 7b70d99b5f52..482837178f3d 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -14,6 +14,7 @@ #include "WebAssemblyTargetMachine.h" #include "MCTargetDesc/WebAssemblyMCTargetDesc.h" #include "TargetInfo/WebAssemblyTargetInfo.h" +#include "Utils/WebAssemblyUtilities.h" #include "WebAssembly.h" #include "WebAssemblyMachineFunctionInfo.h" #include "WebAssemblyTargetObjectFile.h" @@ -24,6 +25,7 @@ #include "llvm/CodeGen/RegAllocRegistry.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Function.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/Scalar.h" @@ -33,28 +35,6 @@ using namespace llvm; #define DEBUG_TYPE "wasm" -// Emscripten's asm.js-style exception handling -cl::opt<bool> - WasmEnableEmEH("enable-emscripten-cxx-exceptions", - cl::desc("WebAssembly Emscripten-style exception handling"), - cl::init(false)); - -// Emscripten's asm.js-style setjmp/longjmp handling -cl::opt<bool> WasmEnableEmSjLj( - "enable-emscripten-sjlj", - cl::desc("WebAssembly Emscripten-style setjmp/longjmp handling"), - cl::init(false)); - -// Exception handling using wasm EH instructions -cl::opt<bool> WasmEnableEH("wasm-enable-eh", - cl::desc("WebAssembly exception handling"), - cl::init(false)); - -// setjmp/longjmp handling using wasm EH instrutions -cl::opt<bool> WasmEnableSjLj("wasm-enable-sjlj", - cl::desc("WebAssembly setjmp/longjmp handling"), - cl::init(false)); - // A command-line option to keep implicit locals // for the purpose of testing with lit/llc ONLY. 
// This produces output which is not valid WebAssembly, and is not supported @@ -368,7 +348,23 @@ FunctionPass *WebAssemblyPassConfig::createTargetRegisterAllocator(bool) { return nullptr; // No reg alloc } -static void basicCheckForEHAndSjLj(const TargetMachine *TM) { +using WebAssembly::WasmEnableEH; +using WebAssembly::WasmEnableEmEH; +using WebAssembly::WasmEnableEmSjLj; +using WebAssembly::WasmEnableSjLj; + +static void basicCheckForEHAndSjLj(TargetMachine *TM) { + // Before checking, we make sure TargetOptions.ExceptionModel is the same as + // MCAsmInfo.ExceptionsType. Normally these have to be the same, because clang + // stores the exception model info in LangOptions, which is later transferred + // to TargetOptions and MCAsmInfo. But when clang compiles bitcode directly, + // clang's LangOptions is not used and thus the exception model info is not + // correctly transferred to TargetOptions and MCAsmInfo, so we make sure we + // have the correct exception model in in WebAssemblyMCAsmInfo constructor. + // But in this case TargetOptions is still not updated, so we make sure they + // are the same. + TM->Options.ExceptionModel = TM->getMCAsmInfo()->getExceptionHandlingType(); + // Basic Correctness checking related to -exception-model if (TM->Options.ExceptionModel != ExceptionHandling::None && TM->Options.ExceptionModel != ExceptionHandling::Wasm) diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index 8ce6b47d10e8..2ba0b97229cc 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -1759,7 +1759,8 @@ bool X86AsmParser::CreateMemForMSInlineAsm( // registers in a mmory expression, and though unaccessible via rip/eip. 
if (IsGlobalLV && (BaseReg || IndexReg)) { Operands.push_back(X86Operand::CreateMem(getPointerWidth(), Disp, Start, - End, Size, Identifier, Decl)); + End, Size, Identifier, Decl, + FrontendSize)); return false; } // Otherwise, we set the base register to a non-zero value @@ -2551,8 +2552,6 @@ bool X86AsmParser::ParseIntelOperand(OperandVector &Operands) { StringRef ErrMsg; unsigned BaseReg = SM.getBaseReg(); unsigned IndexReg = SM.getIndexReg(); - if (IndexReg && BaseReg == X86::RIP) - BaseReg = 0; unsigned Scale = SM.getScale(); if (!PtrInOperand) Size = SM.getElementSize() << 3; @@ -4430,8 +4429,7 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, // If exactly one matched, then we treat that as a successful match (and the // instruction will already have been filled in correctly, since the failing // matches won't have modified it). - unsigned NumSuccessfulMatches = - std::count(std::begin(Match), std::end(Match), Match_Success); + unsigned NumSuccessfulMatches = llvm::count(Match, Match_Success); if (NumSuccessfulMatches == 1) { if (!MatchingInlineAsm && validateInstruction(Inst, Operands)) return true; @@ -4479,7 +4477,7 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, // If all of the instructions reported an invalid mnemonic, then the original // mnemonic was invalid. - if (std::count(std::begin(Match), std::end(Match), Match_MnemonicFail) == 4) { + if (llvm::count(Match, Match_MnemonicFail) == 4) { if (OriginalError == Match_MnemonicFail) return Error(IDLoc, "invalid instruction mnemonic '" + Base + "'", Op.getLocRange(), MatchingInlineAsm); @@ -4508,16 +4506,14 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, } // If one instruction matched as unsupported, report this as unsupported. 
- if (std::count(std::begin(Match), std::end(Match), - Match_Unsupported) == 1) { + if (llvm::count(Match, Match_Unsupported) == 1) { return Error(IDLoc, "unsupported instruction", EmptyRange, MatchingInlineAsm); } // If one instruction matched with a missing feature, report this as a // missing feature. - if (std::count(std::begin(Match), std::end(Match), - Match_MissingFeature) == 1) { + if (llvm::count(Match, Match_MissingFeature) == 1) { ErrorInfo = Match_MissingFeature; return ErrorMissingFeature(IDLoc, ErrorInfoMissingFeatures, MatchingInlineAsm); @@ -4525,8 +4521,7 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, // If one instruction matched with an invalid operand, report this as an // operand failure. - if (std::count(std::begin(Match), std::end(Match), - Match_InvalidOperand) == 1) { + if (llvm::count(Match, Match_InvalidOperand) == 1) { return Error(IDLoc, "invalid operand for instruction", EmptyRange, MatchingInlineAsm); } @@ -4674,8 +4669,7 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, Op.getLocRange(), MatchingInlineAsm); } - unsigned NumSuccessfulMatches = - std::count(std::begin(Match), std::end(Match), Match_Success); + unsigned NumSuccessfulMatches = llvm::count(Match, Match_Success); // If matching was ambiguous and we had size information from the frontend, // try again with that. This handles cases like "movxz eax, m8/m16". @@ -4721,16 +4715,14 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, } // If one instruction matched as unsupported, report this as unsupported. - if (std::count(std::begin(Match), std::end(Match), - Match_Unsupported) == 1) { + if (llvm::count(Match, Match_Unsupported) == 1) { return Error(IDLoc, "unsupported instruction", EmptyRange, MatchingInlineAsm); } // If one instruction matched with a missing feature, report this as a // missing feature. 
- if (std::count(std::begin(Match), std::end(Match), - Match_MissingFeature) == 1) { + if (llvm::count(Match, Match_MissingFeature) == 1) { ErrorInfo = Match_MissingFeature; return ErrorMissingFeature(IDLoc, ErrorInfoMissingFeatures, MatchingInlineAsm); @@ -4738,14 +4730,12 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, // If one instruction matched with an invalid operand, report this as an // operand failure. - if (std::count(std::begin(Match), std::end(Match), - Match_InvalidOperand) == 1) { + if (llvm::count(Match, Match_InvalidOperand) == 1) { return Error(IDLoc, "invalid operand for instruction", EmptyRange, MatchingInlineAsm); } - if (std::count(std::begin(Match), std::end(Match), - Match_InvalidImmUnsignedi4) == 1) { + if (llvm::count(Match, Match_InvalidImmUnsignedi4) == 1) { SMLoc ErrorLoc = ((X86Operand &)*Operands[ErrorInfo]).getStartLoc(); if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; diff --git a/llvm/lib/Target/X86/AsmParser/X86Operand.h b/llvm/lib/Target/X86/AsmParser/X86Operand.h index 9164c699b569..67b1244708a8 100644 --- a/llvm/lib/Target/X86/AsmParser/X86Operand.h +++ b/llvm/lib/Target/X86/AsmParser/X86Operand.h @@ -285,6 +285,12 @@ struct X86Operand final : public MCParsedAsmOperand { bool isOffsetOfLocal() const override { return isImm() && Imm.LocalRef; } + bool isMemPlaceholder(const MCInstrDesc &Desc) const override { + // Only MS InlineAsm uses global variables with registers rather than + // rip/eip. 
+ return isMem() && !Mem.DefaultBaseReg && Mem.FrontendSize; + } + bool needAddressOf() const override { return AddressOf; } bool isMem() const override { return Kind == Memory; } diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp index b51011e2c52f..a903c5f455a2 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp @@ -948,39 +948,39 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, break; CASE_UNPCK(PUNPCKHBW, r) - case X86::MMX_PUNPCKHBWirr: + case X86::MMX_PUNPCKHBWrr: Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); RegForm = true; LLVM_FALLTHROUGH; CASE_UNPCK(PUNPCKHBW, m) - case X86::MMX_PUNPCKHBWirm: + case X86::MMX_PUNPCKHBWrm: Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); DecodeUNPCKHMask(getRegOperandNumElts(MI, 8, 0), 8, ShuffleMask); break; CASE_UNPCK(PUNPCKHWD, r) - case X86::MMX_PUNPCKHWDirr: + case X86::MMX_PUNPCKHWDrr: Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); RegForm = true; LLVM_FALLTHROUGH; CASE_UNPCK(PUNPCKHWD, m) - case X86::MMX_PUNPCKHWDirm: + case X86::MMX_PUNPCKHWDrm: Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); DecodeUNPCKHMask(getRegOperandNumElts(MI, 16, 0), 16, ShuffleMask); break; CASE_UNPCK(PUNPCKHDQ, r) - case X86::MMX_PUNPCKHDQirr: + case X86::MMX_PUNPCKHDQrr: Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); RegForm = true; LLVM_FALLTHROUGH; CASE_UNPCK(PUNPCKHDQ, m) - case X86::MMX_PUNPCKHDQirm: + case X86::MMX_PUNPCKHDQrm: Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); DecodeUNPCKHMask(getRegOperandNumElts(MI, 32, 0), 32, ShuffleMask); @@ -998,39 +998,39 @@ bool 
llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, break; CASE_UNPCK(PUNPCKLBW, r) - case X86::MMX_PUNPCKLBWirr: + case X86::MMX_PUNPCKLBWrr: Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); RegForm = true; LLVM_FALLTHROUGH; CASE_UNPCK(PUNPCKLBW, m) - case X86::MMX_PUNPCKLBWirm: + case X86::MMX_PUNPCKLBWrm: Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); DecodeUNPCKLMask(getRegOperandNumElts(MI, 8, 0), 8, ShuffleMask); break; CASE_UNPCK(PUNPCKLWD, r) - case X86::MMX_PUNPCKLWDirr: + case X86::MMX_PUNPCKLWDrr: Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); RegForm = true; LLVM_FALLTHROUGH; CASE_UNPCK(PUNPCKLWD, m) - case X86::MMX_PUNPCKLWDirm: + case X86::MMX_PUNPCKLWDrm: Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); DecodeUNPCKLMask(getRegOperandNumElts(MI, 16, 0), 16, ShuffleMask); break; CASE_UNPCK(PUNPCKLDQ, r) - case X86::MMX_PUNPCKLDQirr: + case X86::MMX_PUNPCKLDQrr: Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); RegForm = true; LLVM_FALLTHROUGH; CASE_UNPCK(PUNPCKLDQ, m) - case X86::MMX_PUNPCKLDQirm: + case X86::MMX_PUNPCKLDQrm: Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); DecodeUNPCKLMask(getRegOperandNumElts(MI, 32, 0), 32, ShuffleMask); diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp index 11251fb2b2ba..bf3f4e990ecc 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp @@ -236,7 +236,7 @@ bool X86WinCOFFTargetStreamer::emitFPOStackAlloc(unsigned StackAlloc, SMLoc L) { bool X86WinCOFFTargetStreamer::emitFPOStackAlign(unsigned Align, SMLoc L) { if (checkInFPOPrologue(L)) return true; - 
if (!llvm::any_of(CurFPOData->Instructions, [](const FPOInstruction &Inst) { + if (llvm::none_of(CurFPOData->Instructions, [](const FPOInstruction &Inst) { return Inst.Op == FPOInstruction::SetFrame; })) { getContext().reportError( diff --git a/llvm/lib/Target/X86/X86AsmPrinter.cpp b/llvm/lib/Target/X86/X86AsmPrinter.cpp index 2e08482e4ff6..d48b8e458219 100644 --- a/llvm/lib/Target/X86/X86AsmPrinter.cpp +++ b/llvm/lib/Target/X86/X86AsmPrinter.cpp @@ -754,8 +754,6 @@ static void emitNonLazyStubs(MachineModuleInfo *MMI, MCStreamer &OutStreamer) { void X86AsmPrinter::emitEndOfAsmFile(Module &M) { const Triple &TT = TM.getTargetTriple(); - emitAsanMemaccessSymbols(M); - if (TT.isOSBinFormatMachO()) { // Mach-O uses non-lazy symbol stubs to encode per-TU information into // global table for symbol lookup. diff --git a/llvm/lib/Target/X86/X86AsmPrinter.h b/llvm/lib/Target/X86/X86AsmPrinter.h index 3b0983a7d935..b22f25af26cf 100644 --- a/llvm/lib/Target/X86/X86AsmPrinter.h +++ b/llvm/lib/Target/X86/X86AsmPrinter.h @@ -31,6 +31,7 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { FaultMaps FM; std::unique_ptr<MCCodeEmitter> CodeEmitter; bool EmitFPOData = false; + bool ShouldEmitWeakSwiftAsyncExtendedFramePointerFlags = false; // This utility class tracks the length of a stackmap instruction's 'shadow'. // It is used by the X86AsmPrinter to ensure that the stackmap shadow @@ -100,20 +101,6 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { // Address sanitizer specific lowering for X86. 
void LowerASAN_CHECK_MEMACCESS(const MachineInstr &MI); - void emitAsanMemaccessSymbols(Module &M); - void emitAsanMemaccessPartial(Module &M, unsigned Reg, - const ASanAccessInfo &AccessInfo, - MCSubtargetInfo &STI); - void emitAsanMemaccessFull(Module &M, unsigned Reg, - const ASanAccessInfo &AccessInfo, - MCSubtargetInfo &STI); - void emitAsanReportError(Module &M, unsigned Reg, - const ASanAccessInfo &AccessInfo, - MCSubtargetInfo &STI); - - typedef std::tuple<unsigned /*Reg*/, uint32_t /*AccessInfo*/> - AsanMemaccessTuple; - std::map<AsanMemaccessTuple, MCSymbol *> AsanMemaccessSymbols; // Choose between emitting .seh_ directives and .cv_fpo_ directives. void EmitSEHInstruction(const MachineInstr *MI); @@ -165,6 +152,10 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; void emitFunctionBodyStart() override; void emitFunctionBodyEnd() override; + + bool shouldEmitWeakSwiftAsyncExtendedFramePointerFlags() const override { + return ShouldEmitWeakSwiftAsyncExtendedFramePointerFlags; + } }; } // end namespace llvm diff --git a/llvm/lib/Target/X86/X86CmovConversion.cpp b/llvm/lib/Target/X86/X86CmovConversion.cpp index 863438793acf..96d3d1390a59 100644 --- a/llvm/lib/Target/X86/X86CmovConversion.cpp +++ b/llvm/lib/Target/X86/X86CmovConversion.cpp @@ -186,7 +186,7 @@ bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF) { if (collectCmovCandidates(Blocks, AllCmovGroups, /*IncludeLoads*/ true)) { for (auto &Group : AllCmovGroups) { // Skip any group that doesn't do at least one memory operand cmov. 
- if (!llvm::any_of(Group, [&](MachineInstr *I) { return I->mayLoad(); })) + if (llvm::none_of(Group, [&](MachineInstr *I) { return I->mayLoad(); })) continue; // For CMOV groups which we can rewrite and which contain a memory load, diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp index 93bc23006dc4..6a047838f0b5 100644 --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -191,8 +191,6 @@ void X86ExpandPseudo::expandCALL_RVMARKER(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) { // Expand CALL_RVMARKER pseudo to call instruction, followed by the special //"movq %rax, %rdi" marker. - // TODO: Mark the sequence as bundle, to avoid passes moving other code - // in between. MachineInstr &MI = *MBBI; MachineInstr *OriginalCall; @@ -236,15 +234,23 @@ void X86ExpandPseudo::expandCALL_RVMARKER(MachineBasicBlock &MBB, // Emit call to ObjC runtime. const uint32_t *RegMask = TRI->getCallPreservedMask(*MBB.getParent(), CallingConv::C); - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(X86::CALL64pcrel32)) - .addGlobalAddress(MI.getOperand(0).getGlobal(), 0, 0) - .addRegMask(RegMask) - .addReg(X86::RAX, - RegState::Implicit | - (RAXImplicitDead ? (RegState::Dead | RegState::Define) - : RegState::Define)) - .getInstr(); + MachineInstr *RtCall = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(X86::CALL64pcrel32)) + .addGlobalAddress(MI.getOperand(0).getGlobal(), 0, 0) + .addRegMask(RegMask) + .addReg(X86::RAX, + RegState::Implicit | + (RAXImplicitDead ? (RegState::Dead | RegState::Define) + : RegState::Define)) + .getInstr(); MI.eraseFromParent(); + + auto &TM = MBB.getParent()->getTarget(); + // On Darwin platforms, wrap the expanded sequence in a bundle to prevent + // later optimizations from breaking up the sequence. 
+ if (TM.getTargetTriple().isOSDarwin()) + finalizeBundle(MBB, OriginalCall->getIterator(), + std::next(RtCall->getIterator())); } /// If \p MBBI is a pseudo instruction, this method expands diff --git a/llvm/lib/Target/X86/X86FastTileConfig.cpp b/llvm/lib/Target/X86/X86FastTileConfig.cpp index 87c04a07cd13..47874e82ff3b 100644 --- a/llvm/lib/Target/X86/X86FastTileConfig.cpp +++ b/llvm/lib/Target/X86/X86FastTileConfig.cpp @@ -134,11 +134,7 @@ bool X86FastTileConfig::isAMXInstr(MachineInstr &MI) { if (MI.getOpcode() == X86::PLDTILECFGV || MI.isDebugInstr()) return false; - for (MachineOperand &MO : MI.operands()) - if (isTilePhysReg(MO)) - return true; - - return false; + return llvm::any_of(MI.operands(), isTilePhysReg); } MachineInstr *X86FastTileConfig::getKeyAMXInstr(MachineInstr *MI) { diff --git a/llvm/lib/Target/X86/X86FixupBWInsts.cpp b/llvm/lib/Target/X86/X86FixupBWInsts.cpp index e1d4b4c34772..16bff201dd03 100644 --- a/llvm/lib/Target/X86/X86FixupBWInsts.cpp +++ b/llvm/lib/Target/X86/X86FixupBWInsts.cpp @@ -457,14 +457,12 @@ void FixupBWInstPass::processBasicBlock(MachineFunction &MF, OptForSize = MF.getFunction().hasOptSize() || llvm::shouldOptimizeForSize(&MBB, PSI, MBFI); - for (auto I = MBB.rbegin(); I != MBB.rend(); ++I) { - MachineInstr *MI = &*I; - - if (MachineInstr *NewMI = tryReplaceInstr(MI, MBB)) - MIReplacements.push_back(std::make_pair(MI, NewMI)); + for (MachineInstr &MI : llvm::reverse(MBB)) { + if (MachineInstr *NewMI = tryReplaceInstr(&MI, MBB)) + MIReplacements.push_back(std::make_pair(&MI, NewMI)); // We're done with this instruction, update liveness for the next one. 
- LiveRegs.stepBackward(*MI); + LiveRegs.stepBackward(MI); } while (!MIReplacements.empty()) { diff --git a/llvm/lib/Target/X86/X86FloatingPoint.cpp b/llvm/lib/Target/X86/X86FloatingPoint.cpp index 4d9160f35226..2f0ab4ca9de4 100644 --- a/llvm/lib/Target/X86/X86FloatingPoint.cpp +++ b/llvm/lib/Target/X86/X86FloatingPoint.cpp @@ -1442,7 +1442,7 @@ void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) { assert(UpdatedSlot < StackTop && Dest < 7); Stack[UpdatedSlot] = Dest; RegMap[Dest] = UpdatedSlot; - MBB->getParent()->DeleteMachineInstr(&MI); // Remove the old instruction + MBB->getParent()->deleteMachineInstr(&MI); // Remove the old instruction } /// handleCompareFP - Handle FUCOM and FUCOMI instructions, which have two FP diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index c29ae9f6af4c..0a7aea467809 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -2496,8 +2496,8 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( } // Assign slots for GPRs. It increases frame size. - for (unsigned i = CSI.size(); i != 0; --i) { - unsigned Reg = CSI[i - 1].getReg(); + for (CalleeSavedInfo &I : llvm::reverse(CSI)) { + unsigned Reg = I.getReg(); if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) continue; @@ -2506,15 +2506,15 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( CalleeSavedFrameSize += SlotSize; int SlotIndex = MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset); - CSI[i - 1].setFrameIdx(SlotIndex); + I.setFrameIdx(SlotIndex); } X86FI->setCalleeSavedFrameSize(CalleeSavedFrameSize); MFI.setCVBytesOfCalleeSavedRegisters(CalleeSavedFrameSize); // Assign slots for XMMs. 
- for (unsigned i = CSI.size(); i != 0; --i) { - unsigned Reg = CSI[i - 1].getReg(); + for (CalleeSavedInfo &I : llvm::reverse(CSI)) { + unsigned Reg = I.getReg(); if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) continue; @@ -2533,7 +2533,7 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( // spill into slot SpillSlotOffset -= Size; int SlotIndex = MFI.CreateFixedSpillStackObject(Size, SpillSlotOffset); - CSI[i - 1].setFrameIdx(SlotIndex); + I.setFrameIdx(SlotIndex); MFI.ensureMaxAlignment(Alignment); // Save the start offset and size of XMM in stack frame for funclets. @@ -2559,8 +2559,8 @@ bool X86FrameLowering::spillCalleeSavedRegisters( // Push GPRs. It increases frame size. const MachineFunction &MF = *MBB.getParent(); unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r; - for (unsigned i = CSI.size(); i != 0; --i) { - unsigned Reg = CSI[i - 1].getReg(); + for (const CalleeSavedInfo &I : llvm::reverse(CSI)) { + unsigned Reg = I.getReg(); if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) continue; @@ -2593,8 +2593,8 @@ bool X86FrameLowering::spillCalleeSavedRegisters( // Make XMM regs spilled. X86 does not have ability of push/pop XMM. // It can be done by spilling XMMs to stack frame. 
- for (unsigned i = CSI.size(); i != 0; --i) { - unsigned Reg = CSI[i-1].getReg(); + for (const CalleeSavedInfo &I : llvm::reverse(CSI)) { + unsigned Reg = I.getReg(); if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) continue; @@ -2607,8 +2607,7 @@ bool X86FrameLowering::spillCalleeSavedRegisters( MBB.addLiveIn(Reg); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); - TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i - 1].getFrameIdx(), RC, - TRI); + TII.storeRegToStackSlot(MBB, MI, Reg, true, I.getFrameIdx(), RC, TRI); --MI; MI->setFlag(MachineInstr::FrameSetup); ++MI; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 62b2387396be..6f6361b6757b 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1091,17 +1091,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); + if (VT == MVT::v2i64) continue; + setOperationAction(ISD::ROTL, VT, Custom); + setOperationAction(ISD::ROTR, VT, Custom); } - setOperationAction(ISD::ROTL, MVT::v4i32, Custom); - setOperationAction(ISD::ROTL, MVT::v8i16, Custom); - - // With 512-bit registers or AVX512VL+BW, expanding (and promoting the - // shifts) is better. 
- if (!Subtarget.useAVX512Regs() && - !(Subtarget.hasBWI() && Subtarget.hasVLX())) - setOperationAction(ISD::ROTL, MVT::v16i8, Custom); - setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal); setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal); setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal); @@ -1199,8 +1193,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) { for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, - MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) + MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::ROTL, VT, Custom); + setOperationAction(ISD::ROTR, VT, Custom); + } // XOP can efficiently perform BITREVERSE with VPPERM. for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) @@ -1283,6 +1279,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); + if (VT == MVT::v4i64) continue; + setOperationAction(ISD::ROTL, VT, Custom); + setOperationAction(ISD::ROTR, VT, Custom); } // These types need custom splitting if their input is a 128-bit vector. @@ -1291,13 +1290,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); - setOperationAction(ISD::ROTL, MVT::v8i32, Custom); - setOperationAction(ISD::ROTL, MVT::v16i16, Custom); - - // With BWI, expanding (and promoting the shifts) is the better. 
- if (!Subtarget.useBWIRegs()) - setOperationAction(ISD::ROTL, MVT::v32i8, Custom); - setOperationAction(ISD::SELECT, MVT::v4f64, Custom); setOperationAction(ISD::SELECT, MVT::v4i64, Custom); setOperationAction(ISD::SELECT, MVT::v8i32, Custom); @@ -1662,6 +1654,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); + setOperationAction(ISD::ROTL, VT, Custom); + setOperationAction(ISD::ROTR, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); // The condition codes aren't legal in SSE/AVX and under AVX512 we use @@ -1676,16 +1670,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UMIN, VT, Legal); setOperationAction(ISD::ABS, VT, Legal); setOperationAction(ISD::CTPOP, VT, Custom); - setOperationAction(ISD::ROTL, VT, Custom); - setOperationAction(ISD::ROTR, VT, Custom); setOperationAction(ISD::STRICT_FSETCC, VT, Custom); setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); } - // With BWI, expanding (and promoting the shifts) is the better. - if (!Subtarget.useBWIRegs()) - setOperationAction(ISD::ROTL, MVT::v32i16, Custom); - for (auto VT : { MVT::v64i8, MVT::v32i16 }) { setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom); setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom); @@ -5926,8 +5914,7 @@ static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos, /// from position Pos and ending in Pos+Size is undef or is zero. 
static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) { - return llvm::all_of(Mask.slice(Pos, Size), - [](int M) { return isUndefOrZero(M); }); + return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero); } /// Helper function to test whether a shuffle mask could be @@ -6788,12 +6775,33 @@ void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, } } +// Attempt to constant fold, else just create a VECTOR_SHUFFLE. +static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl, + SDValue V1, SDValue V2, ArrayRef<int> Mask) { + if ((ISD::isBuildVectorOfConstantSDNodes(V1.getNode()) || V1.isUndef()) && + (ISD::isBuildVectorOfConstantSDNodes(V2.getNode()) || V2.isUndef())) { + SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType())); + for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) { + int M = Mask[I]; + if (M < 0) + continue; + SDValue V = (M < NumElts) ? V1 : V2; + if (V.isUndef()) + continue; + Ops[I] = V.getOperand(M % NumElts); + } + return DAG.getBuildVector(VT, dl, Ops); + } + + return DAG.getVectorShuffle(VT, dl, V1, V2, Mask); +} + /// Returns a vector_shuffle node for an unpackl operation. static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2) { SmallVector<int, 8> Mask; createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false); - return DAG.getVectorShuffle(VT, dl, V1, V2, Mask); + return getVectorShuffle(DAG, VT, dl, V1, V2, Mask); } /// Returns a vector_shuffle node for an unpackh operation. @@ -6801,12 +6809,11 @@ static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2) { SmallVector<int, 8> Mask; createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false); - return DAG.getVectorShuffle(VT, dl, V1, V2, Mask); + return getVectorShuffle(DAG, VT, dl, V1, V2, Mask); } /// Returns a node that packs the LHS + RHS nodes together at half width. 
/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half. -/// TODO: Add vXi64 -> vXi32 pack support with vector_shuffle node. /// TODO: Add subvector splitting if/when we have a need for it. static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS, @@ -6818,9 +6825,24 @@ static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget, VT.getSizeInBits() == OpVT.getSizeInBits() && (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() && "Unexpected PACK operand types"); - assert((EltSizeInBits == 8 || EltSizeInBits == 16) && + assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) && "Unexpected PACK result type"); + // Rely on vector shuffles for vXi64 -> vXi32 packing. + if (EltSizeInBits == 32) { + SmallVector<int> PackMask; + int Offset = PackHiHalf ? 1 : 0; + int NumElts = VT.getVectorNumElements(); + for (int I = 0; I != NumElts; I += 4) { + PackMask.push_back(I + Offset); + PackMask.push_back(I + Offset + 2); + PackMask.push_back(I + Offset + NumElts); + PackMask.push_back(I + Offset + NumElts + 2); + } + return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS), + DAG.getBitcast(VT, RHS), PackMask); + } + // See if we already have sufficient leading bits for PACKSS/PACKUS. if (!PackHiHalf) { if (UsePackUS && @@ -15192,12 +15214,10 @@ static SDValue lowerV8I16GeneralSingleInputShuffle( // need // to balance this to ensure we don't form a 3-1 shuffle in the other // half. 
- int NumFlippedAToBInputs = - std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) + - std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1); - int NumFlippedBToBInputs = - std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) + - std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1); + int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) + + llvm::count(AToBInputs, 2 * ADWord + 1); + int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) + + llvm::count(BToBInputs, 2 * BDWord + 1); if ((NumFlippedAToBInputs == 1 && (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) || (NumFlippedBToBInputs == 1 && @@ -25599,6 +25619,7 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, /// Handle vector element shifts where the shift amount may or may not be a /// constant. Takes immediate version of shift as input. +/// TODO: Replace with vector + (splat) idx to avoid extract_element nodes. static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, SDValue ShAmt, const X86Subtarget &Subtarget, @@ -25606,11 +25627,6 @@ static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, MVT SVT = ShAmt.getSimpleValueType(); assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!"); - // Catch shift-by-constant. - if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt)) - return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp, - CShAmt->getZExtValue(), DAG); - // Change opcode to non-immediate version. Opc = getTargetVShiftUniformOpcode(Opc, true); @@ -26342,10 +26358,19 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, DAG.getBitcast(MVT::i16, Ins)); } - case VSHIFT: + case VSHIFT: { + SDValue SrcOp = Op.getOperand(1); + SDValue ShAmt = Op.getOperand(2); + + // Catch shift-by-constant. 
+ if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt)) + return getTargetVShiftByConstNode(IntrData->Opc0, dl, + Op.getSimpleValueType(), SrcOp, + CShAmt->getZExtValue(), DAG); + return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), - Op.getOperand(1), Op.getOperand(2), Subtarget, - DAG); + SrcOp, ShAmt, Subtarget, DAG); + } case COMPRESS_EXPAND_IN_REG: { SDValue Mask = Op.getOperand(3); SDValue DataToCompress = Op.getOperand(1); @@ -26638,7 +26663,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, X86CC = X86::COND_E; break; } - SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end()); + SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops())); SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32); SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2); SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG); @@ -26653,7 +26678,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, else Opcode = X86ISD::PCMPESTR; - SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end()); + SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops())); SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32); return DAG.getNode(Opcode, dl, VTs, NewOps); } @@ -26666,7 +26691,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, else Opcode = X86ISD::PCMPESTR; - SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end()); + SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops())); SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32); return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1); } @@ -28892,10 +28917,13 @@ SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op, // supported by the Subtarget static bool supportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget, unsigned Opcode) { + if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) + return false; + if (VT.getScalarSizeInBits() < 16) return false; - if (VT.is512BitVector() && 
Subtarget.hasAVX512() && + if (VT.is512BitVector() && Subtarget.useAVX512Regs() && (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI())) return true; @@ -28919,6 +28947,8 @@ bool supportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget, // natively supported by the Subtarget static bool supportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget, unsigned Opcode) { + if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) + return false; if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16) return false; @@ -28927,7 +28957,8 @@ static bool supportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget, if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI()) return false; - if (Subtarget.hasAVX512()) + if (Subtarget.hasAVX512() && + (Subtarget.useAVX512Regs() || !VT.is512BitVector())) return true; bool LShift = VT.is128BitVector() || VT.is256BitVector(); @@ -28935,8 +28966,8 @@ static bool supportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget, return (Opcode == ISD::SRA) ? 
AShift : LShift; } -static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { +static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); SDValue R = Op.getOperand(0); @@ -29066,8 +29097,8 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, return SDValue(); } -static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { +static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); SDValue R = Op.getOperand(0); @@ -29166,28 +29197,20 @@ static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl, (Subtarget.hasBWI() && VT == MVT::v64i8))) return SDValue(); - if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) { - SmallVector<SDValue, 8> Elts; - MVT SVT = VT.getVectorElementType(); - unsigned SVTBits = SVT.getSizeInBits(); - APInt One(SVTBits, 1); - unsigned NumElems = VT.getVectorNumElements(); - - for (unsigned i = 0; i != NumElems; ++i) { - SDValue Op = Amt->getOperand(i); - if (Op->isUndef()) { - Elts.push_back(Op); - continue; - } + MVT SVT = VT.getVectorElementType(); + unsigned SVTBits = SVT.getSizeInBits(); + unsigned NumElems = VT.getVectorNumElements(); - ConstantSDNode *ND = cast<ConstantSDNode>(Op); - APInt C(SVTBits, ND->getZExtValue()); - uint64_t ShAmt = C.getZExtValue(); - if (ShAmt >= SVTBits) { - Elts.push_back(DAG.getUNDEF(SVT)); + APInt UndefElts; + SmallVector<APInt> EltBits; + if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) { + APInt One(SVTBits, 1); + SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT)); + for (unsigned I = 0; I != NumElems; ++I) { + if (UndefElts[I] || EltBits[I].uge(SVTBits)) continue; - } - Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT)); + uint64_t ShAmt = 
EltBits[I].getZExtValue(); + Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT); } return DAG.getBuildVector(VT, dl, Elts); } @@ -29233,10 +29256,10 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, assert(VT.isVector() && "Custom lowering only for vector shifts!"); assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!"); - if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget)) + if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget)) return V; - if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget)) + if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget)) return V; if (supportedVectorVarShift(VT, Subtarget, Opc)) @@ -29818,14 +29841,29 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt); } - assert(IsROTL && "Only ROTL supported"); + SDValue Z = DAG.getConstant(0, DL, VT); + + if (!IsROTL) { + // If the ISD::ROTR amount is constant, we're always better converting to + // ISD::ROTL. + if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt})) + return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt); + + // XOP targets always prefers ISD::ROTL. + if (Subtarget.hasXOP()) + return DAG.getNode(ISD::ROTL, DL, VT, R, + DAG.getNode(ISD::SUB, DL, VT, Z, Amt)); + } + + // Split 256-bit integers on XOP/pre-AVX2 targets. + if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2())) + return splitVectorIntBinary(Op, DAG); // XOP has 128-bit vector variable + immediate rotates. // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL. // XOP implicitly uses modulo rotation amounts. if (Subtarget.hasXOP()) { - if (VT.is256BitVector()) - return splitVectorIntBinary(Op, DAG); + assert(IsROTL && "Only ROTL expected"); assert(VT.is128BitVector() && "Only rotate 128-bit vectors!"); // Attempt to rotate by immediate. 
@@ -29839,55 +29877,89 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, return Op; } - // Split 256-bit integers on pre-AVX2 targets. - if (VT.is256BitVector() && !Subtarget.hasAVX2()) - return splitVectorIntBinary(Op, DAG); - - assert((VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 || - ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8 || - VT == MVT::v32i16) && - Subtarget.hasAVX2())) && - "Only vXi32/vXi16/vXi8 vector rotates supported"); - // Rotate by an uniform constant - expand back to shifts. if (IsCstSplat) return SDValue(); - bool IsSplatAmt = DAG.isSplatValue(Amt); - SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT); + // Split 512-bit integers on non 512-bit BWI targets. + if (VT.is512BitVector() && !Subtarget.useBWIRegs()) + return splitVectorIntBinary(Op, DAG); - // v16i8/v32i8: Split rotation into rot4/rot2/rot1 stages and select by - // the amount bit. - if (EltSizeInBits == 8) { - if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) - return SDValue(); + assert( + (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 || + ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) && + Subtarget.hasAVX2()) || + ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) && + "Only vXi32/vXi16/vXi8 vector rotates supported"); - // Check for a hidden ISD::ROTR, vXi8 lowering can handle both, but we - // currently hit infinite loops in legalization if we allow ISD::ROTR. - // FIXME: Infinite ROTL<->ROTR legalization in TargetLowering::expandROT. 
- SDValue HiddenROTRAmt; - if (Amt.getOpcode() == ISD::SUB && - ISD::isBuildVectorAllZeros(Amt.getOperand(0).getNode())) - HiddenROTRAmt = Amt.getOperand(1); + MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits); + MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2); - MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2); + SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT); + SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask); - // If the amount is a splat, attempt to fold as unpack(x,x) << zext(y): - // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw. - // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))). - if (SDValue BaseRotAmt = DAG.getSplatValue(DAG.getNode( - ISD::AND, DL, VT, HiddenROTRAmt ? HiddenROTRAmt : Amt, AmtMask))) { - unsigned ShiftX86Opc = HiddenROTRAmt ? X86ISD::VSRLI : X86ISD::VSHLI; - BaseRotAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseRotAmt); + // Attempt to fold as unpack(x,x) << zext(splat(y)): + // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw. + // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))). + // TODO: Handle vXi16 cases. + if (EltSizeInBits == 8 || EltSizeInBits == 32) { + if (SDValue BaseRotAmt = DAG.getSplatValue(AmtMod)) { + unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI; SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R)); SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R)); + BaseRotAmt = DAG.getZExtOrTrunc(BaseRotAmt, DL, MVT::i32); Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt, Subtarget, DAG); Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt, Subtarget, DAG); - return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !HiddenROTRAmt); + return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL); + } + } + + // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by + // the amount bit. + // TODO: We're doing nothing here that we couldn't do for funnel shifts. 
+ if (EltSizeInBits == 8) { + bool IsConstAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()); + MVT WideVT = + MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts); + unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL; + + // Attempt to fold as: + // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw. + // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))). + if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) && + supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) { + // If we're rotating by constant, just use default promotion. + if (IsConstAmt) + return SDValue(); + // See if we can perform this by widening to vXi16 or vXi32. + R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R); + R = DAG.getNode( + ISD::OR, DL, WideVT, R, + getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG)); + Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod); + R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt); + if (IsROTL) + R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG); + return DAG.getNode(ISD::TRUNCATE, DL, VT, R); } + // Attempt to fold as unpack(x,x) << zext(y): + // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw. + // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))). + if (IsConstAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) { + // See if we can perform this by unpacking to lo/hi vXi16. 
+ SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R)); + SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R)); + SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z)); + SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z)); + SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo); + SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi); + return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL); + } + assert((VT == MVT::v16i8 || VT == MVT::v32i8) && "Unsupported vXi8 type"); + // We don't need ModuloAmt here as we just peek at individual bits. auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) { if (Subtarget.hasSSE41()) { @@ -29907,15 +29979,15 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, return DAG.getSelect(DL, SelVT, C, V0, V1); }; - // 'Hidden' ROTR is currently only profitable on AVX512 targets where we - // have VPTERNLOG. - unsigned ShiftLHS = ISD::SHL; - unsigned ShiftRHS = ISD::SRL; - if (HiddenROTRAmt && useVPTERNLOG(Subtarget, VT)) { - std::swap(ShiftLHS, ShiftRHS); - Amt = HiddenROTRAmt; + // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG. + if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) { + Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt); + IsROTL = true; } + unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL; + unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL; + // Turn 'a' into a mask suitable for VSELECT: a = a << 5; // We can safely do this using i16 shifts as we're only interested in // the 3 lower bits of each byte. @@ -29952,18 +30024,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, return SignBitSelect(VT, Amt, M, R); } - // ISD::ROT* uses modulo rotate amounts. - if (SDValue BaseRotAmt = DAG.getSplatValue(Amt)) { - // If the amount is a splat, perform the modulo BEFORE the splat, - // this helps LowerScalarVariableShift to remove the splat later. 
- Amt = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, BaseRotAmt); - Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask); - Amt = DAG.getVectorShuffle(VT, DL, Amt, DAG.getUNDEF(VT), - SmallVector<int>(NumElts, 0)); - } else { - Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask); - } - + bool IsSplatAmt = DAG.isSplatValue(Amt); bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()); bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) && supportedVectorVarShift(VT, Subtarget, ISD::SRL); @@ -29971,13 +30032,25 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, // Fallback for splats + all supported variable shifts. // Fallback for non-constants AVX2 vXi16 as well. if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) { + Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask); SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT); AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt); - SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt); - SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR); + SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt); + SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR); return DAG.getNode(ISD::OR, DL, VT, SHL, SRL); } + // Everything below assumes ISD::ROTL. + if (!IsROTL) { + Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt); + IsROTL = true; + } + + // ISD::ROT* uses modulo rotate amounts. + Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask); + + assert(IsROTL && "Only ROTL supported"); + // As with shifts, attempt to convert the rotation amount to a multiplication // factor, fallback to general expansion. 
SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG); @@ -32927,11 +33000,6 @@ bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL, bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const { unsigned Bits = Ty->getScalarSizeInBits(); - // 8-bit shifts are always expensive, but versions with a scalar amount aren't - // particularly cheaper than those without. - if (Bits == 8) - return false; - // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts. // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred. if (Subtarget.hasXOP() && @@ -36249,9 +36317,10 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask, (V1.getOpcode() == ISD::SCALAR_TO_VECTOR && isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) { Shuffle = X86ISD::VZEXT_MOVL; - SrcVT = DstVT = MaskEltSize == 16 ? MVT::v8f16 - : !Subtarget.hasSSE2() ? MVT::v4f32 - : MaskVT; + if (MaskEltSize == 16) + SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16); + else + SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT; return true; } } @@ -36300,9 +36369,10 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask, isUndefOrEqual(Mask[0], 0) && isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) { Shuffle = X86ISD::VZEXT_MOVL; - SrcVT = DstVT = MaskEltSize == 16 ? MVT::v8f16 - : !Subtarget.hasSSE2() ? MVT::v4f32 - : MaskVT; + if (MaskEltSize == 16) + SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16); + else + SrcVT = DstVT = !Subtarget.hasSSE2() ? 
MVT::v4f32 : MaskVT; return true; } @@ -40981,6 +41051,28 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode( Op, DemandedBits, DemandedElts, DAG, Depth); } +bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op, + const APInt &DemandedElts, + APInt &UndefElts, + unsigned Depth) const { + unsigned NumElts = DemandedElts.getBitWidth(); + unsigned Opc = Op.getOpcode(); + + switch (Opc) { + case X86ISD::VBROADCAST: + case X86ISD::VBROADCAST_LOAD: + // TODO: Permit vXi64 types on 32-bit targets. + if (isTypeLegal(Op.getValueType().getVectorElementType())) { + UndefElts = APInt::getNullValue(NumElts); + return true; + } + return false; + } + + return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts, + Depth); +} + // Helper to peek through bitops/trunc/setcc to determine size of source vector. // Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>. static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size, @@ -46204,25 +46296,27 @@ static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG, static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); + SDLoc dl(N); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // If this is SSE1 only convert to FAND to avoid scalarization. if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) { - return DAG.getBitcast( - MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32, - DAG.getBitcast(MVT::v4f32, N->getOperand(0)), - DAG.getBitcast(MVT::v4f32, N->getOperand(1)))); + return DAG.getBitcast(MVT::v4i32, + DAG.getNode(X86ISD::FAND, dl, MVT::v4f32, + DAG.getBitcast(MVT::v4f32, N0), + DAG.getBitcast(MVT::v4f32, N1))); } // Use a 32-bit and+zext if upper bits known zero. 
- if (VT == MVT::i64 && Subtarget.is64Bit() && - !isa<ConstantSDNode>(N->getOperand(1))) { + if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) { APInt HiMask = APInt::getHighBitsSet(64, 32); - if (DAG.MaskedValueIsZero(N->getOperand(1), HiMask) || - DAG.MaskedValueIsZero(N->getOperand(0), HiMask)) { - SDLoc dl(N); - SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(0)); - SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(1)); + if (DAG.MaskedValueIsZero(N1, HiMask) || + DAG.MaskedValueIsZero(N0, HiMask)) { + SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0); + SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS)); } @@ -46235,8 +46329,6 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, SmallVector<APInt, 2> SrcPartials; if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) && SrcOps.size() == 1) { - SDLoc dl(N); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements(); EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget); @@ -46276,33 +46368,57 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget)) return R; - // Attempt to recursively combine a bitmask AND with shuffles. if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) { + // Attempt to recursively combine a bitmask AND with shuffles. SDValue Op(N, 0); if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) return Res; + + // If either operand is a constant mask, then only the elements that aren't + // zero are actually demanded by the other operand. 
+ auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) { + APInt UndefElts; + SmallVector<APInt> EltBits; + int NumElts = VT.getVectorNumElements(); + int EltSizeInBits = VT.getScalarSizeInBits(); + if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits)) + return false; + + APInt DemandedElts = APInt::getZero(NumElts); + for (int I = 0; I != NumElts; ++I) + if (!EltBits[I].isZero()) + DemandedElts.setBit(I); + + APInt KnownUndef, KnownZero; + return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, KnownUndef, + KnownZero, DCI); + }; + if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) { + if (N->getOpcode() != ISD::DELETED_NODE) + DCI.AddToWorklist(N); + return SDValue(N, 0); + } } // Attempt to combine a scalar bitmask AND with an extracted shuffle. if ((VT.getScalarSizeInBits() % 8) == 0 && - N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && - isa<ConstantSDNode>(N->getOperand(0).getOperand(1))) { - SDValue BitMask = N->getOperand(1); - SDValue SrcVec = N->getOperand(0).getOperand(0); + N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && + isa<ConstantSDNode>(N0.getOperand(1))) { + SDValue BitMask = N1; + SDValue SrcVec = N0.getOperand(0); EVT SrcVecVT = SrcVec.getValueType(); // Check that the constant bitmask masks whole bytes. APInt UndefElts; SmallVector<APInt, 64> EltBits; - if (VT == SrcVecVT.getScalarType() && - N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) && + if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) && getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) && llvm::all_of(EltBits, [](const APInt &M) { return M.isZero() || M.isAllOnes(); })) { unsigned NumElts = SrcVecVT.getVectorNumElements(); unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8; - unsigned Idx = N->getOperand(0).getConstantOperandVal(1); + unsigned Idx = N0.getConstantOperandVal(1); // Create a root shuffle mask from the byte mask and the extracted index. 
SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef); @@ -46318,8 +46434,8 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, X86::MaxShuffleCombineDepth, /*HasVarMask*/ false, /*AllowVarCrossLaneMask*/ true, /*AllowVarPerLaneMask*/ true, DAG, Subtarget)) - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle, - N->getOperand(0).getOperand(1)); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle, + N0.getOperand(1)); } } @@ -46644,11 +46760,13 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); + SDLoc dl(N); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // If this is SSE1 only convert to FOR to avoid scalarization. if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) { return DAG.getBitcast(MVT::v4i32, - DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32, + DAG.getNode(X86ISD::FOR, dl, MVT::v4f32, DAG.getBitcast(MVT::v4f32, N0), DAG.getBitcast(MVT::v4f32, N1))); } @@ -46660,8 +46778,6 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, SmallVector<APInt, 2> SrcPartials; if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) && SrcOps.size() == 1) { - SDLoc dl(N); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements(); EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget); @@ -46707,7 +46823,6 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL && N1.getConstantOperandAPInt(1) == HalfElts && DAG.MaskedValueIsZero(N0, APInt(1, 1), UpperElts)) { - SDLoc dl(N); return DAG.getNode( ISD::CONCAT_VECTORS, dl, VT, extractSubVector(N0, 0, DAG, dl, HalfElts), @@ -46716,7 +46831,6 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, if (NumElts >= 16 && N0.getOpcode() == 
X86ISD::KSHIFTL && N0.getConstantOperandAPInt(1) == HalfElts && DAG.MaskedValueIsZero(N1, APInt(1, 1), UpperElts)) { - SDLoc dl(N); return DAG.getNode( ISD::CONCAT_VECTORS, dl, VT, extractSubVector(N1, 0, DAG, dl, HalfElts), @@ -46724,11 +46838,36 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, } } - // Attempt to recursively combine an OR of shuffles. if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) { + // Attempt to recursively combine an OR of shuffles. SDValue Op(N, 0); if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) return Res; + + // If either operand is a constant mask, then only the elements that aren't + // allones are actually demanded by the other operand. + auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) { + APInt UndefElts; + SmallVector<APInt> EltBits; + int NumElts = VT.getVectorNumElements(); + int EltSizeInBits = VT.getScalarSizeInBits(); + if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits)) + return false; + + APInt DemandedElts = APInt::getZero(NumElts); + for (int I = 0; I != NumElts; ++I) + if (!EltBits[I].isAllOnes()) + DemandedElts.setBit(I); + + APInt KnownUndef, KnownZero; + return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, KnownUndef, + KnownZero, DCI); + }; + if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) { + if (N->getOpcode() != ISD::DELETED_NODE) + DCI.AddToWorklist(N); + return SDValue(N, 0); + } } // We should fold "masked merge" patterns when `andn` is not available. @@ -52111,7 +52250,7 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, case X86ISD::VSHLI: case X86ISD::VSRLI: // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle. - // TODO: Move this to LowerScalarImmediateShift? + // TODO: Move this to LowerShiftByScalarImmediate? 
if (VT == MVT::v4i64 && !Subtarget.hasInt256() && llvm::all_of(Ops, [](SDValue Op) { return Op.getConstantOperandAPInt(1) == 32; diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 6805cb75f0f2..d1d6e319f16b 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1100,6 +1100,12 @@ namespace llvm { bool shouldSplatInsEltVarIndex(EVT VT) const override; + bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override { + // Converting to sat variants holds little benefit on X86 as we will just + // need to saturate the value back using fp arithmatic. + return Op != ISD::FP_TO_UINT_SAT && isOperationLegalOrCustom(Op, VT); + } + bool convertSetCCLogicToBitwiseLogic(EVT VT) const override { return VT.isScalarInteger(); } @@ -1153,6 +1159,10 @@ namespace llvm { SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const override; + bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, + APInt &UndefElts, + unsigned Depth) const override; + const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const override; SDValue unwrapAddress(SDValue N) const override; diff --git a/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp b/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp index 732b2b1a5ada..6642f46e64b2 100644 --- a/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp +++ b/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp @@ -137,8 +137,10 @@ bool X86IndirectBranchTrackingPass::runOnMachineFunction(MachineFunction &MF) { Changed |= addENDBR(MBB, MBB.begin()); for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) { - if (I->isCall() && IsCallReturnTwice(I->getOperand(0))) + if (I->isCall() && I->getNumOperands() > 0 && + IsCallReturnTwice(I->getOperand(0))) { Changed |= addENDBR(MBB, std::next(I)); + } } // Exception handle may indirectly jump to catch pad, So we should add diff --git 
a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 1db83033ba35..ecd4777c3533 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -9958,74 +9958,74 @@ multiclass avx512_trunc_wb<bits<8> opc, string OpcodeStr, SDNode OpNode, } defm VPMOVQB : avx512_trunc_qb<0x32, "vpmovqb", - WriteShuffle256, truncstorevi8, + WriteVPMOV256, truncstorevi8, masked_truncstorevi8, X86vtrunc, X86vmtrunc>; defm VPMOVSQB : avx512_trunc_qb<0x22, "vpmovsqb", - WriteShuffle256, truncstore_s_vi8, + WriteVPMOV256, truncstore_s_vi8, masked_truncstore_s_vi8, X86vtruncs, X86vmtruncs>; defm VPMOVUSQB : avx512_trunc_qb<0x12, "vpmovusqb", - WriteShuffle256, truncstore_us_vi8, + WriteVPMOV256, truncstore_us_vi8, masked_truncstore_us_vi8, X86vtruncus, X86vmtruncus>; defm VPMOVQW : avx512_trunc_qw<0x34, "vpmovqw", trunc, select_trunc, - WriteShuffle256, truncstorevi16, + WriteVPMOV256, truncstorevi16, masked_truncstorevi16, X86vtrunc, X86vmtrunc>; defm VPMOVSQW : avx512_trunc_qw<0x24, "vpmovsqw", X86vtruncs, select_truncs, - WriteShuffle256, truncstore_s_vi16, + WriteVPMOV256, truncstore_s_vi16, masked_truncstore_s_vi16, X86vtruncs, X86vmtruncs>; defm VPMOVUSQW : avx512_trunc_qw<0x14, "vpmovusqw", X86vtruncus, - select_truncus, WriteShuffle256, + select_truncus, WriteVPMOV256, truncstore_us_vi16, masked_truncstore_us_vi16, X86vtruncus, X86vmtruncus>; defm VPMOVQD : avx512_trunc_qd<0x35, "vpmovqd", trunc, select_trunc, - WriteShuffle256, truncstorevi32, + WriteVPMOV256, truncstorevi32, masked_truncstorevi32, X86vtrunc, X86vmtrunc>; defm VPMOVSQD : avx512_trunc_qd<0x25, "vpmovsqd", X86vtruncs, select_truncs, - WriteShuffle256, truncstore_s_vi32, + WriteVPMOV256, truncstore_s_vi32, masked_truncstore_s_vi32, X86vtruncs, X86vmtruncs>; defm VPMOVUSQD : avx512_trunc_qd<0x15, "vpmovusqd", X86vtruncus, - select_truncus, WriteShuffle256, + select_truncus, WriteVPMOV256, truncstore_us_vi32, masked_truncstore_us_vi32, X86vtruncus, 
X86vmtruncus>; defm VPMOVDB : avx512_trunc_db<0x31, "vpmovdb", trunc, select_trunc, - WriteShuffle256, truncstorevi8, + WriteVPMOV256, truncstorevi8, masked_truncstorevi8, X86vtrunc, X86vmtrunc>; defm VPMOVSDB : avx512_trunc_db<0x21, "vpmovsdb", X86vtruncs, select_truncs, - WriteShuffle256, truncstore_s_vi8, + WriteVPMOV256, truncstore_s_vi8, masked_truncstore_s_vi8, X86vtruncs, X86vmtruncs>; defm VPMOVUSDB : avx512_trunc_db<0x11, "vpmovusdb", X86vtruncus, - select_truncus, WriteShuffle256, + select_truncus, WriteVPMOV256, truncstore_us_vi8, masked_truncstore_us_vi8, X86vtruncus, X86vmtruncus>; defm VPMOVDW : avx512_trunc_dw<0x33, "vpmovdw", trunc, select_trunc, - WriteShuffle256, truncstorevi16, + WriteVPMOV256, truncstorevi16, masked_truncstorevi16, X86vtrunc, X86vmtrunc>; defm VPMOVSDW : avx512_trunc_dw<0x23, "vpmovsdw", X86vtruncs, select_truncs, - WriteShuffle256, truncstore_s_vi16, + WriteVPMOV256, truncstore_s_vi16, masked_truncstore_s_vi16, X86vtruncs, X86vmtruncs>; defm VPMOVUSDW : avx512_trunc_dw<0x13, "vpmovusdw", X86vtruncus, - select_truncus, WriteShuffle256, + select_truncus, WriteVPMOV256, truncstore_us_vi16, masked_truncstore_us_vi16, X86vtruncus, X86vmtruncus>; defm VPMOVWB : avx512_trunc_wb<0x30, "vpmovwb", trunc, select_trunc, - WriteShuffle256, truncstorevi8, + WriteVPMOV256, truncstorevi8, masked_truncstorevi8, X86vtrunc, X86vmtrunc>; defm VPMOVSWB : avx512_trunc_wb<0x20, "vpmovswb", X86vtruncs, select_truncs, - WriteShuffle256, truncstore_s_vi8, + WriteVPMOV256, truncstore_s_vi8, masked_truncstore_s_vi8, X86vtruncs, X86vmtruncs>; defm VPMOVUSWB : avx512_trunc_wb<0x10, "vpmovuswb", X86vtruncus, - select_truncus, WriteShuffle256, + select_truncus, WriteVPMOV256, truncstore_us_vi8, masked_truncstore_us_vi8, X86vtruncus, X86vmtruncus>; @@ -10084,7 +10084,7 @@ defm : mtrunc_lowering<"VPMOVSQWZ", X86vmtruncs, v8i16x_info, v8i64_info>; defm : mtrunc_lowering<"VPMOVUSQWZ", X86vmtruncus, v8i16x_info, v8i64_info>; } -multiclass 
WriteShuffle256_common<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched, +multiclass avx512_pmovx_common<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo, X86MemOperand x86memop, PatFrag LdFrag, SDNode OpNode>{ let ExeDomain = DestInfo.ExeDomain in { @@ -10100,135 +10100,140 @@ multiclass WriteShuffle256_common<bits<8> opc, string OpcodeStr, X86FoldableSche } } -multiclass WriteShuffle256_BW<bits<8> opc, string OpcodeStr, +multiclass avx512_pmovx_bw<bits<8> opc, string OpcodeStr, SDNode OpNode, SDNode InVecNode, string ExtTy, - X86FoldableSchedWrite sched, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> { + X86FoldableSchedWrite schedX, X86FoldableSchedWrite schedYZ, + PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> { let Predicates = [HasVLX, HasBWI] in { - defm Z128: WriteShuffle256_common<opc, OpcodeStr, sched, v8i16x_info, + defm Z128: avx512_pmovx_common<opc, OpcodeStr, schedX, v8i16x_info, v16i8x_info, i64mem, LdFrag, InVecNode>, EVEX_CD8<8, CD8VH>, T8PD, EVEX_V128, VEX_WIG; - defm Z256: WriteShuffle256_common<opc, OpcodeStr, sched, v16i16x_info, + defm Z256: avx512_pmovx_common<opc, OpcodeStr, schedYZ, v16i16x_info, v16i8x_info, i128mem, LdFrag, OpNode>, EVEX_CD8<8, CD8VH>, T8PD, EVEX_V256, VEX_WIG; } let Predicates = [HasBWI] in { - defm Z : WriteShuffle256_common<opc, OpcodeStr, sched, v32i16_info, + defm Z : avx512_pmovx_common<opc, OpcodeStr, schedYZ, v32i16_info, v32i8x_info, i256mem, LdFrag, OpNode>, EVEX_CD8<8, CD8VH>, T8PD, EVEX_V512, VEX_WIG; } } -multiclass WriteShuffle256_BD<bits<8> opc, string OpcodeStr, +multiclass avx512_pmovx_bd<bits<8> opc, string OpcodeStr, SDNode OpNode, SDNode InVecNode, string ExtTy, - X86FoldableSchedWrite sched, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> { + X86FoldableSchedWrite schedX, X86FoldableSchedWrite schedYZ, + PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> { let Predicates = [HasVLX, HasAVX512] in { - defm Z128: 
WriteShuffle256_common<opc, OpcodeStr, sched, v4i32x_info, + defm Z128: avx512_pmovx_common<opc, OpcodeStr, schedX, v4i32x_info, v16i8x_info, i32mem, LdFrag, InVecNode>, EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V128, VEX_WIG; - defm Z256: WriteShuffle256_common<opc, OpcodeStr, sched, v8i32x_info, + defm Z256: avx512_pmovx_common<opc, OpcodeStr, schedYZ, v8i32x_info, v16i8x_info, i64mem, LdFrag, InVecNode>, EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V256, VEX_WIG; } let Predicates = [HasAVX512] in { - defm Z : WriteShuffle256_common<opc, OpcodeStr, sched, v16i32_info, + defm Z : avx512_pmovx_common<opc, OpcodeStr, schedYZ, v16i32_info, v16i8x_info, i128mem, LdFrag, OpNode>, EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V512, VEX_WIG; } } -multiclass WriteShuffle256_BQ<bits<8> opc, string OpcodeStr, +multiclass avx512_pmovx_bq<bits<8> opc, string OpcodeStr, SDNode InVecNode, string ExtTy, - X86FoldableSchedWrite sched, + X86FoldableSchedWrite schedX, X86FoldableSchedWrite schedYZ, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> { let Predicates = [HasVLX, HasAVX512] in { - defm Z128: WriteShuffle256_common<opc, OpcodeStr, sched, v2i64x_info, + defm Z128: avx512_pmovx_common<opc, OpcodeStr, schedX, v2i64x_info, v16i8x_info, i16mem, LdFrag, InVecNode>, EVEX_CD8<8, CD8VO>, T8PD, EVEX_V128, VEX_WIG; - defm Z256: WriteShuffle256_common<opc, OpcodeStr, sched, v4i64x_info, + defm Z256: avx512_pmovx_common<opc, OpcodeStr, schedYZ, v4i64x_info, v16i8x_info, i32mem, LdFrag, InVecNode>, EVEX_CD8<8, CD8VO>, T8PD, EVEX_V256, VEX_WIG; } let Predicates = [HasAVX512] in { - defm Z : WriteShuffle256_common<opc, OpcodeStr, sched, v8i64_info, + defm Z : avx512_pmovx_common<opc, OpcodeStr, schedYZ, v8i64_info, v16i8x_info, i64mem, LdFrag, InVecNode>, EVEX_CD8<8, CD8VO>, T8PD, EVEX_V512, VEX_WIG; } } -multiclass WriteShuffle256_WD<bits<8> opc, string OpcodeStr, +multiclass avx512_pmovx_wd<bits<8> opc, string OpcodeStr, SDNode OpNode, SDNode InVecNode, string ExtTy, - X86FoldableSchedWrite sched, PatFrag LdFrag = 
!cast<PatFrag>(ExtTy#"extloadvi16")> { + X86FoldableSchedWrite schedX, X86FoldableSchedWrite schedYZ, + PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> { let Predicates = [HasVLX, HasAVX512] in { - defm Z128: WriteShuffle256_common<opc, OpcodeStr, sched, v4i32x_info, + defm Z128: avx512_pmovx_common<opc, OpcodeStr, schedX, v4i32x_info, v8i16x_info, i64mem, LdFrag, InVecNode>, EVEX_CD8<16, CD8VH>, T8PD, EVEX_V128, VEX_WIG; - defm Z256: WriteShuffle256_common<opc, OpcodeStr, sched, v8i32x_info, + defm Z256: avx512_pmovx_common<opc, OpcodeStr, schedYZ, v8i32x_info, v8i16x_info, i128mem, LdFrag, OpNode>, EVEX_CD8<16, CD8VH>, T8PD, EVEX_V256, VEX_WIG; } let Predicates = [HasAVX512] in { - defm Z : WriteShuffle256_common<opc, OpcodeStr, sched, v16i32_info, + defm Z : avx512_pmovx_common<opc, OpcodeStr, schedYZ, v16i32_info, v16i16x_info, i256mem, LdFrag, OpNode>, EVEX_CD8<16, CD8VH>, T8PD, EVEX_V512, VEX_WIG; } } -multiclass WriteShuffle256_WQ<bits<8> opc, string OpcodeStr, +multiclass avx512_pmovx_wq<bits<8> opc, string OpcodeStr, SDNode OpNode, SDNode InVecNode, string ExtTy, - X86FoldableSchedWrite sched, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> { + X86FoldableSchedWrite schedX, X86FoldableSchedWrite schedYZ, + PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> { let Predicates = [HasVLX, HasAVX512] in { - defm Z128: WriteShuffle256_common<opc, OpcodeStr, sched, v2i64x_info, + defm Z128: avx512_pmovx_common<opc, OpcodeStr, schedX, v2i64x_info, v8i16x_info, i32mem, LdFrag, InVecNode>, EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V128, VEX_WIG; - defm Z256: WriteShuffle256_common<opc, OpcodeStr, sched, v4i64x_info, + defm Z256: avx512_pmovx_common<opc, OpcodeStr, schedYZ, v4i64x_info, v8i16x_info, i64mem, LdFrag, InVecNode>, EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V256, VEX_WIG; } let Predicates = [HasAVX512] in { - defm Z : WriteShuffle256_common<opc, OpcodeStr, sched, v8i64_info, + defm Z : avx512_pmovx_common<opc, OpcodeStr, schedYZ, v8i64_info, v8i16x_info, 
i128mem, LdFrag, OpNode>, EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V512, VEX_WIG; } } -multiclass WriteShuffle256_DQ<bits<8> opc, string OpcodeStr, +multiclass avx512_pmovx_dq<bits<8> opc, string OpcodeStr, SDNode OpNode, SDNode InVecNode, string ExtTy, - X86FoldableSchedWrite sched, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi32")> { + X86FoldableSchedWrite schedX, X86FoldableSchedWrite schedYZ, + PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi32")> { let Predicates = [HasVLX, HasAVX512] in { - defm Z128: WriteShuffle256_common<opc, OpcodeStr, sched, v2i64x_info, + defm Z128: avx512_pmovx_common<opc, OpcodeStr, schedX, v2i64x_info, v4i32x_info, i64mem, LdFrag, InVecNode>, EVEX_CD8<32, CD8VH>, T8PD, EVEX_V128; - defm Z256: WriteShuffle256_common<opc, OpcodeStr, sched, v4i64x_info, + defm Z256: avx512_pmovx_common<opc, OpcodeStr, schedYZ, v4i64x_info, v4i32x_info, i128mem, LdFrag, OpNode>, EVEX_CD8<32, CD8VH>, T8PD, EVEX_V256; } let Predicates = [HasAVX512] in { - defm Z : WriteShuffle256_common<opc, OpcodeStr, sched, v8i64_info, + defm Z : avx512_pmovx_common<opc, OpcodeStr, schedYZ, v8i64_info, v8i32x_info, i256mem, LdFrag, OpNode>, EVEX_CD8<32, CD8VH>, T8PD, EVEX_V512; } } -defm VPMOVZXBW : WriteShuffle256_BW<0x30, "vpmovzxbw", zext, zext_invec, "z", WriteShuffle256>; -defm VPMOVZXBD : WriteShuffle256_BD<0x31, "vpmovzxbd", zext, zext_invec, "z", WriteShuffle256>; -defm VPMOVZXBQ : WriteShuffle256_BQ<0x32, "vpmovzxbq", zext_invec, "z", WriteShuffle256>; -defm VPMOVZXWD : WriteShuffle256_WD<0x33, "vpmovzxwd", zext, zext_invec, "z", WriteShuffle256>; -defm VPMOVZXWQ : WriteShuffle256_WQ<0x34, "vpmovzxwq", zext, zext_invec, "z", WriteShuffle256>; -defm VPMOVZXDQ : WriteShuffle256_DQ<0x35, "vpmovzxdq", zext, zext_invec, "z", WriteShuffle256>; +defm VPMOVZXBW : avx512_pmovx_bw<0x30, "vpmovzxbw", zext, zext_invec, "z", SchedWriteShuffle.XMM, WriteVPMOV256>; +defm VPMOVZXBD : avx512_pmovx_bd<0x31, "vpmovzxbd", zext, zext_invec, "z", SchedWriteShuffle.XMM, WriteVPMOV256>; 
+defm VPMOVZXBQ : avx512_pmovx_bq<0x32, "vpmovzxbq", zext_invec, "z", SchedWriteShuffle.XMM, WriteVPMOV256>; +defm VPMOVZXWD : avx512_pmovx_wd<0x33, "vpmovzxwd", zext, zext_invec, "z", SchedWriteShuffle.XMM, WriteVPMOV256>; +defm VPMOVZXWQ : avx512_pmovx_wq<0x34, "vpmovzxwq", zext, zext_invec, "z", SchedWriteShuffle.XMM, WriteVPMOV256>; +defm VPMOVZXDQ : avx512_pmovx_dq<0x35, "vpmovzxdq", zext, zext_invec, "z", SchedWriteShuffle.XMM, WriteVPMOV256>; -defm VPMOVSXBW: WriteShuffle256_BW<0x20, "vpmovsxbw", sext, sext_invec, "s", WriteShuffle256>; -defm VPMOVSXBD: WriteShuffle256_BD<0x21, "vpmovsxbd", sext, sext_invec, "s", WriteShuffle256>; -defm VPMOVSXBQ: WriteShuffle256_BQ<0x22, "vpmovsxbq", sext_invec, "s", WriteShuffle256>; -defm VPMOVSXWD: WriteShuffle256_WD<0x23, "vpmovsxwd", sext, sext_invec, "s", WriteShuffle256>; -defm VPMOVSXWQ: WriteShuffle256_WQ<0x24, "vpmovsxwq", sext, sext_invec, "s", WriteShuffle256>; -defm VPMOVSXDQ: WriteShuffle256_DQ<0x25, "vpmovsxdq", sext, sext_invec, "s", WriteShuffle256>; +defm VPMOVSXBW: avx512_pmovx_bw<0x20, "vpmovsxbw", sext, sext_invec, "s", SchedWriteShuffle.XMM, WriteVPMOV256>; +defm VPMOVSXBD: avx512_pmovx_bd<0x21, "vpmovsxbd", sext, sext_invec, "s", SchedWriteShuffle.XMM, WriteVPMOV256>; +defm VPMOVSXBQ: avx512_pmovx_bq<0x22, "vpmovsxbq", sext_invec, "s", SchedWriteShuffle.XMM, WriteVPMOV256>; +defm VPMOVSXWD: avx512_pmovx_wd<0x23, "vpmovsxwd", sext, sext_invec, "s", SchedWriteShuffle.XMM, WriteVPMOV256>; +defm VPMOVSXWQ: avx512_pmovx_wq<0x24, "vpmovsxwq", sext, sext_invec, "s", SchedWriteShuffle.XMM, WriteVPMOV256>; +defm VPMOVSXDQ: avx512_pmovx_dq<0x25, "vpmovsxdq", sext, sext_invec, "s", SchedWriteShuffle.XMM, WriteVPMOV256>; // Patterns that we also need any extend versions of. 
aext_vector_inreg @@ -10523,21 +10528,22 @@ defm VSCATTERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dpd defm VSCATTERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qpd", VK8WM, vz512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; -multiclass cvt_by_vec_width<bits<8> opc, X86VectorVTInfo Vec, string OpcodeStr > { +multiclass cvt_by_vec_width<bits<8> opc, X86VectorVTInfo Vec, string OpcodeStr, SchedWrite Sched> { def rr : AVX512XS8I<opc, MRMSrcReg, (outs Vec.RC:$dst), (ins Vec.KRC:$src), !strconcat(OpcodeStr#Vec.Suffix, "\t{$src, $dst|$dst, $src}"), [(set Vec.RC:$dst, (Vec.VT (sext Vec.KRC:$src)))]>, - EVEX, Sched<[WriteMove]>; // TODO - WriteVecTrunc? + EVEX, Sched<[Sched]>; } multiclass cvt_mask_by_elt_width<bits<8> opc, AVX512VLVectorVTInfo VTInfo, string OpcodeStr, Predicate prd> { +// TODO - Replace WriteMove with WriteVecTrunc? let Predicates = [prd] in - defm Z : cvt_by_vec_width<opc, VTInfo.info512, OpcodeStr>, EVEX_V512; + defm Z : cvt_by_vec_width<opc, VTInfo.info512, OpcodeStr, WriteMove>, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : cvt_by_vec_width<opc, VTInfo.info256, OpcodeStr>, EVEX_V256; - defm Z128 : cvt_by_vec_width<opc, VTInfo.info128, OpcodeStr>, EVEX_V128; + defm Z256 : cvt_by_vec_width<opc, VTInfo.info256, OpcodeStr, WriteMove>, EVEX_V256; + defm Z128 : cvt_by_vec_width<opc, VTInfo.info128, OpcodeStr, WriteMove>, EVEX_V128; } } diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index ba52283b570d..7288ce812138 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -260,10 +260,10 @@ let isPseudo = 1, SchedRW = [WriteSystem] in { // Pseudo instructions used by address sanitizer. 
//===----------------------------------------------------------------------===// let - Defs = [R8, EFLAGS] in { + Defs = [R10, R11, EFLAGS] in { def ASAN_CHECK_MEMACCESS : PseudoI< - (outs), (ins GR64NoR8:$addr, i32imm:$accessinfo), - [(int_asan_check_memaccess GR64NoR8:$addr, (i32 timm:$accessinfo))]>, + (outs), (ins GR64PLTSafe:$addr, i32imm:$accessinfo), + [(int_asan_check_memaccess GR64PLTSafe:$addr, (i32 timm:$accessinfo))]>, Sched<[]>; } diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/llvm/lib/Target/X86/X86InstrFoldTables.cpp index 6d4ad08842c7..226349485238 100644 --- a/llvm/lib/Target/X86/X86InstrFoldTables.cpp +++ b/llvm/lib/Target/X86/X86InstrFoldTables.cpp @@ -529,11 +529,11 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = { { X86::LZCNT16rr, X86::LZCNT16rm, 0 }, { X86::LZCNT32rr, X86::LZCNT32rm, 0 }, { X86::LZCNT64rr, X86::LZCNT64rm, 0 }, - { X86::MMX_CVTPD2PIirr, X86::MMX_CVTPD2PIirm, TB_ALIGN_16 }, - { X86::MMX_CVTPI2PDirr, X86::MMX_CVTPI2PDirm, 0 }, - { X86::MMX_CVTPS2PIirr, X86::MMX_CVTPS2PIirm, TB_NO_REVERSE }, - { X86::MMX_CVTTPD2PIirr, X86::MMX_CVTTPD2PIirm, TB_ALIGN_16 }, - { X86::MMX_CVTTPS2PIirr, X86::MMX_CVTTPS2PIirm, TB_NO_REVERSE }, + { X86::MMX_CVTPD2PIrr, X86::MMX_CVTPD2PIrm, TB_ALIGN_16 }, + { X86::MMX_CVTPI2PDrr, X86::MMX_CVTPI2PDrm, 0 }, + { X86::MMX_CVTPS2PIrr, X86::MMX_CVTPS2PIrm, TB_NO_REVERSE }, + { X86::MMX_CVTTPD2PIrr, X86::MMX_CVTTPD2PIrm, TB_ALIGN_16 }, + { X86::MMX_CVTTPS2PIrr, X86::MMX_CVTTPS2PIrm, TB_NO_REVERSE }, { X86::MMX_MOVD64to64rr, X86::MMX_MOVQ64rm, 0 }, { X86::MMX_PABSBrr, X86::MMX_PABSBrm, 0 }, { X86::MMX_PABSDrr, X86::MMX_PABSDrm, 0 }, @@ -1339,29 +1339,29 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::MINSDrr_Int, X86::MINSDrm_Int, TB_NO_REVERSE }, { X86::MINSSrr, X86::MINSSrm, 0 }, { X86::MINSSrr_Int, X86::MINSSrm_Int, TB_NO_REVERSE }, - { X86::MMX_CVTPI2PSirr, X86::MMX_CVTPI2PSirm, 0 }, - { X86::MMX_PACKSSDWirr, X86::MMX_PACKSSDWirm, 0 }, - { X86::MMX_PACKSSWBirr, 
X86::MMX_PACKSSWBirm, 0 }, - { X86::MMX_PACKUSWBirr, X86::MMX_PACKUSWBirm, 0 }, - { X86::MMX_PADDBirr, X86::MMX_PADDBirm, 0 }, - { X86::MMX_PADDDirr, X86::MMX_PADDDirm, 0 }, - { X86::MMX_PADDQirr, X86::MMX_PADDQirm, 0 }, - { X86::MMX_PADDSBirr, X86::MMX_PADDSBirm, 0 }, - { X86::MMX_PADDSWirr, X86::MMX_PADDSWirm, 0 }, - { X86::MMX_PADDUSBirr, X86::MMX_PADDUSBirm, 0 }, - { X86::MMX_PADDUSWirr, X86::MMX_PADDUSWirm, 0 }, - { X86::MMX_PADDWirr, X86::MMX_PADDWirm, 0 }, + { X86::MMX_CVTPI2PSrr, X86::MMX_CVTPI2PSrm, 0 }, + { X86::MMX_PACKSSDWrr, X86::MMX_PACKSSDWrm, 0 }, + { X86::MMX_PACKSSWBrr, X86::MMX_PACKSSWBrm, 0 }, + { X86::MMX_PACKUSWBrr, X86::MMX_PACKUSWBrm, 0 }, + { X86::MMX_PADDBrr, X86::MMX_PADDBrm, 0 }, + { X86::MMX_PADDDrr, X86::MMX_PADDDrm, 0 }, + { X86::MMX_PADDQrr, X86::MMX_PADDQrm, 0 }, + { X86::MMX_PADDSBrr, X86::MMX_PADDSBrm, 0 }, + { X86::MMX_PADDSWrr, X86::MMX_PADDSWrm, 0 }, + { X86::MMX_PADDUSBrr, X86::MMX_PADDUSBrm, 0 }, + { X86::MMX_PADDUSWrr, X86::MMX_PADDUSWrm, 0 }, + { X86::MMX_PADDWrr, X86::MMX_PADDWrm, 0 }, { X86::MMX_PALIGNRrri, X86::MMX_PALIGNRrmi, 0 }, - { X86::MMX_PANDNirr, X86::MMX_PANDNirm, 0 }, - { X86::MMX_PANDirr, X86::MMX_PANDirm, 0 }, - { X86::MMX_PAVGBirr, X86::MMX_PAVGBirm, 0 }, - { X86::MMX_PAVGWirr, X86::MMX_PAVGWirm, 0 }, - { X86::MMX_PCMPEQBirr, X86::MMX_PCMPEQBirm, 0 }, - { X86::MMX_PCMPEQDirr, X86::MMX_PCMPEQDirm, 0 }, - { X86::MMX_PCMPEQWirr, X86::MMX_PCMPEQWirm, 0 }, - { X86::MMX_PCMPGTBirr, X86::MMX_PCMPGTBirm, 0 }, - { X86::MMX_PCMPGTDirr, X86::MMX_PCMPGTDirm, 0 }, - { X86::MMX_PCMPGTWirr, X86::MMX_PCMPGTWirm, 0 }, + { X86::MMX_PANDNrr, X86::MMX_PANDNrm, 0 }, + { X86::MMX_PANDrr, X86::MMX_PANDrm, 0 }, + { X86::MMX_PAVGBrr, X86::MMX_PAVGBrm, 0 }, + { X86::MMX_PAVGWrr, X86::MMX_PAVGWrm, 0 }, + { X86::MMX_PCMPEQBrr, X86::MMX_PCMPEQBrm, 0 }, + { X86::MMX_PCMPEQDrr, X86::MMX_PCMPEQDrm, 0 }, + { X86::MMX_PCMPEQWrr, X86::MMX_PCMPEQWrm, 0 }, + { X86::MMX_PCMPGTBrr, X86::MMX_PCMPGTBrm, 0 }, + { X86::MMX_PCMPGTDrr, 
X86::MMX_PCMPGTDrm, 0 }, + { X86::MMX_PCMPGTWrr, X86::MMX_PCMPGTWrm, 0 }, { X86::MMX_PHADDDrr, X86::MMX_PHADDDrm, 0 }, { X86::MMX_PHADDSWrr, X86::MMX_PHADDSWrm, 0 }, { X86::MMX_PHADDWrr, X86::MMX_PHADDWrm, 0 }, @@ -1370,18 +1370,18 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::MMX_PHSUBWrr, X86::MMX_PHSUBWrm, 0 }, { X86::MMX_PINSRWrr, X86::MMX_PINSRWrm, TB_NO_REVERSE }, { X86::MMX_PMADDUBSWrr, X86::MMX_PMADDUBSWrm, 0 }, - { X86::MMX_PMADDWDirr, X86::MMX_PMADDWDirm, 0 }, - { X86::MMX_PMAXSWirr, X86::MMX_PMAXSWirm, 0 }, - { X86::MMX_PMAXUBirr, X86::MMX_PMAXUBirm, 0 }, - { X86::MMX_PMINSWirr, X86::MMX_PMINSWirm, 0 }, - { X86::MMX_PMINUBirr, X86::MMX_PMINUBirm, 0 }, + { X86::MMX_PMADDWDrr, X86::MMX_PMADDWDrm, 0 }, + { X86::MMX_PMAXSWrr, X86::MMX_PMAXSWrm, 0 }, + { X86::MMX_PMAXUBrr, X86::MMX_PMAXUBrm, 0 }, + { X86::MMX_PMINSWrr, X86::MMX_PMINSWrm, 0 }, + { X86::MMX_PMINUBrr, X86::MMX_PMINUBrm, 0 }, { X86::MMX_PMULHRSWrr, X86::MMX_PMULHRSWrm, 0 }, - { X86::MMX_PMULHUWirr, X86::MMX_PMULHUWirm, 0 }, - { X86::MMX_PMULHWirr, X86::MMX_PMULHWirm, 0 }, - { X86::MMX_PMULLWirr, X86::MMX_PMULLWirm, 0 }, - { X86::MMX_PMULUDQirr, X86::MMX_PMULUDQirm, 0 }, - { X86::MMX_PORirr, X86::MMX_PORirm, 0 }, - { X86::MMX_PSADBWirr, X86::MMX_PSADBWirm, 0 }, + { X86::MMX_PMULHUWrr, X86::MMX_PMULHUWrm, 0 }, + { X86::MMX_PMULHWrr, X86::MMX_PMULHWrm, 0 }, + { X86::MMX_PMULLWrr, X86::MMX_PMULLWrm, 0 }, + { X86::MMX_PMULUDQrr, X86::MMX_PMULUDQrm, 0 }, + { X86::MMX_PORrr, X86::MMX_PORrm, 0 }, + { X86::MMX_PSADBWrr, X86::MMX_PSADBWrm, 0 }, { X86::MMX_PSHUFBrr, X86::MMX_PSHUFBrm, 0 }, { X86::MMX_PSIGNBrr, X86::MMX_PSIGNBrm, 0 }, { X86::MMX_PSIGNDrr, X86::MMX_PSIGNDrm, 0 }, @@ -1394,21 +1394,21 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::MMX_PSRLDrr, X86::MMX_PSRLDrm, 0 }, { X86::MMX_PSRLQrr, X86::MMX_PSRLQrm, 0 }, { X86::MMX_PSRLWrr, X86::MMX_PSRLWrm, 0 }, - { X86::MMX_PSUBBirr, X86::MMX_PSUBBirm, 0 }, - { X86::MMX_PSUBDirr, X86::MMX_PSUBDirm, 0 }, - { 
X86::MMX_PSUBQirr, X86::MMX_PSUBQirm, 0 }, - { X86::MMX_PSUBSBirr, X86::MMX_PSUBSBirm, 0 }, - { X86::MMX_PSUBSWirr, X86::MMX_PSUBSWirm, 0 }, - { X86::MMX_PSUBUSBirr, X86::MMX_PSUBUSBirm, 0 }, - { X86::MMX_PSUBUSWirr, X86::MMX_PSUBUSWirm, 0 }, - { X86::MMX_PSUBWirr, X86::MMX_PSUBWirm, 0 }, - { X86::MMX_PUNPCKHBWirr, X86::MMX_PUNPCKHBWirm, 0 }, - { X86::MMX_PUNPCKHDQirr, X86::MMX_PUNPCKHDQirm, 0 }, - { X86::MMX_PUNPCKHWDirr, X86::MMX_PUNPCKHWDirm, 0 }, - { X86::MMX_PUNPCKLBWirr, X86::MMX_PUNPCKLBWirm, TB_NO_REVERSE }, - { X86::MMX_PUNPCKLDQirr, X86::MMX_PUNPCKLDQirm, TB_NO_REVERSE }, - { X86::MMX_PUNPCKLWDirr, X86::MMX_PUNPCKLWDirm, TB_NO_REVERSE }, - { X86::MMX_PXORirr, X86::MMX_PXORirm, 0 }, + { X86::MMX_PSUBBrr, X86::MMX_PSUBBrm, 0 }, + { X86::MMX_PSUBDrr, X86::MMX_PSUBDrm, 0 }, + { X86::MMX_PSUBQrr, X86::MMX_PSUBQrm, 0 }, + { X86::MMX_PSUBSBrr, X86::MMX_PSUBSBrm, 0 }, + { X86::MMX_PSUBSWrr, X86::MMX_PSUBSWrm, 0 }, + { X86::MMX_PSUBUSBrr, X86::MMX_PSUBUSBrm, 0 }, + { X86::MMX_PSUBUSWrr, X86::MMX_PSUBUSWrm, 0 }, + { X86::MMX_PSUBWrr, X86::MMX_PSUBWrm, 0 }, + { X86::MMX_PUNPCKHBWrr, X86::MMX_PUNPCKHBWrm, 0 }, + { X86::MMX_PUNPCKHDQrr, X86::MMX_PUNPCKHDQrm, 0 }, + { X86::MMX_PUNPCKHWDrr, X86::MMX_PUNPCKHWDrm, 0 }, + { X86::MMX_PUNPCKLBWrr, X86::MMX_PUNPCKLBWrm, TB_NO_REVERSE }, + { X86::MMX_PUNPCKLDQrr, X86::MMX_PUNPCKLDQrm, TB_NO_REVERSE }, + { X86::MMX_PUNPCKLWDrr, X86::MMX_PUNPCKLWDrm, TB_NO_REVERSE }, + { X86::MMX_PXORrr, X86::MMX_PXORrm, 0 }, { X86::MOVLHPSrr, X86::MOVHPSrm, TB_NO_REVERSE }, { X86::MOVSDrr, X86::MOVLPDrm, TB_NO_REVERSE }, { X86::MPSADBWrri, X86::MPSADBWrmi, TB_ALIGN_16 }, diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index bb5637a31947..c379aa8d9258 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -4088,8 +4088,8 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, bool X86InstrInfo::isRedundantFlagInstr(const MachineInstr &FlagI, 
Register SrcReg, Register SrcReg2, int64_t ImmMask, int64_t ImmValue, - const MachineInstr &OI, bool *IsSwapped, - int64_t *ImmDelta) const { + const MachineInstr &OI, + bool *IsSwapped) const { switch (OI.getOpcode()) { case X86::CMP64rr: case X86::CMP32rr: @@ -4140,21 +4140,10 @@ bool X86InstrInfo::isRedundantFlagInstr(const MachineInstr &FlagI, int64_t OIMask; int64_t OIValue; if (analyzeCompare(OI, OISrcReg, OISrcReg2, OIMask, OIValue) && - SrcReg == OISrcReg && ImmMask == OIMask) { - if (OIValue == ImmValue) { - *ImmDelta = 0; - return true; - } else if (static_cast<uint64_t>(ImmValue) == - static_cast<uint64_t>(OIValue) - 1) { - *ImmDelta = -1; - return true; - } else if (static_cast<uint64_t>(ImmValue) == - static_cast<uint64_t>(OIValue) + 1) { - *ImmDelta = 1; - return true; - } else { - return false; - } + SrcReg == OISrcReg && ImmMask == OIMask && OIValue == ImmValue) { + assert(SrcReg2 == X86::NoRegister && OISrcReg2 == X86::NoRegister && + "should not have 2nd register"); + return true; } } return FlagI.isIdenticalTo(OI); @@ -4404,7 +4393,6 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, bool ShouldUpdateCC = false; bool IsSwapped = false; X86::CondCode NewCC = X86::COND_INVALID; - int64_t ImmDelta = 0; // Search backward from CmpInstr for the next instruction defining EFLAGS. const TargetRegisterInfo *TRI = &getRegisterInfo(); @@ -4451,7 +4439,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, // ... // EFLAGS not changed // cmp x, y // <-- can be removed if (isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpMask, CmpValue, - Inst, &IsSwapped, &ImmDelta)) { + Inst, &IsSwapped)) { Sub = &Inst; break; } @@ -4485,7 +4473,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, // It is safe to remove CmpInstr if EFLAGS is redefined or killed. // If we are done with the basic block, we need to check whether EFLAGS is // live-out. 
- bool FlagsMayLiveOut = true; + bool IsSafe = false; SmallVector<std::pair<MachineInstr*, X86::CondCode>, 4> OpsToUpdate; MachineBasicBlock::iterator AfterCmpInstr = std::next(MachineBasicBlock::iterator(CmpInstr)); @@ -4495,7 +4483,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, // We should check the usage if this instruction uses and updates EFLAGS. if (!UseEFLAGS && ModifyEFLAGS) { // It is safe to remove CmpInstr if EFLAGS is updated again. - FlagsMayLiveOut = false; + IsSafe = true; break; } if (!UseEFLAGS && !ModifyEFLAGS) @@ -4503,7 +4491,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, // EFLAGS is used by this instruction. X86::CondCode OldCC = X86::COND_INVALID; - if (MI || IsSwapped || ImmDelta != 0) { + if (MI || IsSwapped) { // We decode the condition code from opcode. if (Instr.isBranch()) OldCC = X86::getCondFromBranch(Instr); @@ -4555,60 +4543,11 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, // to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc. // We swap the condition code and synthesize the new opcode. ReplacementCC = getSwappedCondition(OldCC); - if (ReplacementCC == X86::COND_INVALID) return false; - ShouldUpdateCC = true; - } else if (ImmDelta != 0) { - unsigned BitWidth = TRI->getRegSizeInBits(*MRI->getRegClass(SrcReg)); - // Shift amount for min/max constants to adjust for 8/16/32 instruction - // sizes. 
- switch (OldCC) { - case X86::COND_L: // x <s (C + 1) --> x <=s C - if (ImmDelta != 1 || APInt::getSignedMinValue(BitWidth) == CmpValue) - return false; - ReplacementCC = X86::COND_LE; - break; - case X86::COND_B: // x <u (C + 1) --> x <=u C - if (ImmDelta != 1 || CmpValue == 0) - return false; - ReplacementCC = X86::COND_BE; - break; - case X86::COND_GE: // x >=s (C + 1) --> x >s C - if (ImmDelta != 1 || APInt::getSignedMinValue(BitWidth) == CmpValue) - return false; - ReplacementCC = X86::COND_G; - break; - case X86::COND_AE: // x >=u (C + 1) --> x >u C - if (ImmDelta != 1 || CmpValue == 0) - return false; - ReplacementCC = X86::COND_A; - break; - case X86::COND_G: // x >s (C - 1) --> x >=s C - if (ImmDelta != -1 || APInt::getSignedMaxValue(BitWidth) == CmpValue) - return false; - ReplacementCC = X86::COND_GE; - break; - case X86::COND_A: // x >u (C - 1) --> x >=u C - if (ImmDelta != -1 || APInt::getMaxValue(BitWidth) == CmpValue) - return false; - ReplacementCC = X86::COND_AE; - break; - case X86::COND_LE: // x <=s (C - 1) --> x <s C - if (ImmDelta != -1 || APInt::getSignedMaxValue(BitWidth) == CmpValue) - return false; - ReplacementCC = X86::COND_L; - break; - case X86::COND_BE: // x <=u (C - 1) --> x <u C - if (ImmDelta != -1 || APInt::getMaxValue(BitWidth) == CmpValue) - return false; - ReplacementCC = X86::COND_B; - break; - default: + if (ReplacementCC == X86::COND_INVALID) return false; - } - ShouldUpdateCC = true; } - if (ShouldUpdateCC && ReplacementCC != OldCC) { + if ((ShouldUpdateCC || IsSwapped) && ReplacementCC != OldCC) { // Push the MachineInstr to OpsToUpdate. // If it is safe to remove CmpInstr, the condition code of these // instructions will be modified. @@ -4616,14 +4555,14 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, } if (ModifyEFLAGS || Instr.killsRegister(X86::EFLAGS, TRI)) { // It is safe to remove CmpInstr if EFLAGS is updated again or killed. 
- FlagsMayLiveOut = false; + IsSafe = true; break; } } - // If we have to update users but EFLAGS is live-out abort, since we cannot - // easily find all of the users. - if (ShouldUpdateCC && FlagsMayLiveOut) { + // If EFLAGS is not killed nor re-defined, we should check whether it is + // live-out. If it is live-out, do not optimize. + if ((MI || IsSwapped) && !IsSafe) { for (MachineBasicBlock *Successor : CmpMBB.successors()) if (Successor->isLiveIn(X86::EFLAGS)) return false; @@ -4944,7 +4883,7 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case X86::SETB_C64r: return Expand2AddrUndef(MIB, get(X86::SBB64rr)); case X86::MMX_SET0: - return Expand2AddrUndef(MIB, get(X86::MMX_PXORirr)); + return Expand2AddrUndef(MIB, get(X86::MMX_PXORrr)); case X86::V_SET0: case X86::FsFLD0SS: case X86::FsFLD0SD: @@ -5217,12 +5156,12 @@ static bool hasUndefRegUpdate(unsigned Opcode, unsigned OpNum, bool ForLoadFold = false) { // Set the OpNum parameter to the first source operand. switch (Opcode) { - case X86::MMX_PUNPCKHBWirr: - case X86::MMX_PUNPCKHWDirr: - case X86::MMX_PUNPCKHDQirr: - case X86::MMX_PUNPCKLBWirr: - case X86::MMX_PUNPCKLWDirr: - case X86::MMX_PUNPCKLDQirr: + case X86::MMX_PUNPCKHBWrr: + case X86::MMX_PUNPCKHWDrr: + case X86::MMX_PUNPCKHDQrr: + case X86::MMX_PUNPCKLBWrr: + case X86::MMX_PUNPCKLWDrr: + case X86::MMX_PUNPCKLDQrr: case X86::MOVHLPSrr: case X86::PACKSSWBrr: case X86::PACKUSWBrr: diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h index 33ce55bbdb2b..537ada6222bf 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -643,8 +643,7 @@ private: /// CMP %1, %2 and %3 = SUB %2, %1 ; IsSwapped=true bool isRedundantFlagInstr(const MachineInstr &FlagI, Register SrcReg, Register SrcReg2, int64_t ImmMask, int64_t ImmValue, - const MachineInstr &OI, bool *IsSwapped, - int64_t *ImmDelta) const; + const MachineInstr &OI, bool *IsSwapped) const; }; } // namespace llvm diff 
--git a/llvm/lib/Target/X86/X86InstrMMX.td b/llvm/lib/Target/X86/X86InstrMMX.td index bb3e6df3bf3e..aeecc25ddea2 100644 --- a/llvm/lib/Target/X86/X86InstrMMX.td +++ b/llvm/lib/Target/X86/X86InstrMMX.td @@ -34,14 +34,14 @@ let Constraints = "$src1 = $dst" in { multiclass MMXI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId, X86FoldableSchedWrite sched, bit Commutable = 0, X86MemOperand OType = i64mem> { - def irr : MMXI<opc, MRMSrcReg, (outs VR64:$dst), + def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src1, VR64:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))]>, Sched<[sched]> { let isCommutable = Commutable; } - def irm : MMXI<opc, MRMSrcMem, (outs VR64:$dst), + def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst), (ins VR64:$src1, OType:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), [(set VR64:$dst, (IntId VR64:$src1, (load_mmx addr:$src2)))]>, @@ -123,25 +123,25 @@ multiclass ssse3_palign_mm<string asm, Intrinsic IntId, multiclass sse12_cvt_pint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag, string asm, X86FoldableSchedWrite sched, Domain d> { - def irr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, - [(set DstRC:$dst, (Int SrcRC:$src))], d>, - Sched<[sched]>; - def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, - [(set DstRC:$dst, (Int (ld_frag addr:$src)))], d>, - Sched<[sched.Folded]>; + def rr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, + [(set DstRC:$dst, (Int SrcRC:$src))], d>, + Sched<[sched]>; + def rm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, + [(set DstRC:$dst, (Int (ld_frag addr:$src)))], d>, + Sched<[sched.Folded]>; } multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag, string asm, Domain d> { - def irr : 
MMXPI<opc, MRMSrcReg, (outs DstRC:$dst), - (ins DstRC:$src1, SrcRC:$src2), asm, - [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))], d>, - Sched<[WriteCvtI2PS]>; - def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst), - (ins DstRC:$src1, x86memop:$src2), asm, - [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))], d>, - Sched<[WriteCvtI2PS.Folded]>; + def rr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst), + (ins DstRC:$src1, SrcRC:$src2), asm, + [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))], d>, + Sched<[WriteCvtI2PS]>; + def rm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst), + (ins DstRC:$src1, x86memop:$src2), asm, + [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))], d>, + Sched<[WriteCvtI2PS.Folded]>; } //===----------------------------------------------------------------------===// @@ -569,14 +569,14 @@ def : Pat<(x86mmx (bitconvert (f64 FR64:$src))), (MMX_MOVFR642Qrr FR64:$src)>; def : Pat<(x86mmx (MMX_X86movdq2q (bc_v2i64 (v4i32 (X86cvtp2Int (v4f32 VR128:$src)))))), - (MMX_CVTPS2PIirr VR128:$src)>; + (MMX_CVTPS2PIrr VR128:$src)>; def : Pat<(x86mmx (MMX_X86movdq2q (bc_v2i64 (v4i32 (X86cvttp2si (v4f32 VR128:$src)))))), - (MMX_CVTTPS2PIirr VR128:$src)>; + (MMX_CVTTPS2PIrr VR128:$src)>; def : Pat<(x86mmx (MMX_X86movdq2q (bc_v2i64 (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))), - (MMX_CVTPD2PIirr VR128:$src)>; + (MMX_CVTPD2PIrr VR128:$src)>; def : Pat<(x86mmx (MMX_X86movdq2q (bc_v2i64 (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))), - (MMX_CVTTPD2PIirr VR128:$src)>; + (MMX_CVTTPD2PIrr VR128:$src)>; } diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index c3cd634612a4..9044f10ec630 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -48,6 +48,7 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Instrumentation/AddressSanitizer.h" #include "llvm/Transforms/Instrumentation/AddressSanitizerCommon.h" +#include <string> using namespace llvm; @@ -1336,235 
+1337,29 @@ void X86AsmPrinter::LowerASAN_CHECK_MEMACCESS(const MachineInstr &MI) { return; } - unsigned Reg = MI.getOperand(0).getReg().id(); + const auto &Reg = MI.getOperand(0).getReg(); ASanAccessInfo AccessInfo(MI.getOperand(1).getImm()); - MCSymbol *&Sym = - AsanMemaccessSymbols[AsanMemaccessTuple(Reg, AccessInfo.Packed)]; - if (!Sym) { - std::string Name = AccessInfo.IsWrite ? "store" : "load"; - std::string SymName = "__asan_check_" + Name + - utostr(1ULL << AccessInfo.AccessSizeIndex) + "_rn" + - utostr(Reg); - Sym = OutContext.getOrCreateSymbol(SymName); - } - - EmitAndCountInstruction( - MCInstBuilder(X86::CALL64pcrel32) - .addExpr(MCSymbolRefExpr::create(Sym, OutContext))); -} - -void X86AsmPrinter::emitAsanMemaccessPartial(Module &M, unsigned Reg, - const ASanAccessInfo &AccessInfo, - MCSubtargetInfo &STI) { - assert(AccessInfo.AccessSizeIndex == 0 || AccessInfo.AccessSizeIndex == 1 || - AccessInfo.AccessSizeIndex == 2); - assert(Reg != X86::R8); - uint64_t ShadowBase; int MappingScale; bool OrShadowOffset; - getAddressSanitizerParams( - Triple(M.getTargetTriple()), M.getDataLayout().getPointerSizeInBits(), - AccessInfo.CompileKernel, &ShadowBase, &MappingScale, &OrShadowOffset); - - OutStreamer->emitInstruction( - MCInstBuilder(X86::MOV64rr).addReg(X86::R8).addReg(X86::NoRegister + Reg), - STI); - OutStreamer->emitInstruction(MCInstBuilder(X86::SHR64ri) - .addReg(X86::R8) - .addReg(X86::R8) - .addImm(MappingScale), - STI); - if (OrShadowOffset) { - OutStreamer->emitInstruction(MCInstBuilder(X86::OR64ri32) - .addReg(X86::R8) - .addReg(X86::R8) - .addImm(ShadowBase), - STI); - OutStreamer->emitInstruction(MCInstBuilder(X86::MOV8rm) - .addReg(X86::R8B) - .addReg(X86::R8) - .addImm(1) - .addReg(X86::NoRegister) - .addImm(0) - .addReg(X86::NoRegister), - STI); - OutStreamer->emitInstruction( - MCInstBuilder(X86::TEST8rr).addReg(X86::R8B).addReg(X86::R8B), STI); - } else { - OutStreamer->emitInstruction(MCInstBuilder(X86::MOVSX32rm8) - .addReg(X86::R8D) - 
.addReg(X86::R8) - .addImm(1) - .addReg(X86::NoRegister) - .addImm(ShadowBase) - .addReg(X86::NoRegister), - STI); - OutStreamer->emitInstruction( - MCInstBuilder(X86::TEST32rr).addReg(X86::R8D).addReg(X86::R8D), STI); - } - MCSymbol *AdditionalCheck = OutContext.createTempSymbol(); - OutStreamer->emitInstruction( - MCInstBuilder(X86::JCC_1) - .addExpr(MCSymbolRefExpr::create(AdditionalCheck, OutContext)) - .addImm(X86::COND_NE), - STI); - MCSymbol *ReturnSym = OutContext.createTempSymbol(); - OutStreamer->emitLabel(ReturnSym); - OutStreamer->emitInstruction(MCInstBuilder(getRetOpcode(*Subtarget)), STI); - - // Shadow byte is non-zero so we need to perform additional checks. - OutStreamer->emitLabel(AdditionalCheck); - OutStreamer->emitInstruction(MCInstBuilder(X86::PUSH64r).addReg(X86::RCX), - STI); - OutStreamer->emitInstruction(MCInstBuilder(X86::MOV64rr) - .addReg(X86::RCX) - .addReg(X86::NoRegister + Reg), - STI); - const size_t Granularity = 1ULL << MappingScale; - OutStreamer->emitInstruction(MCInstBuilder(X86::AND32ri8) - .addReg(X86::NoRegister) - .addReg(X86::ECX) - .addImm(Granularity - 1), - STI); - if (AccessInfo.AccessSizeIndex == 1) { - OutStreamer->emitInstruction(MCInstBuilder(X86::ADD32ri8) - .addReg(X86::NoRegister) - .addReg(X86::ECX) - .addImm(1), - STI); - } else if (AccessInfo.AccessSizeIndex == 2) { - OutStreamer->emitInstruction(MCInstBuilder(X86::ADD32ri8) - .addReg(X86::NoRegister) - .addReg(X86::ECX) - .addImm(3), - STI); - } - - OutStreamer->emitInstruction( - MCInstBuilder(X86::CMP32rr).addReg(X86::ECX).addReg(X86::R8D).addImm(1), - STI); - OutStreamer->emitInstruction(MCInstBuilder(X86::POP64r).addReg(X86::RCX), - STI); - OutStreamer->emitInstruction( - MCInstBuilder(X86::JCC_1) - .addExpr(MCSymbolRefExpr::create(ReturnSym, OutContext)) - .addImm(X86::COND_L), - STI); - - emitAsanReportError(M, Reg, AccessInfo, STI); -} - -void X86AsmPrinter::emitAsanMemaccessFull(Module &M, unsigned Reg, - const ASanAccessInfo &AccessInfo, - 
MCSubtargetInfo &STI) { - assert(AccessInfo.AccessSizeIndex == 3 || AccessInfo.AccessSizeIndex == 4); - assert(Reg != X86::R8); - - uint64_t ShadowBase; - int MappingScale; - bool OrShadowOffset; - getAddressSanitizerParams( - Triple(M.getTargetTriple()), M.getDataLayout().getPointerSizeInBits(), - AccessInfo.CompileKernel, &ShadowBase, &MappingScale, &OrShadowOffset); - - OutStreamer->emitInstruction( - MCInstBuilder(X86::MOV64rr).addReg(X86::R8).addReg(X86::NoRegister + Reg), - STI); - OutStreamer->emitInstruction(MCInstBuilder(X86::SHR64ri) - .addReg(X86::R8) - .addReg(X86::R8) - .addImm(MappingScale), - STI); - if (OrShadowOffset) { - OutStreamer->emitInstruction(MCInstBuilder(X86::OR64ri32) - .addReg(X86::R8) - .addReg(X86::R8) - .addImm(ShadowBase), - STI); - auto OpCode = AccessInfo.AccessSizeIndex == 3 ? X86::CMP8mi : X86::CMP16mi8; - OutStreamer->emitInstruction(MCInstBuilder(OpCode) - .addReg(X86::R8) - .addImm(1) - .addReg(X86::NoRegister) - .addImm(0) - .addReg(X86::NoRegister) - .addImm(0), - STI); - } else { - auto OpCode = AccessInfo.AccessSizeIndex == 3 ? 
X86::CMP8mi : X86::CMP16mi8; - OutStreamer->emitInstruction(MCInstBuilder(OpCode) - .addReg(X86::R8) - .addImm(1) - .addReg(X86::NoRegister) - .addImm(ShadowBase) - .addReg(X86::NoRegister) - .addImm(0), - STI); - } - MCSymbol *ReportCode = OutContext.createTempSymbol(); - OutStreamer->emitInstruction( - MCInstBuilder(X86::JCC_1) - .addExpr(MCSymbolRefExpr::create(ReportCode, OutContext)) - .addImm(X86::COND_NE), - STI); - MCSymbol *ReturnSym = OutContext.createTempSymbol(); - OutStreamer->emitLabel(ReturnSym); - OutStreamer->emitInstruction(MCInstBuilder(getRetOpcode(*Subtarget)), STI); - - OutStreamer->emitLabel(ReportCode); - emitAsanReportError(M, Reg, AccessInfo, STI); -} + getAddressSanitizerParams(Triple(TM.getTargetTriple()), 64, + AccessInfo.CompileKernel, &ShadowBase, + &MappingScale, &OrShadowOffset); -void X86AsmPrinter::emitAsanReportError(Module &M, unsigned Reg, - const ASanAccessInfo &AccessInfo, - MCSubtargetInfo &STI) { std::string Name = AccessInfo.IsWrite ? "store" : "load"; - MCSymbol *ReportError = OutContext.getOrCreateSymbol( - "__asan_report_" + Name + utostr(1ULL << AccessInfo.AccessSizeIndex)); - OutStreamer->emitInstruction(MCInstBuilder(X86::MOV64rr) - .addReg(X86::RDI) - .addReg(X86::NoRegister + Reg), - STI); - OutStreamer->emitInstruction( - MCInstBuilder(X86::JMP_4) - .addExpr(MCSymbolRefExpr::create(ReportError, MCSymbolRefExpr::VK_PLT, - OutContext)), - STI); -} - -void X86AsmPrinter::emitAsanMemaccessSymbols(Module &M) { - if (AsanMemaccessSymbols.empty()) - return; - - const Triple &TT = TM.getTargetTriple(); - assert(TT.isOSBinFormatELF()); - std::unique_ptr<MCSubtargetInfo> STI( - TM.getTarget().createMCSubtargetInfo(TT.str(), "", "")); - assert(STI && "Unable to create subtarget info"); - - for (auto &P : AsanMemaccessSymbols) { - MCSymbol *Sym = P.second; - OutStreamer->SwitchSection(OutContext.getELFSection( - ".text.hot", ELF::SHT_PROGBITS, - ELF::SHF_EXECINSTR | ELF::SHF_ALLOC | ELF::SHF_GROUP, 0, Sym->getName(), - 
/*IsComdat=*/true)); - - OutStreamer->emitSymbolAttribute(Sym, MCSA_ELF_TypeFunction); - OutStreamer->emitSymbolAttribute(Sym, MCSA_Weak); - OutStreamer->emitSymbolAttribute(Sym, MCSA_Hidden); - OutStreamer->emitLabel(Sym); + std::string Op = OrShadowOffset ? "or" : "add"; + std::string SymName = "__asan_check_" + Name + "_" + Op + "_" + + utostr(1ULL << AccessInfo.AccessSizeIndex) + "_" + + TM.getMCRegisterInfo()->getName(Reg.asMCReg()); + if (OrShadowOffset) + report_fatal_error( + "OrShadowOffset is not supported with optimized callbacks"); - unsigned Reg = std::get<0>(P.first); - ASanAccessInfo AccessInfo(std::get<1>(P.first)); - - if (AccessInfo.AccessSizeIndex < 3) { - emitAsanMemaccessPartial(M, Reg, AccessInfo, *STI); - } else { - emitAsanMemaccessFull(M, Reg, AccessInfo, *STI); - } - } + EmitAndCountInstruction( + MCInstBuilder(X86::CALL64pcrel32) + .addExpr(MCSymbolRefExpr::create( + OutContext.getOrCreateSymbol(SymName), OutContext))); } void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI, @@ -2615,6 +2410,15 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) { const X86RegisterInfo *RI = MF->getSubtarget<X86Subtarget>().getRegisterInfo(); + if (MI->getOpcode() == X86::OR64rm) { + for (auto &Opd : MI->operands()) { + if (Opd.isSymbol() && StringRef(Opd.getSymbolName()) == + "swift_async_extendedFramePointerFlags") { + ShouldEmitWeakSwiftAsyncExtendedFramePointerFlags = true; + } + } + } + // Add a comment about EVEX-2-VEX compression for AVX-512 instrs that // are compressed from EVEX encoding to VEX encoding. 
if (TM.Options.MCOptions.ShowMCEncoding) { diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td index d835f452b67e..1b704bcb8e08 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.td +++ b/llvm/lib/Target/X86/X86RegisterInfo.td @@ -430,11 +430,11 @@ def GR64 : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, RBX, R14, R15, R12, R13, RBP, RSP, RIP)>; -// GR64 - 64-bit GPRs without R8 and RIP. Could be used when emitting code for -// intrinsics, which use implict input registers. -def GR64NoR8 : RegisterClass<"X86", [i64], 64, - (add RAX, RCX, RDX, RSI, RDI, R9, R10, R11, - RBX, R14, R15, R12, R13, RBP, RSP)>; +// GR64PLTSafe - 64-bit GPRs without R10, R11, RSP and RIP. Could be used when +// emitting code for intrinsics, which use implict input registers. +def GR64PLTSafe : RegisterClass<"X86", [i64], 64, + (add RAX, RCX, RDX, RSI, RDI, R8, R9, + RBX, R14, R15, R12, R13, RBP)>; // Segment registers for use by MOV instructions (and others) that have a // segment register as one operand. 
Always contain a 16-bit segment diff --git a/llvm/lib/Target/X86/X86SchedBroadwell.td b/llvm/lib/Target/X86/X86SchedBroadwell.td index 2827981b7fb0..a6ff472aac6f 100644 --- a/llvm/lib/Target/X86/X86SchedBroadwell.td +++ b/llvm/lib/Target/X86/X86SchedBroadwell.td @@ -783,7 +783,7 @@ def BWWriteResGroup27 : SchedWriteRes<[BWPort1]> { let NumMicroOps = 1; let ResourceCycles = [1]; } -def: InstRW<[BWWriteResGroup27], (instrs MMX_CVTPI2PSirr)>; +def: InstRW<[BWWriteResGroup27], (instrs MMX_CVTPI2PSrr)>; def: InstRW<[BWWriteResGroup27], (instregex "P(DEP|EXT)(32|64)rr", "(V?)CVTDQ2PS(Y?)rr")>; @@ -800,9 +800,9 @@ def BWWriteResGroup33 : SchedWriteRes<[BWPort5,BWPort0156]> { let NumMicroOps = 3; let ResourceCycles = [2,1]; } -def: InstRW<[BWWriteResGroup33], (instrs MMX_PACKSSDWirr, - MMX_PACKSSWBirr, - MMX_PACKUSWBirr)>; +def: InstRW<[BWWriteResGroup33], (instrs MMX_PACKSSDWrr, + MMX_PACKSSWBrr, + MMX_PACKUSWBrr)>; def BWWriteResGroup34 : SchedWriteRes<[BWPort6,BWPort0156]> { let Latency = 3; @@ -862,9 +862,9 @@ def BWWriteResGroup42 : SchedWriteRes<[BWPort1,BWPort5]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[BWWriteResGroup42], (instrs MMX_CVTPI2PDirr)>; -def: InstRW<[BWWriteResGroup42], (instregex "MMX_CVT(T?)PD2PIirr", - "MMX_CVT(T?)PS2PIirr", +def: InstRW<[BWWriteResGroup42], (instrs MMX_CVTPI2PDrr)>; +def: InstRW<[BWWriteResGroup42], (instregex "MMX_CVT(T?)PD2PIrr", + "MMX_CVT(T?)PS2PIrr", "(V?)CVTDQ2PDrr", "(V?)CVTPD2PSrr", "(V?)CVTSD2SSrr", @@ -1086,9 +1086,9 @@ def BWWriteResGroup79 : SchedWriteRes<[BWPort5,BWPort23]> { let NumMicroOps = 3; let ResourceCycles = [2,1]; } -def: InstRW<[BWWriteResGroup79], (instrs MMX_PACKSSDWirm, - MMX_PACKSSWBirm, - MMX_PACKUSWBirm)>; +def: InstRW<[BWWriteResGroup79], (instrs MMX_PACKSSDWrm, + MMX_PACKSSWBrm, + MMX_PACKUSWBrm)>; def BWWriteResGroup80 : SchedWriteRes<[BWPort23,BWPort0156]> { let Latency = 7; @@ -1155,7 +1155,7 @@ def BWWriteResGroup91 : SchedWriteRes<[BWPort1,BWPort23]> { let NumMicroOps = 
2; let ResourceCycles = [1,1]; } -def: InstRW<[BWWriteResGroup91], (instrs MMX_CVTPI2PSirm, +def: InstRW<[BWWriteResGroup91], (instrs MMX_CVTPI2PSrm, CVTDQ2PSrm, VCVTDQ2PSrm)>; def: InstRW<[BWWriteResGroup91], (instregex "P(DEP|EXT)(32|64)rm")>; @@ -1236,8 +1236,8 @@ def BWWriteResGroup107 : SchedWriteRes<[BWPort1,BWPort5,BWPort23]> { def: InstRW<[BWWriteResGroup107], (instrs CVTPD2PSrm, CVTPD2DQrm, CVTTPD2DQrm, - MMX_CVTPI2PDirm)>; -def: InstRW<[BWWriteResGroup107], (instregex "MMX_CVT(T?)PD2PIirm", + MMX_CVTPI2PDrm)>; +def: InstRW<[BWWriteResGroup107], (instregex "MMX_CVT(T?)PD2PIrm", "(V?)CVTDQ2PDrm", "(V?)CVTSD2SSrm")>; diff --git a/llvm/lib/Target/X86/X86SchedHaswell.td b/llvm/lib/Target/X86/X86SchedHaswell.td index 68961d6245ab..371a9571ae39 100644 --- a/llvm/lib/Target/X86/X86SchedHaswell.td +++ b/llvm/lib/Target/X86/X86SchedHaswell.td @@ -995,7 +995,7 @@ def HWWriteResGroup12 : SchedWriteRes<[HWPort1,HWPort23]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[HWWriteResGroup12], (instrs MMX_CVTPI2PSirm)>; +def: InstRW<[HWWriteResGroup12], (instrs MMX_CVTPI2PSrm)>; def: InstRW<[HWWriteResGroup12], (instregex "P(DEP|EXT)(32|64)rm")>; def HWWriteResGroup13 : SchedWriteRes<[HWPort5,HWPort23]> { @@ -1164,9 +1164,9 @@ def HWWriteResGroup36_2 : SchedWriteRes<[HWPort5,HWPort23]> { let NumMicroOps = 3; let ResourceCycles = [2,1]; } -def: InstRW<[HWWriteResGroup36_2], (instrs MMX_PACKSSDWirm, - MMX_PACKSSWBirm, - MMX_PACKUSWBirm)>; +def: InstRW<[HWWriteResGroup36_2], (instrs MMX_PACKSSDWrm, + MMX_PACKSSWBrm, + MMX_PACKUSWBrm)>; def HWWriteResGroup37 : SchedWriteRes<[HWPort23,HWPort0156]> { let Latency = 7; @@ -1240,7 +1240,7 @@ def HWWriteResGroup50 : SchedWriteRes<[HWPort1]> { let NumMicroOps = 1; let ResourceCycles = [1]; } -def: InstRW<[HWWriteResGroup50], (instrs MMX_CVTPI2PSirr)>; +def: InstRW<[HWWriteResGroup50], (instrs MMX_CVTPI2PSrr)>; def: InstRW<[HWWriteResGroup50], (instregex "P(DEP|EXT)(32|64)rr", "(V?)CVTDQ2PS(Y?)rr")>; @@ -1285,9 
+1285,9 @@ def HWWriteResGroup57 : SchedWriteRes<[HWPort5,HWPort0156]> { let NumMicroOps = 3; let ResourceCycles = [2,1]; } -def: InstRW<[HWWriteResGroup57], (instrs MMX_PACKSSDWirr, - MMX_PACKSSWBirr, - MMX_PACKUSWBirr)>; +def: InstRW<[HWWriteResGroup57], (instrs MMX_PACKSSDWrr, + MMX_PACKSSWBrr, + MMX_PACKUSWBrr)>; def HWWriteResGroup58 : SchedWriteRes<[HWPort6,HWPort0156]> { let Latency = 3; @@ -1373,11 +1373,11 @@ def HWWriteResGroup73 : SchedWriteRes<[HWPort1,HWPort5]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[HWWriteResGroup73], (instrs MMX_CVTPI2PDirr, - MMX_CVTPD2PIirr, - MMX_CVTPS2PIirr, - MMX_CVTTPD2PIirr, - MMX_CVTTPS2PIirr)>; +def: InstRW<[HWWriteResGroup73], (instrs MMX_CVTPI2PDrr, + MMX_CVTPD2PIrr, + MMX_CVTPS2PIrr, + MMX_CVTTPD2PIrr, + MMX_CVTTPS2PIrr)>; def: InstRW<[HWWriteResGroup73], (instregex "(V?)CVTDQ2PDrr", "(V?)CVTPD2PSrr", "(V?)CVTSD2SSrr", @@ -1418,8 +1418,8 @@ def HWWriteResGroup78 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> { def: InstRW<[HWWriteResGroup78], (instrs CVTPD2PSrm, CVTPD2DQrm, CVTTPD2DQrm, - MMX_CVTPD2PIirm, - MMX_CVTTPD2PIirm, + MMX_CVTPD2PIrm, + MMX_CVTTPD2PIrm, CVTDQ2PDrm, VCVTDQ2PDrm)>; @@ -1428,7 +1428,7 @@ def HWWriteResGroup78_1 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> { let NumMicroOps = 3; let ResourceCycles = [1,1,1]; } -def: InstRW<[HWWriteResGroup78_1], (instrs MMX_CVTPI2PDirm, +def: InstRW<[HWWriteResGroup78_1], (instrs MMX_CVTPI2PDrm, CVTSD2SSrm, CVTSD2SSrm_Int, VCVTSD2SSrm, VCVTSD2SSrm_Int)>; diff --git a/llvm/lib/Target/X86/X86SchedIceLake.td b/llvm/lib/Target/X86/X86SchedIceLake.td index 889b9b7fa666..789de9eb5751 100644 --- a/llvm/lib/Target/X86/X86SchedIceLake.td +++ b/llvm/lib/Target/X86/X86SchedIceLake.td @@ -331,12 +331,12 @@ defm : ICXWriteResPair<WriteFLogicZ, [ICXPort05], 1, [1], 1, 7>; defm : ICXWriteResPair<WriteFTest, [ICXPort0], 2, [1], 1, 6>; // Floating point TEST instructions. 
defm : ICXWriteResPair<WriteFTestY, [ICXPort0], 2, [1], 1, 7>; defm : ICXWriteResPair<WriteFTestZ, [ICXPort0], 2, [1], 1, 7>; -defm : ICXWriteResPair<WriteFShuffle, [ICXPort5], 1, [1], 1, 6>; // Floating point vector shuffles. -defm : ICXWriteResPair<WriteFShuffleY, [ICXPort5], 1, [1], 1, 7>; -defm : ICXWriteResPair<WriteFShuffleZ, [ICXPort5], 1, [1], 1, 7>; -defm : ICXWriteResPair<WriteFVarShuffle, [ICXPort5], 1, [1], 1, 6>; // Floating point vector variable shuffles. -defm : ICXWriteResPair<WriteFVarShuffleY, [ICXPort5], 1, [1], 1, 7>; -defm : ICXWriteResPair<WriteFVarShuffleZ, [ICXPort5], 1, [1], 1, 7>; +defm : ICXWriteResPair<WriteFShuffle, [ICXPort15], 1, [1], 1, 6>; // Floating point vector shuffles. +defm : ICXWriteResPair<WriteFShuffleY, [ICXPort15], 1, [1], 1, 7>; +defm : ICXWriteResPair<WriteFShuffleZ, [ICXPort5], 1, [1], 1, 7>; +defm : ICXWriteResPair<WriteFVarShuffle, [ICXPort15], 1, [1], 1, 6>; // Floating point vector variable shuffles. +defm : ICXWriteResPair<WriteFVarShuffleY, [ICXPort15], 1, [1], 1, 7>; +defm : ICXWriteResPair<WriteFVarShuffleZ, [ICXPort5], 1, [1], 1, 7>; defm : ICXWriteResPair<WriteFBlend, [ICXPort015], 1, [1], 1, 6>; // Floating point vector blends. defm : ICXWriteResPair<WriteFBlendY,[ICXPort015], 1, [1], 1, 7>; defm : ICXWriteResPair<WriteFBlendZ,[ICXPort015], 1, [1], 1, 7>; @@ -388,14 +388,14 @@ defm : ICXWriteResPair<WriteVecIMulZ, [ICXPort05], 5, [1], 1, 7>; defm : ICXWriteResPair<WritePMULLD, [ICXPort01], 10, [2], 2, 6>; // Vector PMULLD. defm : ICXWriteResPair<WritePMULLDY, [ICXPort01], 10, [2], 2, 7>; defm : ICXWriteResPair<WritePMULLDZ, [ICXPort05], 10, [2], 2, 7>; -defm : ICXWriteResPair<WriteShuffle, [ICXPort5], 1, [1], 1, 5>; // Vector shuffles. 
-defm : ICXWriteResPair<WriteShuffleX, [ICXPort5], 1, [1], 1, 6>; -defm : ICXWriteResPair<WriteShuffleY, [ICXPort5], 1, [1], 1, 7>; -defm : ICXWriteResPair<WriteShuffleZ, [ICXPort5], 1, [1], 1, 7>; -defm : ICXWriteResPair<WriteVarShuffle, [ICXPort5], 1, [1], 1, 5>; // Vector variable shuffles. -defm : ICXWriteResPair<WriteVarShuffleX, [ICXPort5], 1, [1], 1, 6>; -defm : ICXWriteResPair<WriteVarShuffleY, [ICXPort5], 1, [1], 1, 7>; -defm : ICXWriteResPair<WriteVarShuffleZ, [ICXPort5], 1, [1], 1, 7>; +defm : ICXWriteResPair<WriteShuffle, [ICXPort5], 1, [1], 1, 5>; // Vector shuffles. +defm : ICXWriteResPair<WriteShuffleX, [ICXPort15], 1, [1], 1, 6>; +defm : ICXWriteResPair<WriteShuffleY, [ICXPort15], 1, [1], 1, 7>; +defm : ICXWriteResPair<WriteShuffleZ, [ICXPort5], 1, [1], 1, 7>; +defm : ICXWriteResPair<WriteVarShuffle, [ICXPort5], 1, [1], 1, 5>; // Vector variable shuffles. +defm : ICXWriteResPair<WriteVarShuffleX, [ICXPort15], 1, [1], 1, 6>; +defm : ICXWriteResPair<WriteVarShuffleY, [ICXPort15], 1, [1], 1, 7>; +defm : ICXWriteResPair<WriteVarShuffleZ, [ICXPort5], 1, [1], 1, 7>; defm : ICXWriteResPair<WriteBlend, [ICXPort5], 1, [1], 1, 6>; // Vector blends. 
defm : ICXWriteResPair<WriteBlendY,[ICXPort5], 1, [1], 1, 7>; defm : ICXWriteResPair<WriteBlendZ,[ICXPort5], 1, [1], 1, 7>; @@ -642,15 +642,15 @@ def: InstRW<[ICXWriteResGroup1], (instregex "KAND(B|D|Q|W)rr", "KXOR(B|D|Q|W)rr", "KSET0(B|D|Q|W)", // Same as KXOR "KSET1(B|D|Q|W)", // Same as KXNOR - "MMX_PADDS(B|W)irr", - "MMX_PADDUS(B|W)irr", - "MMX_PAVG(B|W)irr", - "MMX_PCMPEQ(B|D|W)irr", - "MMX_PCMPGT(B|D|W)irr", - "MMX_P(MAX|MIN)SWirr", - "MMX_P(MAX|MIN)UBirr", - "MMX_PSUBS(B|W)irr", - "MMX_PSUBUS(B|W)irr", + "MMX_PADDS(B|W)rr", + "MMX_PADDUS(B|W)rr", + "MMX_PAVG(B|W)rr", + "MMX_PCMPEQ(B|D|W)rr", + "MMX_PCMPGT(B|D|W)rr", + "MMX_P(MAX|MIN)SWrr", + "MMX_P(MAX|MIN)UBrr", + "MMX_PSUBS(B|W)rr", + "MMX_PSUBUS(B|W)rr", "VPMOVB2M(Z|Z128|Z256)rr", "VPMOVD2M(Z|Z128|Z256)rr", "VPMOVQ2M(Z|Z128|Z256)rr", @@ -663,7 +663,16 @@ def ICXWriteResGroup3 : SchedWriteRes<[ICXPort5]> { } def: InstRW<[ICXWriteResGroup3], (instregex "COM(P?)_FST0r", "KMOV(B|D|Q|W)kr", - "UCOM_F(P?)r")>; + "UCOM_F(P?)r", + "VPBROADCAST(D|Q)rr", + "(V?)INSERTPS(Z?)rr", + "(V?)MOV(HL|LH)PS(Z?)rr", + "(V?)MOVDDUP(Y|Z|Z128|Z256)?rr", + "(V?)PALIGNR(Y|Z|Z128|Z256)?rri", + "(V?)PERMIL(PD|PS)(Y|Z|Z128|Z256)?ri", + "(V?)PERMIL(PD|PS)(Y|Z|Z128|Z256)?rr", + "(V?)PACK(U|S)S(DW|WB)(Y|Z|Z128|Z256)?rr", + "(V?)UNPCK(L|H)(PD|PS)(Y|Z|Z128|Z256)?rr")>; def ICXWriteResGroup4 : SchedWriteRes<[ICXPort6]> { let Latency = 1; @@ -702,6 +711,7 @@ def: InstRW<[ICXWriteResGroup9], (instregex "VBLENDMPD(Z128|Z256)rr", "VBLENDMPS(Z128|Z256)rr", "VPADD(B|D|Q|W)(Y|Z|Z128|Z256)rr", "(V?)PADD(B|D|Q|W)rr", + "(V?)MOV(SD|SS)(Z?)rr", "VPBLENDD(Y?)rri", "VPBLENDMB(Z128|Z256)rr", "VPBLENDMD(Z128|Z256)rr", @@ -892,9 +902,9 @@ def ICXWriteResGroup41 : SchedWriteRes<[ICXPort5,ICXPort0156]> { let NumMicroOps = 3; let ResourceCycles = [2,1]; } -def: InstRW<[ICXWriteResGroup41], (instrs MMX_PACKSSDWirr, - MMX_PACKSSWBirr, - MMX_PACKUSWBirr)>; +def: InstRW<[ICXWriteResGroup41], (instrs MMX_PACKSSDWrr, + MMX_PACKSSWBrr, + MMX_PACKUSWBrr)>; def 
ICXWriteResGroup42 : SchedWriteRes<[ICXPort6,ICXPort0156]> { let Latency = 3; @@ -1055,8 +1065,8 @@ def ICXWriteResGroup61 : SchedWriteRes<[ICXPort5,ICXPort015]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[ICXWriteResGroup61], (instregex "MMX_CVT(T?)PD2PIirr", - "MMX_CVT(T?)PS2PIirr", +def: InstRW<[ICXWriteResGroup61], (instregex "MMX_CVT(T?)PD2PIrr", + "MMX_CVT(T?)PS2PIrr", "VCVTDQ2PDZ128rr", "VCVTPD2DQZ128rr", "(V?)CVT(T?)PD2DQrr", @@ -1162,7 +1172,7 @@ def ICXWriteResGroup72 : SchedWriteRes<[ICXPort5]> { let NumMicroOps = 2; let ResourceCycles = [2]; } -def: InstRW<[ICXWriteResGroup72], (instrs MMX_CVTPI2PSirr)>; +def: InstRW<[ICXWriteResGroup72], (instrs MMX_CVTPI2PSrr)>; def: InstRW<[ICXWriteResGroup72], (instregex "VCOMPRESSPD(Z|Z128|Z256)rr", "VCOMPRESSPS(Z|Z128|Z256)rr", "VPCOMPRESSD(Z|Z128|Z256)rr", @@ -1174,26 +1184,26 @@ def ICXWriteResGroup73 : SchedWriteRes<[ICXPort0,ICXPort23]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[ICXWriteResGroup73], (instrs MMX_PADDSBirm, - MMX_PADDSWirm, - MMX_PADDUSBirm, - MMX_PADDUSWirm, - MMX_PAVGBirm, - MMX_PAVGWirm, - MMX_PCMPEQBirm, - MMX_PCMPEQDirm, - MMX_PCMPEQWirm, - MMX_PCMPGTBirm, - MMX_PCMPGTDirm, - MMX_PCMPGTWirm, - MMX_PMAXSWirm, - MMX_PMAXUBirm, - MMX_PMINSWirm, - MMX_PMINUBirm, - MMX_PSUBSBirm, - MMX_PSUBSWirm, - MMX_PSUBUSBirm, - MMX_PSUBUSWirm)>; +def: InstRW<[ICXWriteResGroup73], (instrs MMX_PADDSBrm, + MMX_PADDSWrm, + MMX_PADDUSBrm, + MMX_PADDUSWrm, + MMX_PAVGBrm, + MMX_PAVGWrm, + MMX_PCMPEQBrm, + MMX_PCMPEQDrm, + MMX_PCMPEQWrm, + MMX_PCMPGTBrm, + MMX_PCMPGTDrm, + MMX_PCMPGTWrm, + MMX_PMAXSWrm, + MMX_PMAXUBrm, + MMX_PMINSWrm, + MMX_PMINUBrm, + MMX_PSUBSBrm, + MMX_PSUBSWrm, + MMX_PSUBUSBrm, + MMX_PSUBUSWrm)>; def ICXWriteResGroup76 : SchedWriteRes<[ICXPort6,ICXPort23]> { let Latency = 6; @@ -1295,20 +1305,14 @@ def ICXWriteResGroup92 : SchedWriteRes<[ICXPort5,ICXPort23]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[ICXWriteResGroup92], 
(instregex "VMOVSDZrm(b?)", - "VMOVSSZrm(b?)")>; - -def ICXWriteResGroup92a : SchedWriteRes<[ICXPort5,ICXPort23]> { - let Latency = 6; - let NumMicroOps = 2; - let ResourceCycles = [1,1]; -} -def: InstRW<[ICXWriteResGroup92a], (instregex "(V?)PMOV(SX|ZX)BDrm", - "(V?)PMOV(SX|ZX)BQrm", - "(V?)PMOV(SX|ZX)BWrm", - "(V?)PMOV(SX|ZX)DQrm", - "(V?)PMOV(SX|ZX)WDrm", - "(V?)PMOV(SX|ZX)WQrm")>; +def: InstRW<[ICXWriteResGroup92], (instregex "VMOV(SD|SS)Zrm(b?)", + "VPBROADCAST(B|W)(Z128)?rm", + "(V?)INSERTPS(Z?)rm", + "(V?)PALIGNR(Z128)?rmi", + "(V?)PERMIL(PD|PS)(Z128)?m(b?)i", + "(V?)PERMIL(PD|PS)(Z128)?rm", + "(V?)PACK(U|S)S(DW|WB)(Z128)?rm", + "(V?)UNPCK(L|H)(PD|PS)(Z128)?rm")>; def ICXWriteResGroup93 : SchedWriteRes<[ICXPort5,ICXPort015]> { let Latency = 7; @@ -1391,9 +1395,9 @@ def ICXWriteResGroup96 : SchedWriteRes<[ICXPort5,ICXPort23]> { let NumMicroOps = 3; let ResourceCycles = [2,1]; } -def: InstRW<[ICXWriteResGroup96], (instrs MMX_PACKSSDWirm, - MMX_PACKSSWBirm, - MMX_PACKUSWBirm)>; +def: InstRW<[ICXWriteResGroup96], (instrs MMX_PACKSSDWrm, + MMX_PACKSSWBrm, + MMX_PACKUSWBrm)>; def ICXWriteResGroup97 : SchedWriteRes<[ICXPort5,ICXPort015]> { let Latency = 7; @@ -1546,7 +1550,12 @@ def ICXWriteResGroup119 : SchedWriteRes<[ICXPort5,ICXPort23]> { } def: InstRW<[ICXWriteResGroup119], (instregex "FCOM(P?)(32|64)m", "VPBROADCASTB(Z|Z256)rm(b?)", - "VPBROADCASTW(Z|Z256)rm(b?)")>; + "VPBROADCASTW(Z|Z256)rm(b?)", + "(V?)PALIGNR(Y|Z|Z256)rmi", + "(V?)PERMIL(PD|PS)(Y|Z|Z256)m(b?)i", + "(V?)PERMIL(PD|PS)(Y|Z|Z256)rm", + "(V?)PACK(U|S)S(DW|WB)(Y|Z|Z256)rm", + "(V?)UNPCK(L|H)(PD|PS)(Y|Z|Z256)rm")>; def: InstRW<[ICXWriteResGroup119], (instrs VPBROADCASTBYrm, VPBROADCASTWYrm, VPMOVSXBDYrm, @@ -1683,7 +1692,7 @@ def ICXWriteResGroup135 : SchedWriteRes<[ICXPort0,ICXPort23]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[ICXWriteResGroup135], (instrs MMX_CVTPI2PSirm)>; +def: InstRW<[ICXWriteResGroup135], (instrs MMX_CVTPI2PSrm)>; def ICXWriteResGroup136 : 
SchedWriteRes<[ICXPort5,ICXPort23]> { let Latency = 9; @@ -1709,19 +1718,7 @@ def: InstRW<[ICXWriteResGroup136], (instregex "VALIGN(D|Q)Z128rm(b?)i", "VPMAXSQZ128rm(b?)", "VPMAXUQZ128rm(b?)", "VPMINSQZ128rm(b?)", - "VPMINUQZ128rm(b?)", - "VPMOVSXBDZ128rm(b?)", - "VPMOVSXBQZ128rm(b?)", - "VPMOVSXBWZ128rm(b?)", - "VPMOVSXDQZ128rm(b?)", - "VPMOVSXWDZ128rm(b?)", - "VPMOVSXWQZ128rm(b?)", - "VPMOVZXBDZ128rm(b?)", - "VPMOVZXBQZ128rm(b?)", - "VPMOVZXBWZ128rm(b?)", - "VPMOVZXDQZ128rm(b?)", - "VPMOVZXWDZ128rm(b?)", - "VPMOVZXWQZ128rm(b?)")>; + "VPMINUQZ128rm(b?)")>; def ICXWriteResGroup136_2 : SchedWriteRes<[ICXPort5,ICXPort23]> { let Latency = 10; @@ -1753,7 +1750,7 @@ def ICXWriteResGroup137 : SchedWriteRes<[ICXPort23,ICXPort015]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[ICXWriteResGroup137], (instregex "MMX_CVT(T?)PS2PIirm", +def: InstRW<[ICXWriteResGroup137], (instregex "MMX_CVT(T?)PS2PIrm", "(V?)CVTPS2PDrm")>; def ICXWriteResGroup143 : SchedWriteRes<[ICXPort5,ICXPort01,ICXPort23]> { @@ -1950,8 +1947,8 @@ def ICXWriteResGroup166 : SchedWriteRes<[ICXPort5,ICXPort23,ICXPort015]> { def: InstRW<[ICXWriteResGroup166], (instrs CVTPD2PSrm, CVTPD2DQrm, CVTTPD2DQrm, - MMX_CVTPD2PIirm, - MMX_CVTTPD2PIirm)>; + MMX_CVTPD2PIrm, + MMX_CVTTPD2PIrm)>; def ICXWriteResGroup167 : SchedWriteRes<[ICXPort5,ICXPort23,ICXPort015]> { let Latency = 11; diff --git a/llvm/lib/Target/X86/X86SchedSandyBridge.td b/llvm/lib/Target/X86/X86SchedSandyBridge.td index c8d7b0f72c1c..af5c0540deb5 100644 --- a/llvm/lib/Target/X86/X86SchedSandyBridge.td +++ b/llvm/lib/Target/X86/X86SchedSandyBridge.td @@ -623,7 +623,7 @@ def SBWriteResGroup5 : SchedWriteRes<[SBPort15]> { def: InstRW<[SBWriteResGroup5], (instrs MMX_PABSBrr, MMX_PABSDrr, MMX_PABSWrr, - MMX_PADDQirr, + MMX_PADDQrr, MMX_PALIGNRrri, MMX_PSIGNBrr, MMX_PSIGNDrr, @@ -870,7 +870,7 @@ def SBWriteResGroup59 : SchedWriteRes<[SBPort23,SBPort15]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[SBWriteResGroup59], 
(instrs MMX_PADDQirm)>; +def: InstRW<[SBWriteResGroup59], (instrs MMX_PADDQrm)>; def SBWriteResGroup62 : SchedWriteRes<[SBPort5,SBPort23]> { let Latency = 7; diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td index 7d3229c3b023..b3c13c72dd01 100644 --- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td @@ -624,15 +624,15 @@ def SKLWriteResGroup1 : SchedWriteRes<[SKLPort0]> { let NumMicroOps = 1; let ResourceCycles = [1]; } -def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PADDS(B|W)irr", - "MMX_PADDUS(B|W)irr", - "MMX_PAVG(B|W)irr", - "MMX_PCMPEQ(B|D|W)irr", - "MMX_PCMPGT(B|D|W)irr", - "MMX_P(MAX|MIN)SWirr", - "MMX_P(MAX|MIN)UBirr", - "MMX_PSUBS(B|W)irr", - "MMX_PSUBUS(B|W)irr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PADDS(B|W)rr", + "MMX_PADDUS(B|W)rr", + "MMX_PAVG(B|W)rr", + "MMX_PCMPEQ(B|D|W)rr", + "MMX_PCMPGT(B|D|W)rr", + "MMX_P(MAX|MIN)SWrr", + "MMX_P(MAX|MIN)UBrr", + "MMX_PSUBS(B|W)rr", + "MMX_PSUBUS(B|W)rr")>; def SKLWriteResGroup3 : SchedWriteRes<[SKLPort5]> { let Latency = 1; @@ -815,9 +815,9 @@ def SKLWriteResGroup39 : SchedWriteRes<[SKLPort5,SKLPort0156]> { let NumMicroOps = 3; let ResourceCycles = [2,1]; } -def: InstRW<[SKLWriteResGroup39], (instrs MMX_PACKSSDWirr, - MMX_PACKSSWBirr, - MMX_PACKUSWBirr)>; +def: InstRW<[SKLWriteResGroup39], (instrs MMX_PACKSSDWrr, + MMX_PACKSSWBrr, + MMX_PACKUSWBrr)>; def SKLWriteResGroup40 : SchedWriteRes<[SKLPort6,SKLPort0156]> { let Latency = 3; @@ -927,7 +927,7 @@ def SKLWriteResGroup59 : SchedWriteRes<[SKLPort0,SKLPort5]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[SKLWriteResGroup59], (instrs MMX_CVTPI2PDirr, +def: InstRW<[SKLWriteResGroup59], (instrs MMX_CVTPI2PDrr, CVTDQ2PDrr, VCVTDQ2PDrr)>; @@ -936,8 +936,8 @@ def SKLWriteResGroup60 : SchedWriteRes<[SKLPort5,SKLPort015]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[SKLWriteResGroup60], (instregex 
"MMX_CVT(T?)PD2PIirr", - "MMX_CVT(T?)PS2PIirr", +def: InstRW<[SKLWriteResGroup60], (instregex "MMX_CVT(T?)PD2PIrr", + "MMX_CVT(T?)PS2PIrr", "(V?)CVT(T?)PD2DQrr", "(V?)CVTPD2PSrr", "(V?)CVTPS2PDrr", @@ -984,33 +984,33 @@ def SKLWriteResGroup68 : SchedWriteRes<[SKLPort0]> { let NumMicroOps = 2; let ResourceCycles = [2]; } -def: InstRW<[SKLWriteResGroup68], (instrs MMX_CVTPI2PSirr)>; +def: InstRW<[SKLWriteResGroup68], (instrs MMX_CVTPI2PSrr)>; def SKLWriteResGroup69 : SchedWriteRes<[SKLPort0,SKLPort23]> { let Latency = 6; let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[SKLWriteResGroup69], (instrs MMX_PADDSBirm, - MMX_PADDSWirm, - MMX_PADDUSBirm, - MMX_PADDUSWirm, - MMX_PAVGBirm, - MMX_PAVGWirm, - MMX_PCMPEQBirm, - MMX_PCMPEQDirm, - MMX_PCMPEQWirm, - MMX_PCMPGTBirm, - MMX_PCMPGTDirm, - MMX_PCMPGTWirm, - MMX_PMAXSWirm, - MMX_PMAXUBirm, - MMX_PMINSWirm, - MMX_PMINUBirm, - MMX_PSUBSBirm, - MMX_PSUBSWirm, - MMX_PSUBUSBirm, - MMX_PSUBUSWirm)>; +def: InstRW<[SKLWriteResGroup69], (instrs MMX_PADDSBrm, + MMX_PADDSWrm, + MMX_PADDUSBrm, + MMX_PADDUSWrm, + MMX_PAVGBrm, + MMX_PAVGWrm, + MMX_PCMPEQBrm, + MMX_PCMPEQDrm, + MMX_PCMPEQWrm, + MMX_PCMPGTBrm, + MMX_PCMPGTDrm, + MMX_PCMPGTWrm, + MMX_PMAXSWrm, + MMX_PMAXUBrm, + MMX_PMINSWrm, + MMX_PMINUBrm, + MMX_PSUBSBrm, + MMX_PSUBSWrm, + MMX_PSUBUSBrm, + MMX_PSUBUSWrm)>; def SKLWriteResGroup70 : SchedWriteRes<[SKLPort0,SKLPort01]> { let Latency = 6; @@ -1144,9 +1144,9 @@ def SKLWriteResGroup92 : SchedWriteRes<[SKLPort5,SKLPort23]> { let NumMicroOps = 3; let ResourceCycles = [2,1]; } -def: InstRW<[SKLWriteResGroup92], (instrs MMX_PACKSSDWirm, - MMX_PACKSSWBirm, - MMX_PACKUSWBirm)>; +def: InstRW<[SKLWriteResGroup92], (instrs MMX_PACKSSDWrm, + MMX_PACKSSWBrm, + MMX_PACKUSWBrm)>; def SKLWriteResGroup94 : SchedWriteRes<[SKLPort23,SKLPort0156]> { let Latency = 7; @@ -1283,7 +1283,7 @@ def SKLWriteResGroup120 : SchedWriteRes<[SKLPort0,SKLPort23]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: 
InstRW<[SKLWriteResGroup120], (instrs MMX_CVTPI2PSirm)>; +def: InstRW<[SKLWriteResGroup120], (instrs MMX_CVTPI2PSrm)>; def SKLWriteResGroup121 : SchedWriteRes<[SKLPort5,SKLPort23]> { let Latency = 9; @@ -1302,7 +1302,7 @@ def SKLWriteResGroup123 : SchedWriteRes<[SKLPort23,SKLPort01]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[SKLWriteResGroup123], (instregex "MMX_CVT(T?)PS2PIirm", +def: InstRW<[SKLWriteResGroup123], (instregex "MMX_CVT(T?)PS2PIrm", "(V?)CVTPS2PDrm")>; def SKLWriteResGroup128 : SchedWriteRes<[SKLPort5,SKLPort01,SKLPort23]> { @@ -1345,7 +1345,7 @@ def SKLWriteResGroup138 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> { let NumMicroOps = 3; let ResourceCycles = [1,1,1]; } -def: InstRW<[SKLWriteResGroup138], (instrs MMX_CVTPI2PDirm)>; +def: InstRW<[SKLWriteResGroup138], (instrs MMX_CVTPI2PDrm)>; def SKLWriteResGroup139 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort01]> { let Latency = 10; @@ -1425,8 +1425,8 @@ def SKLWriteResGroup152 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort01]> { def: InstRW<[SKLWriteResGroup152], (instrs CVTPD2PSrm, CVTPD2DQrm, CVTTPD2DQrm, - MMX_CVTPD2PIirm, - MMX_CVTTPD2PIirm)>; + MMX_CVTPD2PIrm, + MMX_CVTTPD2PIrm)>; def SKLWriteResGroup154 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort0156]> { let Latency = 11; diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td index 1d8417aef41e..74f9da158353 100644 --- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td @@ -634,15 +634,15 @@ def: InstRW<[SKXWriteResGroup1], (instregex "KAND(B|D|Q|W)rr", "KXOR(B|D|Q|W)rr", "KSET0(B|D|Q|W)", // Same as KXOR "KSET1(B|D|Q|W)", // Same as KXNOR - "MMX_PADDS(B|W)irr", - "MMX_PADDUS(B|W)irr", - "MMX_PAVG(B|W)irr", - "MMX_PCMPEQ(B|D|W)irr", - "MMX_PCMPGT(B|D|W)irr", - "MMX_P(MAX|MIN)SWirr", - "MMX_P(MAX|MIN)UBirr", - "MMX_PSUBS(B|W)irr", - "MMX_PSUBUS(B|W)irr", + "MMX_PADDS(B|W)rr", + "MMX_PADDUS(B|W)rr", + "MMX_PAVG(B|W)rr", + 
"MMX_PCMPEQ(B|D|W)rr", + "MMX_PCMPGT(B|D|W)rr", + "MMX_P(MAX|MIN)SWrr", + "MMX_P(MAX|MIN)UBrr", + "MMX_PSUBS(B|W)rr", + "MMX_PSUBUS(B|W)rr", "VPMOVB2M(Z|Z128|Z256)rr", "VPMOVD2M(Z|Z128|Z256)rr", "VPMOVQ2M(Z|Z128|Z256)rr", @@ -884,9 +884,9 @@ def SKXWriteResGroup41 : SchedWriteRes<[SKXPort5,SKXPort0156]> { let NumMicroOps = 3; let ResourceCycles = [2,1]; } -def: InstRW<[SKXWriteResGroup41], (instrs MMX_PACKSSDWirr, - MMX_PACKSSWBirr, - MMX_PACKUSWBirr)>; +def: InstRW<[SKXWriteResGroup41], (instrs MMX_PACKSSDWrr, + MMX_PACKSSWBrr, + MMX_PACKUSWBrr)>; def SKXWriteResGroup42 : SchedWriteRes<[SKXPort6,SKXPort0156]> { let Latency = 3; @@ -1047,8 +1047,8 @@ def SKXWriteResGroup61 : SchedWriteRes<[SKXPort5,SKXPort015]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[SKXWriteResGroup61], (instregex "MMX_CVT(T?)PD2PIirr", - "MMX_CVT(T?)PS2PIirr", +def: InstRW<[SKXWriteResGroup61], (instregex "MMX_CVT(T?)PD2PIrr", + "MMX_CVT(T?)PS2PIrr", "VCVTDQ2PDZ128rr", "VCVTPD2DQZ128rr", "(V?)CVT(T?)PD2DQrr", @@ -1154,7 +1154,7 @@ def SKXWriteResGroup72 : SchedWriteRes<[SKXPort5]> { let NumMicroOps = 2; let ResourceCycles = [2]; } -def: InstRW<[SKXWriteResGroup72], (instrs MMX_CVTPI2PSirr)>; +def: InstRW<[SKXWriteResGroup72], (instrs MMX_CVTPI2PSrr)>; def: InstRW<[SKXWriteResGroup72], (instregex "VCOMPRESSPD(Z|Z128|Z256)rr", "VCOMPRESSPS(Z|Z128|Z256)rr", "VPCOMPRESSD(Z|Z128|Z256)rr", @@ -1166,26 +1166,26 @@ def SKXWriteResGroup73 : SchedWriteRes<[SKXPort0,SKXPort23]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[SKXWriteResGroup73], (instrs MMX_PADDSBirm, - MMX_PADDSWirm, - MMX_PADDUSBirm, - MMX_PADDUSWirm, - MMX_PAVGBirm, - MMX_PAVGWirm, - MMX_PCMPEQBirm, - MMX_PCMPEQDirm, - MMX_PCMPEQWirm, - MMX_PCMPGTBirm, - MMX_PCMPGTDirm, - MMX_PCMPGTWirm, - MMX_PMAXSWirm, - MMX_PMAXUBirm, - MMX_PMINSWirm, - MMX_PMINUBirm, - MMX_PSUBSBirm, - MMX_PSUBSWirm, - MMX_PSUBUSBirm, - MMX_PSUBUSWirm)>; +def: InstRW<[SKXWriteResGroup73], (instrs MMX_PADDSBrm, + 
MMX_PADDSWrm, + MMX_PADDUSBrm, + MMX_PADDUSWrm, + MMX_PAVGBrm, + MMX_PAVGWrm, + MMX_PCMPEQBrm, + MMX_PCMPEQDrm, + MMX_PCMPEQWrm, + MMX_PCMPGTBrm, + MMX_PCMPGTDrm, + MMX_PCMPGTWrm, + MMX_PMAXSWrm, + MMX_PMAXUBrm, + MMX_PMINSWrm, + MMX_PMINUBrm, + MMX_PSUBSBrm, + MMX_PSUBSWrm, + MMX_PSUBUSBrm, + MMX_PSUBUSWrm)>; def SKXWriteResGroup76 : SchedWriteRes<[SKXPort6,SKXPort23]> { let Latency = 6; @@ -1383,9 +1383,9 @@ def SKXWriteResGroup96 : SchedWriteRes<[SKXPort5,SKXPort23]> { let NumMicroOps = 3; let ResourceCycles = [2,1]; } -def: InstRW<[SKXWriteResGroup96], (instrs MMX_PACKSSDWirm, - MMX_PACKSSWBirm, - MMX_PACKUSWBirm)>; +def: InstRW<[SKXWriteResGroup96], (instrs MMX_PACKSSDWrm, + MMX_PACKSSWBrm, + MMX_PACKUSWBrm)>; def SKXWriteResGroup97 : SchedWriteRes<[SKXPort5,SKXPort015]> { let Latency = 7; @@ -1675,7 +1675,7 @@ def SKXWriteResGroup135 : SchedWriteRes<[SKXPort0,SKXPort23]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[SKXWriteResGroup135], (instrs MMX_CVTPI2PSirm)>; +def: InstRW<[SKXWriteResGroup135], (instrs MMX_CVTPI2PSrm)>; def SKXWriteResGroup136 : SchedWriteRes<[SKXPort5,SKXPort23]> { let Latency = 9; @@ -1701,19 +1701,7 @@ def: InstRW<[SKXWriteResGroup136], (instregex "VALIGN(D|Q)Z128rm(b?)i", "VPMAXSQZ128rm(b?)", "VPMAXUQZ128rm(b?)", "VPMINSQZ128rm(b?)", - "VPMINUQZ128rm(b?)", - "VPMOVSXBDZ128rm(b?)", - "VPMOVSXBQZ128rm(b?)", - "VPMOVSXBWZ128rm(b?)", - "VPMOVSXDQZ128rm(b?)", - "VPMOVSXWDZ128rm(b?)", - "VPMOVSXWQZ128rm(b?)", - "VPMOVZXBDZ128rm(b?)", - "VPMOVZXBQZ128rm(b?)", - "VPMOVZXBWZ128rm(b?)", - "VPMOVZXDQZ128rm(b?)", - "VPMOVZXWDZ128rm(b?)", - "VPMOVZXWQZ128rm(b?)")>; + "VPMINUQZ128rm(b?)")>; def SKXWriteResGroup136_2 : SchedWriteRes<[SKXPort5,SKXPort23]> { let Latency = 10; @@ -1745,7 +1733,7 @@ def SKXWriteResGroup137 : SchedWriteRes<[SKXPort23,SKXPort015]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[SKXWriteResGroup137], (instregex "MMX_CVT(T?)PS2PIirm", +def: InstRW<[SKXWriteResGroup137], (instregex 
"MMX_CVT(T?)PS2PIrm", "(V?)CVTPS2PDrm")>; def SKXWriteResGroup143 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort23]> { @@ -1942,8 +1930,8 @@ def SKXWriteResGroup166 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> { def: InstRW<[SKXWriteResGroup166], (instrs CVTPD2PSrm, CVTPD2DQrm, CVTTPD2DQrm, - MMX_CVTPD2PIirm, - MMX_CVTTPD2PIirm)>; + MMX_CVTPD2PIrm, + MMX_CVTTPD2PIrm)>; def SKXWriteResGroup167 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> { let Latency = 11; diff --git a/llvm/lib/Target/X86/X86ScheduleAtom.td b/llvm/lib/Target/X86/X86ScheduleAtom.td index 6fd98280f560..0fedfc01092c 100644 --- a/llvm/lib/Target/X86/X86ScheduleAtom.td +++ b/llvm/lib/Target/X86/X86ScheduleAtom.td @@ -320,30 +320,30 @@ defm : X86WriteResPairUnsupported<WriteFVarShuffle256>; // Conversions. //////////////////////////////////////////////////////////////////////////////// -defm : AtomWriteResPair<WriteCvtSS2I, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 8, 9, [7,7], [6,6]>; -defm : AtomWriteResPair<WriteCvtPS2I, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 6, 7, [5,5], [6,6]>; +defm : AtomWriteResPair<WriteCvtSS2I, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 8, 9, [8,8], [9,9], 3, 4>; +defm : AtomWriteResPair<WriteCvtPS2I, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 6, 7, [6,6], [7,7], 3, 4>; defm : X86WriteResPairUnsupported<WriteCvtPS2IY>; defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>; -defm : AtomWriteResPair<WriteCvtSD2I, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 8, 9, [7,7], [6,6]>; -defm : AtomWriteResPair<WriteCvtPD2I, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 7, 8, [6,6], [7,7]>; +defm : AtomWriteResPair<WriteCvtSD2I, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 8, 9, [8,8],[10,10], 3, 4>; +defm : AtomWriteResPair<WriteCvtPD2I, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 7, 8, [7,7], [8,8], 4, 5>; defm : X86WriteResPairUnsupported<WriteCvtPD2IY>; defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>; -defm : AtomWriteResPair<WriteCvtI2SS, [AtomPort0,AtomPort1], 
[AtomPort0,AtomPort1], 6, 7, [5,5], [6,6]>; -defm : AtomWriteResPair<WriteCvtI2PS, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 6, 7, [5,5], [6,6]>; +defm : AtomWriteResPair<WriteCvtI2SS, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 6, 7, [6,6], [6,6], 3, 1>; +defm : AtomWriteResPair<WriteCvtI2PS, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 6, 7, [6,6], [7,7], 3, 4>; defm : X86WriteResPairUnsupported<WriteCvtI2PSY>; defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>; -defm : AtomWriteResPair<WriteCvtI2SD, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 6, 7, [5,5], [6,6]>; -defm : AtomWriteResPair<WriteCvtI2PD, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 7, 8, [6,6], [7,7]>; +defm : AtomWriteResPair<WriteCvtI2SD, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 6, 7, [6,6], [7,7], 3, 3>; +defm : AtomWriteResPair<WriteCvtI2PD, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 7, 8, [6,6], [7,7], 3, 4>; defm : X86WriteResPairUnsupported<WriteCvtI2PDY>; defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>; -defm : AtomWriteResPair<WriteCvtSS2SD, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 6, 7, [5,5], [6,6]>; -defm : AtomWriteResPair<WriteCvtPS2PD, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 7, 8, [6,6], [7,7]>; +defm : AtomWriteResPair<WriteCvtSS2SD, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 6, 7, [6,6], [7,7], 3, 4>; +defm : AtomWriteResPair<WriteCvtPS2PD, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 7, 8, [6,6], [7,7], 4, 5>; defm : X86WriteResPairUnsupported<WriteCvtPS2PDY>; defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>; -defm : AtomWriteResPair<WriteCvtSD2SS, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 6, 7, [5,5], [6,6]>; -defm : AtomWriteResPair<WriteCvtPD2PS, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 7, 8, [6,6], [7,7]>; +defm : AtomWriteResPair<WriteCvtSD2SS, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 10, 11,[10,10],[12,12], 3, 4>; +defm : AtomWriteResPair<WriteCvtPD2PS, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 11, 12,[11,11],[12,12], 4, 5>; 
defm : X86WriteResPairUnsupported<WriteCvtPD2PSY>; defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>; @@ -525,8 +525,8 @@ def AtomWrite1_5 : SchedWriteRes<[AtomPort1]> { let Latency = 5; let ResourceCycles = [5]; } -def : InstRW<[AtomWrite1_5], (instrs MMX_CVTPI2PSirr, MMX_CVTPI2PSirm, - MMX_CVTPS2PIirr, MMX_CVTTPS2PIirr)>; +def : InstRW<[AtomWrite1_5], (instrs MMX_CVTPI2PSrr, MMX_CVTPI2PSrm, + MMX_CVTPS2PIrr, MMX_CVTTPS2PIrr)>; // Port0 and Port1 def AtomWrite0_1_1 : SchedWriteRes<[AtomPort0, AtomPort1]> { @@ -547,9 +547,43 @@ def AtomWrite0_1_5 : SchedWriteRes<[AtomPort0, AtomPort1]> { let Latency = 5; let ResourceCycles = [5, 5]; } -def : InstRW<[AtomWrite0_1_5], (instrs MMX_CVTPS2PIirm, MMX_CVTTPS2PIirm)>; +def : InstRW<[AtomWrite0_1_5], (instrs MMX_CVTPS2PIrm, MMX_CVTTPS2PIrm)>; def : InstRW<[AtomWrite0_1_5], (instregex "ILD_F(16|32|64)")>; +def AtomWrite0_1_7 : SchedWriteRes<[AtomPort0,AtomPort1]> { + let Latency = 7; + let ResourceCycles = [6,6]; +} +def : InstRW<[AtomWrite0_1_7], (instregex "CVTSI642SDrm(_Int)?")>; + +def AtomWrite0_1_7_4 : SchedWriteRes<[AtomPort0,AtomPort1]> { + let Latency = 7; + let ResourceCycles = [8,8]; + let NumMicroOps = 4; +} +def : InstRW<[AtomWrite0_1_7_4], (instregex "CVTSI642SSrr(_Int)?")>; + +def AtomWrite0_1_8_4 : SchedWriteRes<[AtomPort0,AtomPort1]> { + let Latency = 8; + let ResourceCycles = [8,8]; + let NumMicroOps = 4; +} +def : InstRW<[AtomWrite0_1_7_4], (instregex "CVTSI642SSrm(_Int)?")>; + +def AtomWrite0_1_9 : SchedWriteRes<[AtomPort0,AtomPort1]> { + let Latency = 9; + let ResourceCycles = [9,9]; + let NumMicroOps = 4; +} +def : InstRW<[AtomWrite0_1_9], (instregex "CVT(T)?SS2SI64rr(_Int)?")>; + +def AtomWrite0_1_10 : SchedWriteRes<[AtomPort0,AtomPort1]> { + let Latency = 10; + let ResourceCycles = [11,11]; + let NumMicroOps = 5; +} +def : InstRW<[AtomWrite0_1_10], (instregex "CVT(T)?SS2SI64rm(_Int)?")>; + // Port0 or Port1 def AtomWrite01_1 : SchedWriteRes<[AtomPort01]> { let Latency = 1; @@ -570,7 +604,7 @@ def : 
InstRW<[AtomWrite01_2], (instrs LEAVE, LEAVE64, POP16r, SCASB, SCASL, SCASQ, SCASW)>; def : InstRW<[AtomWrite01_2], (instregex "PUSH(CS|DS|ES|FS|GS|SS)(16|32|64)", "(ST|ISTT)_F(P)?(16|32|64)?(m|rr)", - "MMX_P(ADD|SUB)Qirr", + "MMX_P(ADD|SUB)Qrr", "MOV(S|Z)X16rr8", "MOV(UPS|UPD|DQU)mr", "MASKMOVDQU(64)?", @@ -589,7 +623,7 @@ def : InstRW<[AtomWrite01_3], (instregex "XADD(8|16|32|64)rm", "XCHG(8|16|32|64)rm", "PH(ADD|SUB)Drr", "MOV(S|Z)X16rm8", - "MMX_P(ADD|SUB)Qirm", + "MMX_P(ADD|SUB)Qrm", "MOV(UPS|UPD|DQU)rm", "P(ADD|SUB)Qrm")>; @@ -647,15 +681,13 @@ def : InstRW<[AtomWrite01_9], (instrs POPA16, POPA32, SHLD64mri8, SHRD64mri8, SHLD64rri8, SHRD64rri8, CMPXCHG8rr)>; -def : InstRW<[AtomWrite01_9], (instregex "(U)?COM_FI", "TST_F", - "CVT(T)?SS2SI64rr(_Int)?")>; +def : InstRW<[AtomWrite01_9], (instregex "(U)?COM_FI", "TST_F")>; def AtomWrite01_10 : SchedWriteRes<[AtomPort01]> { let Latency = 10; let ResourceCycles = [10]; } def : SchedAlias<WriteFLDC, AtomWrite01_10>; -def : InstRW<[AtomWrite01_10], (instregex "CVT(T)?SS2SI64rm(_Int)?")>; def AtomWrite01_11 : SchedWriteRes<[AtomPort01]> { let Latency = 11; diff --git a/llvm/lib/Target/X86/X86ScheduleBdVer2.td b/llvm/lib/Target/X86/X86ScheduleBdVer2.td index 4c16b5b52b1d..0f6f24f9f1fe 100644 --- a/llvm/lib/Target/X86/X86ScheduleBdVer2.td +++ b/llvm/lib/Target/X86/X86ScheduleBdVer2.td @@ -1008,11 +1008,11 @@ defm : PdWriteResXMMPair<WriteCvtPD2I, [PdFPU0, PdFPCVT, PdFPSTO], 8, defm : PdWriteResYMMPair<WriteCvtPD2IY, [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA], 8, [1, 2, 1, 1], 4>; defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>; -def PdWriteMMX_CVTTPD2PIirr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> { +def PdWriteMMX_CVTTPD2PIrr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> { let Latency = 6; let NumMicroOps = 2; } -def : InstRW<[PdWriteMMX_CVTTPD2PIirr], (instrs MMX_CVTTPD2PIirr)>; +def : InstRW<[PdWriteMMX_CVTTPD2PIrr], (instrs MMX_CVTTPD2PIrr)>; // FIXME: f+3 ST, LD+STC latency defm : PdWriteResXMMPair<WriteCvtI2SS, [PdFPU0, 
PdFPCVT, PdFPSTO], 4, [], 2>; @@ -1048,18 +1048,18 @@ defm : PdWriteResXMMPair<WriteCvtPD2PS, [PdFPU0, PdFPCVT, PdFPSTO], 8, defm : PdWriteResYMMPair<WriteCvtPD2PSY, [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA], 8, [1, 2, 1, 1], 4>; defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>; -def PdWriteMMX_CVTPD2PIirrMMX_CVTPI2PDirr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> { +def PdWriteMMX_CVTPD2PIrrMMX_CVTPI2PDrr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> { let Latency = 6; let NumMicroOps = 2; } -def : InstRW<[PdWriteMMX_CVTPD2PIirrMMX_CVTPI2PDirr], (instrs MMX_CVTPD2PIirr, - MMX_CVTPI2PDirr)>; +def : InstRW<[PdWriteMMX_CVTPD2PIrrMMX_CVTPI2PDrr], (instrs MMX_CVTPD2PIrr, + MMX_CVTPI2PDrr)>; -def PdWriteMMX_CVTPI2PSirr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> { +def PdWriteMMX_CVTPI2PSrr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> { let Latency = 4; let NumMicroOps = 2; } -def : InstRW<[PdWriteMMX_CVTPI2PSirr], (instrs MMX_CVTPI2PSirr)>; +def : InstRW<[PdWriteMMX_CVTPI2PSrr], (instrs MMX_CVTPI2PSrr)>; defm : PdWriteResXMMPair<WriteCvtPH2PS, [PdFPU0, PdFPCVT, PdFPSTO], 8, [1, 2, 1], 2, 1>; defm : PdWriteResYMMPair<WriteCvtPH2PSY, [PdFPU0, PdFPCVT, PdFPSTO], 8, [1, 2, 1], 4, 3>; @@ -1365,7 +1365,7 @@ def PdWriteVZeroIdiomLogic : SchedWriteVariant<[ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>, SchedVar<MCSchedPredicate<TruePred>, [WriteVecLogic]> ]>; -def : InstRW<[PdWriteVZeroIdiomLogic], (instrs MMX_PXORirr, MMX_PANDNirr)>; +def : InstRW<[PdWriteVZeroIdiomLogic], (instrs MMX_PXORrr, MMX_PANDNrr)>; def PdWriteVZeroIdiomLogicX : SchedWriteVariant<[ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>, @@ -1378,11 +1378,11 @@ def PdWriteVZeroIdiomALU : SchedWriteVariant<[ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>, SchedVar<MCSchedPredicate<TruePred>, [WriteVecALU]> ]>; -def : InstRW<[PdWriteVZeroIdiomALU], (instrs MMX_PSUBBirr, MMX_PSUBDirr, - MMX_PSUBQirr, MMX_PSUBWirr, - MMX_PCMPGTBirr, - MMX_PCMPGTDirr, - 
MMX_PCMPGTWirr)>; +def : InstRW<[PdWriteVZeroIdiomALU], (instrs MMX_PSUBBrr, MMX_PSUBDrr, + MMX_PSUBQrr, MMX_PSUBWrr, + MMX_PCMPGTBrr, + MMX_PCMPGTDrr, + MMX_PCMPGTWrr)>; def PdWriteVZeroIdiomALUX : SchedWriteVariant<[ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>, @@ -1408,10 +1408,10 @@ def : IsZeroIdiomFunction<[ // MMX Zero-idioms. DepBreakingClass<[ - MMX_PXORirr, MMX_PANDNirr, MMX_PSUBBirr, - MMX_PSUBDirr, MMX_PSUBQirr, MMX_PSUBWirr, - MMX_PSUBSBirr, MMX_PSUBSWirr, MMX_PSUBUSBirr, MMX_PSUBUSWirr, - MMX_PCMPGTBirr, MMX_PCMPGTDirr, MMX_PCMPGTWirr + MMX_PXORrr, MMX_PANDNrr, MMX_PSUBBrr, + MMX_PSUBDrr, MMX_PSUBQrr, MMX_PSUBWrr, + MMX_PSUBSBrr, MMX_PSUBSWrr, MMX_PSUBUSBrr, MMX_PSUBUSWrr, + MMX_PCMPGTBrr, MMX_PCMPGTDrr, MMX_PCMPGTWrr ], ZeroIdiomPredicate>, // SSE Zero-idioms. @@ -1449,7 +1449,7 @@ def : IsDepBreakingFunction<[ // MMX DepBreakingClass<[ - MMX_PCMPEQBirr, MMX_PCMPEQDirr, MMX_PCMPEQWirr + MMX_PCMPEQBrr, MMX_PCMPEQDrr, MMX_PCMPEQWrr ], ZeroIdiomPredicate>, // SSE diff --git a/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/llvm/lib/Target/X86/X86ScheduleBtVer2.td index 68ebaa244acf..a070da34cab5 100644 --- a/llvm/lib/Target/X86/X86ScheduleBtVer2.td +++ b/llvm/lib/Target/X86/X86ScheduleBtVer2.td @@ -888,7 +888,7 @@ def JWriteVZeroIdiomLogic : SchedWriteVariant<[ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>, SchedVar<NoSchedPred, [WriteVecLogic]> ]>; -def : InstRW<[JWriteVZeroIdiomLogic], (instrs MMX_PXORirr, MMX_PANDNirr)>; +def : InstRW<[JWriteVZeroIdiomLogic], (instrs MMX_PXORrr, MMX_PANDNrr)>; def JWriteVZeroIdiomLogicX : SchedWriteVariant<[ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>, @@ -901,12 +901,12 @@ def JWriteVZeroIdiomALU : SchedWriteVariant<[ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>, SchedVar<NoSchedPred, [WriteVecALU]> ]>; -def : InstRW<[JWriteVZeroIdiomALU], (instrs MMX_PSUBBirr, MMX_PSUBDirr, - MMX_PSUBQirr, MMX_PSUBWirr, - MMX_PSUBSBirr, 
MMX_PSUBSWirr, - MMX_PSUBUSBirr, MMX_PSUBUSWirr, - MMX_PCMPGTBirr, MMX_PCMPGTDirr, - MMX_PCMPGTWirr)>; +def : InstRW<[JWriteVZeroIdiomALU], (instrs MMX_PSUBBrr, MMX_PSUBDrr, + MMX_PSUBQrr, MMX_PSUBWrr, + MMX_PSUBSBrr, MMX_PSUBSWrr, + MMX_PSUBUSBrr, MMX_PSUBUSWrr, + MMX_PCMPGTBrr, MMX_PCMPGTDrr, + MMX_PCMPGTWrr)>; def JWriteVZeroIdiomALUX : SchedWriteVariant<[ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>, @@ -974,10 +974,10 @@ def : IsZeroIdiomFunction<[ // MMX Zero-idioms. DepBreakingClass<[ - MMX_PXORirr, MMX_PANDNirr, MMX_PSUBBirr, - MMX_PSUBDirr, MMX_PSUBQirr, MMX_PSUBWirr, - MMX_PSUBSBirr, MMX_PSUBSWirr, MMX_PSUBUSBirr, MMX_PSUBUSWirr, - MMX_PCMPGTBirr, MMX_PCMPGTDirr, MMX_PCMPGTWirr + MMX_PXORrr, MMX_PANDNrr, MMX_PSUBBrr, + MMX_PSUBDrr, MMX_PSUBQrr, MMX_PSUBWrr, + MMX_PSUBSBrr, MMX_PSUBSWrr, MMX_PSUBUSBrr, MMX_PSUBUSWrr, + MMX_PCMPGTBrr, MMX_PCMPGTDrr, MMX_PCMPGTWrr ], ZeroIdiomPredicate>, // SSE Zero-idioms. @@ -1017,7 +1017,7 @@ def : IsDepBreakingFunction<[ // MMX DepBreakingClass<[ - MMX_PCMPEQBirr, MMX_PCMPEQDirr, MMX_PCMPEQWirr + MMX_PCMPEQBrr, MMX_PCMPEQDrr, MMX_PCMPEQWrr ], ZeroIdiomPredicate>, // SSE diff --git a/llvm/lib/Target/X86/X86ScheduleSLM.td b/llvm/lib/Target/X86/X86ScheduleSLM.td index 5af9835f75a7..36e5b55a4194 100644 --- a/llvm/lib/Target/X86/X86ScheduleSLM.td +++ b/llvm/lib/Target/X86/X86ScheduleSLM.td @@ -467,8 +467,8 @@ def SLMWriteResGroup1rr : SchedWriteRes<[SLM_FPC_RSV01]> { let NumMicroOps = 2; let ResourceCycles = [8]; } -def: InstRW<[SLMWriteResGroup1rr], (instrs MMX_PADDQirr, PADDQrr, - MMX_PSUBQirr, PSUBQrr, +def: InstRW<[SLMWriteResGroup1rr], (instrs MMX_PADDQrr, PADDQrr, + MMX_PSUBQrr, PSUBQrr, PCMPEQQrr)>; def SLMWriteResGroup1rm : SchedWriteRes<[SLM_MEC_RSV,SLM_FPC_RSV01]> { @@ -476,8 +476,8 @@ def SLMWriteResGroup1rm : SchedWriteRes<[SLM_MEC_RSV,SLM_FPC_RSV01]> { let NumMicroOps = 3; let ResourceCycles = [1,8]; } -def: InstRW<[SLMWriteResGroup1rm], (instrs MMX_PADDQirm, PADDQrm, - MMX_PSUBQirm, 
PSUBQrm, +def: InstRW<[SLMWriteResGroup1rm], (instrs MMX_PADDQrm, PADDQrm, + MMX_PSUBQrm, PSUBQrm, PCMPEQQrm)>; } // SchedModel diff --git a/llvm/lib/Target/X86/X86ScheduleZnver1.td b/llvm/lib/Target/X86/X86ScheduleZnver1.td index 8e30e5e10ca8..4343e1ed45d1 100644 --- a/llvm/lib/Target/X86/X86ScheduleZnver1.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver1.td @@ -1000,12 +1000,12 @@ def ZnWriteFPU12Ym : SchedWriteRes<[ZnAGU, ZnFPU12]> { let NumMicroOps = 2; } -def : InstRW<[ZnWriteFPU12], (instrs MMX_PACKSSDWirr, - MMX_PACKSSWBirr, - MMX_PACKUSWBirr)>; -def : InstRW<[ZnWriteFPU12m], (instrs MMX_PACKSSDWirm, - MMX_PACKSSWBirm, - MMX_PACKUSWBirm)>; +def : InstRW<[ZnWriteFPU12], (instrs MMX_PACKSSDWrr, + MMX_PACKSSWBrr, + MMX_PACKUSWBrr)>; +def : InstRW<[ZnWriteFPU12m], (instrs MMX_PACKSSDWrm, + MMX_PACKSSWBrm, + MMX_PACKUSWBrm)>; def ZnWriteFPU013 : SchedWriteRes<[ZnFPU013]> ; def ZnWriteFPU013Y : SchedWriteRes<[ZnFPU013]> { @@ -1305,15 +1305,15 @@ def ZnWriteCVTPS2PIr: SchedWriteRes<[ZnFPU3]> { } // CVT(T)PS2PI. // mm,x. -def : InstRW<[ZnWriteCVTPS2PIr], (instregex "MMX_CVT(T?)PS2PIirr")>; +def : InstRW<[ZnWriteCVTPS2PIr], (instregex "MMX_CVT(T?)PS2PIrr")>; // CVTPI2PD. // x,mm. -def : InstRW<[ZnWriteCVTPS2PDr], (instrs MMX_CVTPI2PDirr)>; +def : InstRW<[ZnWriteCVTPS2PDr], (instrs MMX_CVTPI2PDrr)>; // CVT(T)PD2PI. // mm,x. 
-def : InstRW<[ZnWriteCVTPS2PIr], (instregex "MMX_CVT(T?)PD2PIirr")>; +def : InstRW<[ZnWriteCVTPS2PIr], (instregex "MMX_CVT(T?)PD2PIrr")>; def ZnWriteCVSTSI2SSr: SchedWriteRes<[ZnFPU3]> { let Latency = 5; diff --git a/llvm/lib/Target/X86/X86ScheduleZnver2.td b/llvm/lib/Target/X86/X86ScheduleZnver2.td index a83c89e2f28a..96d2837880c7 100644 --- a/llvm/lib/Target/X86/X86ScheduleZnver2.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver2.td @@ -1012,12 +1012,12 @@ def Zn2WriteFPU12Ym : SchedWriteRes<[Zn2AGU, Zn2FPU12]> { let NumMicroOps = 2; } -def : InstRW<[Zn2WriteFPU12], (instrs MMX_PACKSSDWirr, - MMX_PACKSSWBirr, - MMX_PACKUSWBirr)>; -def : InstRW<[Zn2WriteFPU12m], (instrs MMX_PACKSSDWirm, - MMX_PACKSSWBirm, - MMX_PACKUSWBirm)>; +def : InstRW<[Zn2WriteFPU12], (instrs MMX_PACKSSDWrr, + MMX_PACKSSWBrr, + MMX_PACKUSWBrr)>; +def : InstRW<[Zn2WriteFPU12m], (instrs MMX_PACKSSDWrm, + MMX_PACKSSWBrm, + MMX_PACKUSWBrm)>; def Zn2WriteFPU013 : SchedWriteRes<[Zn2FPU013]> ; def Zn2WriteFPU013Y : SchedWriteRes<[Zn2FPU013]> ; @@ -1304,15 +1304,15 @@ def Zn2WriteCVTPS2PIr: SchedWriteRes<[Zn2FPU3]> { } // CVT(T)PS2PI. // mm,x. -def : InstRW<[Zn2WriteCVTPS2PIr], (instregex "MMX_CVT(T?)PS2PIirr")>; +def : InstRW<[Zn2WriteCVTPS2PIr], (instregex "MMX_CVT(T?)PS2PIrr")>; // CVTPI2PD. // x,mm. -def : InstRW<[Zn2WriteCVTPS2PDr], (instrs MMX_CVTPI2PDirr)>; +def : InstRW<[Zn2WriteCVTPS2PDr], (instrs MMX_CVTPI2PDrr)>; // CVT(T)PD2PI. // mm,x. 
-def : InstRW<[Zn2WriteCVTPS2PIr], (instregex "MMX_CVT(T?)PD2PIirr")>; +def : InstRW<[Zn2WriteCVTPS2PIr], (instregex "MMX_CVT(T?)PD2PIrr")>; def Zn2WriteCVSTSI2SSr: SchedWriteRes<[Zn2FPU3]> { let Latency = 3; diff --git a/llvm/lib/Target/X86/X86ScheduleZnver3.td b/llvm/lib/Target/X86/X86ScheduleZnver3.td index be07c069aae1..f4e03ac11f0b 100644 --- a/llvm/lib/Target/X86/X86ScheduleZnver3.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver3.td @@ -1075,9 +1075,9 @@ def Zn3WriteVecALUXMMX : SchedWriteRes<[Zn3FPVAdd01]> { } def : InstRW<[Zn3WriteVecALUXMMX], (instrs MMX_PABSBrr, MMX_PABSDrr, MMX_PABSWrr, MMX_PSIGNBrr, MMX_PSIGNDrr, MMX_PSIGNWrr, - MMX_PADDSBirr, MMX_PADDSWirr, MMX_PADDUSBirr, MMX_PADDUSWirr, - MMX_PAVGBirr, MMX_PAVGWirr, - MMX_PSUBSBirr, MMX_PSUBSWirr, MMX_PSUBUSBirr, MMX_PSUBUSWirr)>; + MMX_PADDSBrr, MMX_PADDSWrr, MMX_PADDUSBrr, MMX_PADDUSWrr, + MMX_PAVGBrr, MMX_PAVGWrr, + MMX_PSUBSBrr, MMX_PSUBSWrr, MMX_PSUBUSBrr, MMX_PSUBUSWrr)>; defm : Zn3WriteResYMMPair<WriteVecALUY, [Zn3FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (YMM). @@ -1161,7 +1161,7 @@ def Zn3WriteCvtPD2IMMX : SchedWriteRes<[Zn3FPFCvt01]> { let ResourceCycles = [2]; let NumMicroOps = 2; } -def : InstRW<[Zn3WriteCvtPD2IMMX], (instrs MMX_CVTPD2PIirm, MMX_CVTTPD2PIirm, MMX_CVTPD2PIirr, MMX_CVTTPD2PIirr)>; +def : InstRW<[Zn3WriteCvtPD2IMMX], (instrs MMX_CVTPD2PIrm, MMX_CVTTPD2PIrm, MMX_CVTPD2PIrr, MMX_CVTTPD2PIrr)>; defm : Zn3WriteResXMMPair<WriteCvtSS2I, [Zn3FPFCvt01], 2, [2], 2>; // Float -> Integer. @@ -1179,7 +1179,7 @@ def Zn3WriteCvtI2PDMMX : SchedWriteRes<[Zn3FPFCvt01]> { let ResourceCycles = [6]; let NumMicroOps = 2; } -def : InstRW<[Zn3WriteCvtI2PDMMX], (instrs MMX_CVTPI2PDirm, MMX_CVTPI2PDirr)>; +def : InstRW<[Zn3WriteCvtI2PDMMX], (instrs MMX_CVTPI2PDrm, MMX_CVTPI2PDrr)>; defm : Zn3WriteResXMMPair<WriteCvtI2SS, [Zn3FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>; // Integer -> Float. defm : Zn3WriteResXMMPair<WriteCvtI2PS, [Zn3FPFCvt01], 3, [1], 1>; // Integer -> Float (XMM). 
@@ -1191,7 +1191,7 @@ def Zn3WriteCvtI2PSMMX : SchedWriteRes<[Zn3FPFCvt01]> { let ResourceCycles = [1]; let NumMicroOps = 2; } -def : InstRW<[Zn3WriteCvtI2PSMMX], (instrs MMX_CVTPI2PSirr)>; +def : InstRW<[Zn3WriteCvtI2PSMMX], (instrs MMX_CVTPI2PSrr)>; defm : Zn3WriteResXMMPair<WriteCvtSS2SD, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Double size conversion. defm : Zn3WriteResXMMPair<WriteCvtPS2PD, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Double size conversion (XMM). @@ -1621,7 +1621,7 @@ def : IsDepBreakingFunction<[ // MMX DepBreakingClass<[ - MMX_PCMPEQBirr, MMX_PCMPEQWirr, MMX_PCMPEQDirr + MMX_PCMPEQBrr, MMX_PCMPEQWrr, MMX_PCMPEQDrr ], ZeroIdiomPredicate>, // SSE diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h index 9da54dc2e9b7..5d773f0c57df 100644 --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -958,8 +958,7 @@ public: // extended frames should be flagged as present. const Triple &TT = getTargetTriple(); - unsigned Major, Minor, Micro; - TT.getOSVersion(Major, Minor, Micro); + unsigned Major = TT.getOSVersion().getMajor(); switch(TT.getOS()) { default: return false; diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index 336985f3bf9d..78bc5519c23f 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -588,6 +588,18 @@ void X86PassConfig::addPreEmitPass2() { // Insert pseudo probe annotation for callsite profiling addPass(createPseudoProbeInserter()); + + // On Darwin platforms, BLR_RVMARKER pseudo instructions are lowered to + // bundles. + if (TT.isOSDarwin()) + addPass(createUnpackMachineBundles([](const MachineFunction &MF) { + // Only run bundle expansion if there are relevant ObjC runtime functions + // present in the module. 
+ const Function &F = MF.getFunction(); + const Module *M = F.getParent(); + return M->getFunction("objc_retainAutoreleasedReturnValue") || + M->getFunction("objc_unsafeClaimAutoreleasedReturnValue"); + })); } bool X86PassConfig::addPostFastRegAllocRewrite() { diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 869762b35196..d8cd7311a0d5 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -236,47 +236,50 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost( } } - if ((ISD == ISD::MUL || ISD == ISD::SDIV || ISD == ISD::SREM || - ISD == ISD::UDIV || ISD == ISD::UREM) && + // Vector multiply by pow2 will be simplified to shifts. + if (ISD == ISD::MUL && (Op2Info == TargetTransformInfo::OK_UniformConstantValue || Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) && - Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) { - // Vector multiply by pow2 will be simplified to shifts. - if (ISD == ISD::MUL) { - InstructionCost Cost = getArithmeticInstrCost( - Instruction::Shl, Ty, CostKind, Op1Info, Op2Info, - TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); - return Cost; - } - - if (ISD == ISD::SDIV || ISD == ISD::SREM) { - // On X86, vector signed division by constants power-of-two are - // normally expanded to the sequence SRA + SRL + ADD + SRA. - // The OperandValue properties may not be the same as that of the previous - // operation; conservatively assume OP_None. 
- InstructionCost Cost = - 2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Op1Info, - Op2Info, TargetTransformInfo::OP_None, - TargetTransformInfo::OP_None); - Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info, - Op2Info, TargetTransformInfo::OP_None, - TargetTransformInfo::OP_None); - Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, Op1Info, - Op2Info, TargetTransformInfo::OP_None, - TargetTransformInfo::OP_None); + Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) + return getArithmeticInstrCost(Instruction::Shl, Ty, CostKind, Op1Info, + Op2Info, TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); - if (ISD == ISD::SREM) { - // For SREM: (X % C) is the equivalent of (X - (X/C)*C) - Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info, - Op2Info); - Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info, - Op2Info); - } + // On X86, vector signed division by constants power-of-two are + // normally expanded to the sequence SRA + SRL + ADD + SRA. + // The OperandValue properties may not be the same as that of the previous + // operation; conservatively assume OP_None. 
+ if ((ISD == ISD::SDIV || ISD == ISD::SREM) && + (Op2Info == TargetTransformInfo::OK_UniformConstantValue || + Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) && + Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) { + InstructionCost Cost = + 2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Op1Info, + Op2Info, TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); + Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info, + Op2Info, TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); + Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, Op1Info, + Op2Info, TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); - return Cost; + if (ISD == ISD::SREM) { + // For SREM: (X % C) is the equivalent of (X - (X/C)*C) + Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info, + Op2Info); + Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info, + Op2Info); } - // Vector unsigned division/remainder will be simplified to shifts/masks. + return Cost; + } + + // Vector unsigned division/remainder will be simplified to shifts/masks. 
+ if ((ISD == ISD::UDIV || ISD == ISD::UREM) && + (Op2Info == TargetTransformInfo::OK_UniformConstantValue || + Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) && + Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) { if (ISD == ISD::UDIV) return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info, Op2Info, TargetTransformInfo::OP_None, @@ -660,6 +663,7 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost( { ISD::MUL, MVT::v8i32, 1 }, // pmulld (Skylake from agner.org) { ISD::MUL, MVT::v4i32, 1 }, // pmulld (Skylake from agner.org) { ISD::MUL, MVT::v8i64, 6 }, // 3*pmuludq/3*shift/2*add + { ISD::MUL, MVT::i64, 1 }, // Skylake from http://www.agner.org/ { ISD::FNEG, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/ { ISD::FADD, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/ @@ -5188,10 +5192,10 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller, return (RealCallerBits & RealCalleeBits) == RealCalleeBits; } -bool X86TTIImpl::areFunctionArgsABICompatible( - const Function *Caller, const Function *Callee, - SmallPtrSetImpl<Argument *> &Args) const { - if (!BaseT::areFunctionArgsABICompatible(Caller, Callee, Args)) +bool X86TTIImpl::areTypesABICompatible(const Function *Caller, + const Function *Callee, + const ArrayRef<Type *> &Types) const { + if (!BaseT::areTypesABICompatible(Caller, Callee, Types)) return false; // If we get here, we know the target features match. If one function @@ -5206,13 +5210,8 @@ bool X86TTIImpl::areFunctionArgsABICompatible( // Consider the arguments compatible if they aren't vectors or aggregates. // FIXME: Look at the size of vectors. // FIXME: Look at the element types of aggregates to see if there are vectors. - // FIXME: The API of this function seems intended to allow arguments - // to be removed from the set, but the caller doesn't check if the set - // becomes empty so that may not work in practice. 
- return llvm::none_of(Args, [](Argument *A) { - auto *EltTy = cast<PointerType>(A->getType())->getElementType(); - return EltTy->isVectorTy() || EltTy->isAggregateType(); - }); + return llvm::none_of(Types, + [](Type *T) { return T->isVectorTy() || T->isAggregateType(); }); } X86TTIImpl::TTI::MemCmpExpansionOptions diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index c53424ec0026..11e9cb09c7d5 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -234,9 +234,8 @@ public: bool isFCmpOrdCheaperThanFCmpZero(Type *Ty); bool areInlineCompatible(const Function *Caller, const Function *Callee) const; - bool areFunctionArgsABICompatible(const Function *Caller, - const Function *Callee, - SmallPtrSetImpl<Argument *> &Args) const; + bool areTypesABICompatible(const Function *Caller, const Function *Callee, + const ArrayRef<Type *> &Type) const; TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const; bool prefersVectorizedAddressing() const; diff --git a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp index abac3f801a22..4624b735bef8 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp @@ -475,12 +475,12 @@ void TruncInstCombine::ReduceExpressionDag(Type *SclTy) { // any of its operands, this way, when we get to the operand, we already // removed the instructions (from the expression dag) that uses it. 
CurrentTruncInst->eraseFromParent(); - for (auto I = InstInfoMap.rbegin(), E = InstInfoMap.rend(); I != E; ++I) { + for (auto &I : llvm::reverse(InstInfoMap)) { // We still need to check that the instruction has no users before we erase // it, because {SExt, ZExt}Inst Instruction might have other users that was // not reduced, in such case, we need to keep that instruction. - if (I->first->use_empty()) - I->first->eraseFromParent(); + if (I.first->use_empty()) + I.first->eraseFromParent(); } } diff --git a/llvm/lib/Transforms/CFGuard/CFGuard.cpp b/llvm/lib/Transforms/CFGuard/CFGuard.cpp index 96c083a144b2..5fc5295969d0 100644 --- a/llvm/lib/Transforms/CFGuard/CFGuard.cpp +++ b/llvm/lib/Transforms/CFGuard/CFGuard.cpp @@ -165,6 +165,12 @@ void CFGuard::insertCFGuardCheck(CallBase *CB) { IRBuilder<> B(CB); Value *CalledOperand = CB->getCalledOperand(); + // If the indirect call is called within catchpad or cleanuppad, + // we need to copy "funclet" bundle of the call. + SmallVector<llvm::OperandBundleDef, 1> Bundles; + if (auto Bundle = CB->getOperandBundle(LLVMContext::OB_funclet)) + Bundles.push_back(OperandBundleDef(*Bundle)); + // Load the global symbol as a pointer to the check function. LoadInst *GuardCheckLoad = B.CreateLoad(GuardFnPtrType, GuardFnGlobal); @@ -172,7 +178,7 @@ void CFGuard::insertCFGuardCheck(CallBase *CB) { // even if the original CallBase is an Invoke or CallBr instruction. CallInst *GuardCheck = B.CreateCall(GuardFnType, GuardCheckLoad, - {B.CreateBitCast(CalledOperand, B.getInt8PtrTy())}); + {B.CreateBitCast(CalledOperand, B.getInt8PtrTy())}, Bundles); // Ensure that the first argument is passed in the correct register // (e.g. ECX on 32-bit X86 targets). 
diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp index ac3d078714ce..a0d12865bd3a 100644 --- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -1237,8 +1237,10 @@ namespace { struct AllocaUseVisitor : PtrUseVisitor<AllocaUseVisitor> { using Base = PtrUseVisitor<AllocaUseVisitor>; AllocaUseVisitor(const DataLayout &DL, const DominatorTree &DT, - const CoroBeginInst &CB, const SuspendCrossingInfo &Checker) - : PtrUseVisitor(DL), DT(DT), CoroBegin(CB), Checker(Checker) {} + const CoroBeginInst &CB, const SuspendCrossingInfo &Checker, + bool ShouldUseLifetimeStartInfo) + : PtrUseVisitor(DL), DT(DT), CoroBegin(CB), Checker(Checker), + ShouldUseLifetimeStartInfo(ShouldUseLifetimeStartInfo) {} void visit(Instruction &I) { Users.insert(&I); @@ -1390,6 +1392,7 @@ private: SmallPtrSet<Instruction *, 4> Users{}; SmallPtrSet<IntrinsicInst *, 2> LifetimeStarts{}; bool MayWriteBeforeCoroBegin{false}; + bool ShouldUseLifetimeStartInfo{true}; mutable llvm::Optional<bool> ShouldLiveOnFrame{}; @@ -1398,7 +1401,7 @@ private: // more precise. We look at every pair of lifetime.start intrinsic and // every basic block that uses the pointer to see if they cross suspension // points. The uses cover both direct uses as well as indirect uses. - if (!LifetimeStarts.empty()) { + if (ShouldUseLifetimeStartInfo && !LifetimeStarts.empty()) { for (auto *I : Users) for (auto *S : LifetimeStarts) if (Checker.isDefinitionAcrossSuspend(*S, I)) @@ -2484,8 +2487,15 @@ static void collectFrameAllocas(Function &F, coro::Shape &Shape, continue; } DominatorTree DT(F); + // The code that uses lifetime.start intrinsic does not work for functions + // with loops without exit. Disable it on ABIs we know to generate such + // code. 
+ bool ShouldUseLifetimeStartInfo = + (Shape.ABI != coro::ABI::Async && Shape.ABI != coro::ABI::Retcon && + Shape.ABI != coro::ABI::RetconOnce); AllocaUseVisitor Visitor{F.getParent()->getDataLayout(), DT, - *Shape.CoroBegin, Checker}; + *Shape.CoroBegin, Checker, + ShouldUseLifetimeStartInfo}; Visitor.visitPtr(*AI); if (!Visitor.getShouldLiveOnFrame()) continue; @@ -2572,9 +2582,15 @@ void coro::salvageDebugInfo( DVI->setExpression(Expr); /// It makes no sense to move the dbg.value intrinsic. if (!isa<DbgValueInst>(DVI)) { - if (auto *InsertPt = dyn_cast<Instruction>(Storage)) + if (auto *II = dyn_cast<InvokeInst>(Storage)) + DVI->moveBefore(II->getNormalDest()->getFirstNonPHI()); + else if (auto *CBI = dyn_cast<CallBrInst>(Storage)) + DVI->moveBefore(CBI->getDefaultDest()->getFirstNonPHI()); + else if (auto *InsertPt = dyn_cast<Instruction>(Storage)) { + assert(!InsertPt->isTerminator() && + "Unimaged terminator that could return a storage."); DVI->moveAfter(InsertPt); - else if (isa<Argument>(Storage)) + } else if (isa<Argument>(Storage)) DVI->moveAfter(F->getEntryBlock().getFirstNonPHI()); } } @@ -2664,7 +2680,10 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) { } } - sinkLifetimeStartMarkers(F, Shape, Checker); + if (Shape.ABI != coro::ABI::Async && Shape.ABI != coro::ABI::Retcon && + Shape.ABI != coro::ABI::RetconOnce) + sinkLifetimeStartMarkers(F, Shape, Checker); + if (Shape.ABI != coro::ABI::Async || !Shape.CoroSuspends.empty()) collectFrameAllocas(F, Shape, Checker, FrameData.Allocas); LLVM_DEBUG(dumpAllocas(FrameData.Allocas)); diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index fa1d92f439b8..12c1829524ef 100644 --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -280,6 +280,27 @@ static void replaceFallthroughCoroEnd(AnyCoroEndInst *End, BB->getTerminator()->eraseFromParent(); } +// Mark a coroutine as done, which implies that the 
coroutine is finished and +// never get resumed. +// +// In resume-switched ABI, the done state is represented by storing zero in +// ResumeFnAddr. +// +// NOTE: We couldn't omit the argument `FramePtr`. It is necessary because the +// pointer to the frame in splitted function is not stored in `Shape`. +static void markCoroutineAsDone(IRBuilder<> &Builder, const coro::Shape &Shape, + Value *FramePtr) { + assert( + Shape.ABI == coro::ABI::Switch && + "markCoroutineAsDone is only supported for Switch-Resumed ABI for now."); + auto *GepIndex = Builder.CreateStructGEP( + Shape.FrameTy, FramePtr, coro::Shape::SwitchFieldIndex::Resume, + "ResumeFn.addr"); + auto *NullPtr = ConstantPointerNull::get(cast<PointerType>( + Shape.FrameTy->getTypeAtIndex(coro::Shape::SwitchFieldIndex::Resume))); + Builder.CreateStore(NullPtr, GepIndex); +} + /// Replace an unwind call to llvm.coro.end. static void replaceUnwindCoroEnd(AnyCoroEndInst *End, const coro::Shape &Shape, Value *FramePtr, bool InResume, @@ -288,10 +309,18 @@ static void replaceUnwindCoroEnd(AnyCoroEndInst *End, const coro::Shape &Shape, switch (Shape.ABI) { // In switch-lowering, this does nothing in the main function. - case coro::ABI::Switch: + case coro::ABI::Switch: { + // In C++'s specification, the coroutine should be marked as done + // if promise.unhandled_exception() throws. The frontend will + // call coro.end(true) along this path. + // + // FIXME: We should refactor this once there is other language + // which uses Switch-Resumed style other than C++. + markCoroutineAsDone(Builder, Shape, FramePtr); if (!InResume) return; break; + } // In async lowering this does nothing. case coro::ABI::Async: break; @@ -364,13 +393,9 @@ static void createResumeEntryBlock(Function &F, coro::Shape &Shape) { auto *Save = S->getCoroSave(); Builder.SetInsertPoint(Save); if (S->isFinal()) { - // Final suspend point is represented by storing zero in ResumeFnAddr. 
- auto *GepIndex = Builder.CreateStructGEP(FrameTy, FramePtr, - coro::Shape::SwitchFieldIndex::Resume, - "ResumeFn.addr"); - auto *NullPtr = ConstantPointerNull::get(cast<PointerType>( - FrameTy->getTypeAtIndex(coro::Shape::SwitchFieldIndex::Resume))); - Builder.CreateStore(NullPtr, GepIndex); + // The coroutine should be marked done if it reaches the final suspend + // point. + markCoroutineAsDone(Builder, Shape, FramePtr); } else { auto *GepIndex = Builder.CreateStructGEP( FrameTy, FramePtr, Shape.getSwitchIndexField(), "index.addr"); diff --git a/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/llvm/lib/Transforms/Coroutines/Coroutines.cpp index e4883ef89db7..fba8b03e44ba 100644 --- a/llvm/lib/Transforms/Coroutines/Coroutines.cpp +++ b/llvm/lib/Transforms/Coroutines/Coroutines.cpp @@ -141,7 +141,6 @@ static bool isCoroutineIntrinsicName(StringRef Name) { "llvm.coro.id.retcon", "llvm.coro.id.retcon.once", "llvm.coro.noop", - "llvm.coro.param", "llvm.coro.prepare.async", "llvm.coro.prepare.retcon", "llvm.coro.promise", diff --git a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp index 93bb11433775..3a42a2cac928 100644 --- a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -835,14 +835,20 @@ bool ArgumentPromotionPass::areFunctionArgsABICompatible( const Function &F, const TargetTransformInfo &TTI, SmallPtrSetImpl<Argument *> &ArgsToPromote, SmallPtrSetImpl<Argument *> &ByValArgsToTransform) { + // TODO: Check individual arguments so we can promote a subset? 
+ SmallVector<Type *, 32> Types; + for (Argument *Arg : ArgsToPromote) + Types.push_back(Arg->getType()->getPointerElementType()); + for (Argument *Arg : ByValArgsToTransform) + Types.push_back(Arg->getParamByValType()); + for (const Use &U : F.uses()) { CallBase *CB = dyn_cast<CallBase>(U.getUser()); if (!CB) return false; const Function *Caller = CB->getCaller(); const Function *Callee = CB->getCalledFunction(); - if (!TTI.areFunctionArgsABICompatible(Caller, Callee, ArgsToPromote) || - !TTI.areFunctionArgsABICompatible(Caller, Callee, ByValArgsToTransform)) + if (!TTI.areTypesABICompatible(Caller, Callee, Types)) return false; } return true; diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp index edadc79e3a9f..7e729e57153c 100644 --- a/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/llvm/lib/Transforms/IPO/Attributor.cpp @@ -2139,12 +2139,10 @@ bool Attributor::shouldSeedAttribute(AbstractAttribute &AA) { bool Result = true; #ifndef NDEBUG if (SeedAllowList.size() != 0) - Result = - std::count(SeedAllowList.begin(), SeedAllowList.end(), AA.getName()); + Result = llvm::is_contained(SeedAllowList, AA.getName()); Function *Fn = AA.getAnchorScope(); if (FunctionSeedAllowList.size() != 0 && Fn) - Result &= std::count(FunctionSeedAllowList.begin(), - FunctionSeedAllowList.end(), Fn->getName()); + Result &= llvm::is_contained(FunctionSeedAllowList, Fn->getName()); #endif return Result; } diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index ec08287393de..b977821bcaa6 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -417,7 +417,7 @@ const Value *stripAndAccumulateMinimalOffsets( AttributorAnalysis); } -static const Value *getMinimalBaseOfAccsesPointerOperand( +static const Value *getMinimalBaseOfAccessPointerOperand( Attributor &A, const AbstractAttribute &QueryingAA, const Instruction *I, int64_t 
&BytesOffset, const DataLayout &DL, bool AllowNonInbounds = false) { const Value *Ptr = getPointerOperand(I, /* AllowVolatile */ false); @@ -2129,7 +2129,7 @@ static int64_t getKnownNonNullAndDerefBytesForUse( int64_t Offset; const Value *Base = - getMinimalBaseOfAccsesPointerOperand(A, QueryingAA, I, Offset, DL); + getMinimalBaseOfAccessPointerOperand(A, QueryingAA, I, Offset, DL); if (Base) { if (Base == &AssociatedValue && getPointerOperand(I, /* AllowVolatile */ false) == UseV) { @@ -6414,31 +6414,36 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { return indicatePessimisticFixpoint(); } + // Collect the types that will replace the privatizable type in the function + // signature. + SmallVector<Type *, 16> ReplacementTypes; + identifyReplacementTypes(PrivatizableType.getValue(), ReplacementTypes); + // Verify callee and caller agree on how the promoted argument would be // passed. - // TODO: The use of the ArgumentPromotion interface here is ugly, we need a - // specialized form of TargetTransformInfo::areFunctionArgsABICompatible - // which doesn't require the arguments ArgumentPromotion wanted to pass. 
Function &Fn = *getIRPosition().getAnchorScope(); - SmallPtrSet<Argument *, 1> ArgsToPromote, Dummy; - ArgsToPromote.insert(getAssociatedArgument()); const auto *TTI = A.getInfoCache().getAnalysisResultForFunction<TargetIRAnalysis>(Fn); - if (!TTI || - !ArgumentPromotionPass::areFunctionArgsABICompatible( - Fn, *TTI, ArgsToPromote, Dummy) || - ArgsToPromote.empty()) { + if (!TTI) { + LLVM_DEBUG(dbgs() << "[AAPrivatizablePtr] Missing TTI for function " + << Fn.getName() << "\n"); + return indicatePessimisticFixpoint(); + } + + auto CallSiteCheck = [&](AbstractCallSite ACS) { + CallBase *CB = ACS.getInstruction(); + return TTI->areTypesABICompatible( + CB->getCaller(), CB->getCalledFunction(), ReplacementTypes); + }; + bool AllCallSitesKnown; + if (!A.checkForAllCallSites(CallSiteCheck, *this, true, + AllCallSitesKnown)) { LLVM_DEBUG( dbgs() << "[AAPrivatizablePtr] ABI incompatibility detected for " << Fn.getName() << "\n"); return indicatePessimisticFixpoint(); } - // Collect the types that will replace the privatizable type in the function - // signature. - SmallVector<Type *, 16> ReplacementTypes; - identifyReplacementTypes(PrivatizableType.getValue(), ReplacementTypes); - // Register a rewrite of the argument. 
Argument *Arg = getAssociatedArgument(); if (!A.isValidFunctionSignatureRewrite(*Arg, ReplacementTypes)) { @@ -6558,7 +6563,6 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { return false; }; - bool AllCallSitesKnown; if (!A.checkForAllCallSites(IsCompatiblePrivArgOfOtherCallSite, *this, true, AllCallSitesKnown)) return indicatePessimisticFixpoint(); diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index cde78713b554..321d4a19a585 100644 --- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -76,6 +76,7 @@ STATISTIC(NumNoCapture, "Number of arguments marked nocapture"); STATISTIC(NumReturned, "Number of arguments marked returned"); STATISTIC(NumReadNoneArg, "Number of arguments marked readnone"); STATISTIC(NumReadOnlyArg, "Number of arguments marked readonly"); +STATISTIC(NumWriteOnlyArg, "Number of arguments marked writeonly"); STATISTIC(NumNoAlias, "Number of function returns marked noalias"); STATISTIC(NumNonNullReturn, "Number of function returns marked nonnull"); STATISTIC(NumNoRecurse, "Number of functions marked as norecurse"); @@ -580,16 +581,8 @@ struct ArgumentUsesTracker : public CaptureTracker { return true; } - // Note: the callee and the two successor blocks *follow* the argument - // operands. This means there is no need to adjust UseIndex to account for - // these. 
- - unsigned UseIndex = - std::distance(const_cast<const Use *>(CB->arg_begin()), U); - - assert(UseIndex < CB->data_operands_size() && - "Indirect function calls should have been filtered above!"); - + assert(!CB->isCallee(U) && "callee operand reported captured?"); + const unsigned UseIndex = CB->getDataOperandNo(U); if (UseIndex >= CB->arg_size()) { // Data operand, but not a argument operand -- must be a bundle operand assert(CB->hasOperandBundles() && "Must be!"); @@ -649,8 +642,8 @@ struct GraphTraits<ArgumentGraph *> : public GraphTraits<ArgumentGraphNode *> { /// Returns Attribute::None, Attribute::ReadOnly or Attribute::ReadNone. static Attribute::AttrKind -determinePointerReadAttrs(Argument *A, - const SmallPtrSet<Argument *, 8> &SCCNodes) { +determinePointerAccessAttrs(Argument *A, + const SmallPtrSet<Argument *, 8> &SCCNodes) { SmallVector<Use *, 32> Worklist; SmallPtrSet<Use *, 32> Visited; @@ -659,7 +652,7 @@ determinePointerReadAttrs(Argument *A, return Attribute::None; bool IsRead = false; - // We don't need to track IsWritten. If A is written to, return immediately. + bool IsWrite = false; for (Use &U : A->uses()) { Visited.insert(&U); @@ -667,6 +660,10 @@ determinePointerReadAttrs(Argument *A, } while (!Worklist.empty()) { + if (IsWrite && IsRead) + // No point in searching further.. + return Attribute::None; + Use *U = Worklist.pop_back_val(); Instruction *I = cast<Instruction>(U->getUser()); @@ -684,73 +681,49 @@ determinePointerReadAttrs(Argument *A, case Instruction::Call: case Instruction::Invoke: { - bool Captures = true; + CallBase &CB = cast<CallBase>(*I); + if (CB.isCallee(U)) { + IsRead = true; + // Note that indirect calls do not capture, see comment in + // CaptureTracking for context + continue; + } - if (I->getType()->isVoidTy()) - Captures = false; + // Given we've explictily handled the callee operand above, what's left + // must be a data operand (e.g. 
argument or operand bundle) + const unsigned UseIndex = CB.getDataOperandNo(U); - auto AddUsersToWorklistIfCapturing = [&] { - if (Captures) + if (!CB.doesNotCapture(UseIndex)) { + if (!CB.onlyReadsMemory()) + // If the callee can save a copy into other memory, then simply + // scanning uses of the call is insufficient. We have no way + // of tracking copies of the pointer through memory to see + // if a reloaded copy is written to, thus we must give up. + return Attribute::None; + // Push users for processing once we finish this one + if (!I->getType()->isVoidTy()) for (Use &UU : I->uses()) if (Visited.insert(&UU).second) Worklist.push_back(&UU); - }; - - CallBase &CB = cast<CallBase>(*I); - if (CB.doesNotAccessMemory()) { - AddUsersToWorklistIfCapturing(); - continue; } + + if (CB.doesNotAccessMemory()) + continue; - Function *F = CB.getCalledFunction(); - if (!F) { - if (CB.onlyReadsMemory()) { - IsRead = true; - AddUsersToWorklistIfCapturing(); - continue; - } - return Attribute::None; - } - - // Note: the callee and the two successor blocks *follow* the argument - // operands. This means there is no need to adjust UseIndex to account - // for these. - - unsigned UseIndex = std::distance(CB.arg_begin(), U); - - // U cannot be the callee operand use: since we're exploring the - // transitive uses of an Argument, having such a use be a callee would - // imply the call site is an indirect call or invoke; and we'd take the - // early exit above. - assert(UseIndex < CB.data_operands_size() && - "Data operand use expected!"); - - bool IsOperandBundleUse = UseIndex >= CB.arg_size(); + if (Function *F = CB.getCalledFunction()) + if (CB.isArgOperand(U) && UseIndex < F->arg_size() && + SCCNodes.count(F->getArg(UseIndex))) + // This is an argument which is part of the speculative SCC. Note + // that only operands corresponding to formal arguments of the callee + // can participate in the speculation. 
+ break; - if (UseIndex >= F->arg_size() && !IsOperandBundleUse) { - assert(F->isVarArg() && "More params than args in non-varargs call"); + // The accessors used on call site here do the right thing for calls and + // invokes with operand bundles. + if (!CB.onlyReadsMemory() && !CB.onlyReadsMemory(UseIndex)) return Attribute::None; - } - - Captures &= !CB.doesNotCapture(UseIndex); - - // Since the optimizer (by design) cannot see the data flow corresponding - // to a operand bundle use, these cannot participate in the optimistic SCC - // analysis. Instead, we model the operand bundle uses as arguments in - // call to a function external to the SCC. - if (IsOperandBundleUse || - !SCCNodes.count(&*std::next(F->arg_begin(), UseIndex))) { - - // The accessors used on call site here do the right thing for calls and - // invokes with operand bundles. - - if (!CB.onlyReadsMemory() && !CB.onlyReadsMemory(UseIndex)) - return Attribute::None; - if (!CB.doesNotAccessMemory(UseIndex)) - IsRead = true; - } - - AddUsersToWorklistIfCapturing(); + if (!CB.doesNotAccessMemory(UseIndex)) + IsRead = true; break; } @@ -763,6 +736,19 @@ determinePointerReadAttrs(Argument *A, IsRead = true; break; + case Instruction::Store: + if (cast<StoreInst>(I)->getValueOperand() == *U) + // untrackable capture + return Attribute::None; + + // A volatile store has side effects beyond what writeonly can be relied + // upon. + if (cast<StoreInst>(I)->isVolatile()) + return Attribute::None; + + IsWrite = true; + break; + case Instruction::ICmp: case Instruction::Ret: break; @@ -772,7 +758,14 @@ determinePointerReadAttrs(Argument *A, } } - return IsRead ? Attribute::ReadOnly : Attribute::ReadNone; + if (IsWrite && IsRead) + return Attribute::None; + else if (IsRead) + return Attribute::ReadOnly; + else if (IsWrite) + return Attribute::WriteOnly; + else + return Attribute::ReadNone; } /// Deduce returned attributes for the SCC. 
@@ -865,9 +858,10 @@ static bool addArgumentAttrsFromCallsites(Function &F) { return Changed; } -static bool addReadAttr(Argument *A, Attribute::AttrKind R) { - assert((R == Attribute::ReadOnly || R == Attribute::ReadNone) - && "Must be a Read attribute."); +static bool addAccessAttr(Argument *A, Attribute::AttrKind R) { + assert((R == Attribute::ReadOnly || R == Attribute::ReadNone || + R == Attribute::WriteOnly) + && "Must be an access attribute."); assert(A && "Argument must not be null."); // If the argument already has the attribute, nothing needs to be done. @@ -880,7 +874,12 @@ static bool addReadAttr(Argument *A, Attribute::AttrKind R) { A->removeAttr(Attribute::ReadOnly); A->removeAttr(Attribute::ReadNone); A->addAttr(R); - R == Attribute::ReadOnly ? ++NumReadOnlyArg : ++NumReadNoneArg; + if (R == Attribute::ReadOnly) + ++NumReadOnlyArg; + else if (R == Attribute::WriteOnly) + ++NumWriteOnlyArg; + else + ++NumReadNoneArg; return true; } @@ -945,15 +944,15 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, // Otherwise, it's captured. Don't bother doing SCC analysis on it. } if (!HasNonLocalUses && !A->onlyReadsMemory()) { - // Can we determine that it's readonly/readnone without doing an SCC? - // Note that we don't allow any calls at all here, or else our result - // will be dependent on the iteration order through the functions in the - // SCC. + // Can we determine that it's readonly/readnone/writeonly without doing + // an SCC? Note that we don't allow any calls at all here, or else our + // result will be dependent on the iteration order through the + // functions in the SCC. 
SmallPtrSet<Argument *, 8> Self; Self.insert(&*A); - Attribute::AttrKind R = determinePointerReadAttrs(&*A, Self); + Attribute::AttrKind R = determinePointerAccessAttrs(&*A, Self); if (R != Attribute::None) - if (addReadAttr(A, R)) + if (addAccessAttr(A, R)) Changed.insert(F); } } @@ -979,6 +978,13 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, A->addAttr(Attribute::NoCapture); ++NumNoCapture; Changed.insert(A->getParent()); + + // Infer the access attributes given the new nocapture one + SmallPtrSet<Argument *, 8> Self; + Self.insert(&*A); + Attribute::AttrKind R = determinePointerAccessAttrs(&*A, Self); + if (R != Attribute::None) + addAccessAttr(A, R); } continue; } @@ -1023,10 +1029,10 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, Changed.insert(A->getParent()); } - // We also want to compute readonly/readnone. With a small number of false - // negatives, we can assume that any pointer which is captured isn't going - // to be provably readonly or readnone, since by definition we can't - // analyze all uses of a captured pointer. + // We also want to compute readonly/readnone/writeonly. With a small number + // of false negatives, we can assume that any pointer which is captured + // isn't going to be provably readonly or readnone, since by definition + // we can't analyze all uses of a captured pointer. // // The false negatives happen when the pointer is captured by a function // that promises readonly/readnone behaviour on the pointer, then the @@ -1034,24 +1040,28 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, // Also, a readonly/readnone pointer may be returned, but returning a // pointer is capturing it. 
- Attribute::AttrKind ReadAttr = Attribute::ReadNone; - for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) { + auto meetAccessAttr = [](Attribute::AttrKind A, Attribute::AttrKind B) { + if (A == B) + return A; + if (A == Attribute::ReadNone) + return B; + if (B == Attribute::ReadNone) + return A; + return Attribute::None; + }; + + Attribute::AttrKind AccessAttr = Attribute::ReadNone; + for (unsigned i = 0, e = ArgumentSCC.size(); + i != e && AccessAttr != Attribute::None; ++i) { Argument *A = ArgumentSCC[i]->Definition; - Attribute::AttrKind K = determinePointerReadAttrs(A, ArgumentSCCNodes); - if (K == Attribute::ReadNone) - continue; - if (K == Attribute::ReadOnly) { - ReadAttr = Attribute::ReadOnly; - continue; - } - ReadAttr = K; - break; + Attribute::AttrKind K = determinePointerAccessAttrs(A, ArgumentSCCNodes); + AccessAttr = meetAccessAttr(AccessAttr, K); } - if (ReadAttr != Attribute::None) { + if (AccessAttr != Attribute::None) { for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) { Argument *A = ArgumentSCC[i]->Definition; - if (addReadAttr(A, ReadAttr)) + if (addAccessAttr(A, AccessAttr)) Changed.insert(A->getParent()); } } diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp index fbd083bb9bbf..2425646455bd 100644 --- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp +++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp @@ -64,8 +64,8 @@ static cl::opt<unsigned> FuncSpecializationMaxIters( cl::desc("The maximum number of iterations function specialization is run"), cl::init(1)); -static cl::opt<unsigned> MaxConstantsThreshold( - "func-specialization-max-constants", cl::Hidden, +static cl::opt<unsigned> MaxClonesThreshold( + "func-specialization-max-clones", cl::Hidden, cl::desc("The maximum number of clones allowed for a single function " "specialization"), cl::init(3)); @@ -92,6 +92,28 @@ static cl::opt<bool> EnableSpecializationForLiteralConstant( cl::desc("Enable 
specialization of functions that take a literal constant " "as an argument.")); +namespace { +// Bookkeeping struct to pass data from the analysis and profitability phase +// to the actual transform helper functions. +struct ArgInfo { + Function *Fn; // The function to perform specialisation on. + Argument *Arg; // The Formal argument being analysed. + Constant *Const; // A corresponding actual constant argument. + InstructionCost Gain; // Profitability: Gain = Bonus - Cost. + + // Flag if this will be a partial specialization, in which case we will need + // to keep the original function around in addition to the added + // specializations. + bool Partial = false; + + ArgInfo(Function *F, Argument *A, Constant *C, InstructionCost G) + : Fn(F), Arg(A), Const(C), Gain(G){}; +}; +} // Anonymous namespace + +using FuncList = SmallVectorImpl<Function *>; +using ConstList = SmallVectorImpl<Constant *>; + // Helper to check if \p LV is either a constant or a constant // range with a single element. This should cover exactly the same cases as the // old ValueLatticeElement::isConstant() and is intended to be used in the @@ -169,7 +191,7 @@ static Constant *getConstantStackValue(CallInst *Call, Value *Val, // ret void // } // -static void constantArgPropagation(SmallVectorImpl<Function *> &WorkList, +static void constantArgPropagation(FuncList &WorkList, Module &M, SCCPSolver &Solver) { // Iterate over the argument tracked functions see if there // are any new constant values for the call instruction via @@ -254,40 +276,33 @@ public: /// /// \returns true if at least one function is specialized. bool - specializeFunctions(SmallVectorImpl<Function *> &FuncDecls, - SmallVectorImpl<Function *> &CurrentSpecializations) { - - // Attempt to specialize the argument-tracked functions. 
+ specializeFunctions(FuncList &FuncDecls, + FuncList &CurrentSpecializations) { bool Changed = false; for (auto *F : FuncDecls) { - if (specializeFunction(F, CurrentSpecializations)) { - Changed = true; - LLVM_DEBUG(dbgs() << "FnSpecialization: Can specialize this func.\n"); - } else { + if (!isCandidateFunction(F, CurrentSpecializations)) + continue; + + auto Cost = getSpecializationCost(F); + if (!Cost.isValid()) { LLVM_DEBUG( - dbgs() << "FnSpecialization: Cannot specialize this func.\n"); + dbgs() << "FnSpecialization: Invalid specialisation cost.\n"); + continue; } - } - for (auto *SpecializedFunc : CurrentSpecializations) { - SpecializedFuncs.insert(SpecializedFunc); - - // Initialize the state of the newly created functions, marking them - // argument-tracked and executable. - if (SpecializedFunc->hasExactDefinition() && - !SpecializedFunc->hasFnAttribute(Attribute::Naked)) - Solver.addTrackedFunction(SpecializedFunc); - Solver.addArgumentTrackedFunction(SpecializedFunc); - FuncDecls.push_back(SpecializedFunc); - Solver.markBlockExecutable(&SpecializedFunc->front()); + auto ConstArgs = calculateGains(F, Cost); + if (ConstArgs.empty()) { + LLVM_DEBUG(dbgs() << "FnSpecialization: no possible constants found\n"); + continue; + } - // Replace the function arguments for the specialized functions. - for (Argument &Arg : SpecializedFunc->args()) - if (!Arg.use_empty() && tryToReplaceWithConstant(&Arg)) - LLVM_DEBUG(dbgs() << "FnSpecialization: Replaced constant argument: " - << Arg.getName() << "\n"); + for (auto &CA : ConstArgs) { + specializeFunction(CA, CurrentSpecializations); + Changed = true; + } } + updateSpecializedFuncs(FuncDecls, CurrentSpecializations); NumFuncSpecialized += NbFunctionsSpecialized; return Changed; } @@ -333,15 +348,83 @@ private: return Clone; } - /// This function decides whether to specialize function \p F based on the - /// known constant values its arguments can take on. 
Specialization is - /// performed on the first interesting argument. Specializations based on - /// additional arguments will be evaluated on following iterations of the - /// main IPSCCP solve loop. \returns true if the function is specialized and - /// false otherwise. - bool specializeFunction(Function *F, - SmallVectorImpl<Function *> &Specializations) { + /// This function decides whether it's worthwhile to specialize function \p F + /// based on the known constant values its arguments can take on, i.e. it + /// calculates a gain and returns a list of actual arguments that are deemed + /// profitable to specialize. Specialization is performed on the first + /// interesting argument. Specializations based on additional arguments will + /// be evaluated on following iterations of the main IPSCCP solve loop. + SmallVector<ArgInfo> calculateGains(Function *F, InstructionCost Cost) { + SmallVector<ArgInfo> Worklist; + // Determine if we should specialize the function based on the values the + // argument can take on. If specialization is not profitable, we continue + // on to the next argument. + for (Argument &FormalArg : F->args()) { + LLVM_DEBUG(dbgs() << "FnSpecialization: Analysing arg: " + << FormalArg.getName() << "\n"); + // Determine if this argument is interesting. If we know the argument can + // take on any constant values, they are collected in Constants. If the + // argument can only ever equal a constant value in Constants, the + // function will be completely specialized, and the IsPartial flag will + // be set to false by isArgumentInteresting (that function only adds + // values to the Constants list that are deemed profitable). 
+ bool IsPartial = true; + SmallVector<Constant *> ActualConstArg; + if (!isArgumentInteresting(&FormalArg, ActualConstArg, IsPartial)) { + LLVM_DEBUG(dbgs() << "FnSpecialization: Argument is not interesting\n"); + continue; + } + + for (auto *ActualArg : ActualConstArg) { + InstructionCost Gain = + ForceFunctionSpecialization + ? 1 + : getSpecializationBonus(&FormalArg, ActualArg) - Cost; + if (Gain <= 0) + continue; + Worklist.push_back({F, &FormalArg, ActualArg, Gain}); + } + + if (Worklist.empty()) + continue; + + // Sort the candidates in descending order. + llvm::stable_sort(Worklist, [](const ArgInfo &L, const ArgInfo &R) { + return L.Gain > R.Gain; + }); + + // Truncate the worklist to 'MaxClonesThreshold' candidates if + // necessary. + if (Worklist.size() > MaxClonesThreshold) { + LLVM_DEBUG(dbgs() << "FnSpecialization: number of candidates exceed " + << "the maximum number of clones threshold.\n" + << "Truncating worklist to " << MaxClonesThreshold + << " candidates.\n"); + Worklist.erase(Worklist.begin() + MaxClonesThreshold, + Worklist.end()); + } + + if (IsPartial || Worklist.size() < ActualConstArg.size()) + for (auto &ActualArg : Worklist) + ActualArg.Partial = true; + + LLVM_DEBUG(dbgs() << "Sorted list of candidates by gain:\n"; + for (auto &C + : Worklist) { + dbgs() << "- Function = " << C.Fn->getName() << ", "; + dbgs() << "FormalArg = " << C.Arg->getName() << ", "; + dbgs() << "ActualArg = " << C.Const->getName() << ", "; + dbgs() << "Gain = " << C.Gain << "\n"; + }); + + // FIXME: Only one argument per function. + break; + } + return Worklist; + } + + bool isCandidateFunction(Function *F, FuncList &Specializations) { // Do not specialize the cloned function again. 
if (SpecializedFuncs.contains(F)) return false; @@ -362,84 +445,32 @@ private: LLVM_DEBUG(dbgs() << "FnSpecialization: Try function: " << F->getName() << "\n"); + return true; + } - // Determine if it would be profitable to create a specialization of the - // function where the argument takes on the given constant value. If so, - // add the constant to Constants. - auto FnSpecCost = getSpecializationCost(F); - if (!FnSpecCost.isValid()) { - LLVM_DEBUG(dbgs() << "FnSpecialization: Invalid specialisation cost.\n"); - return false; - } - - LLVM_DEBUG(dbgs() << "FnSpecialization: func specialisation cost: "; - FnSpecCost.print(dbgs()); dbgs() << "\n"); - - // Determine if we should specialize the function based on the values the - // argument can take on. If specialization is not profitable, we continue - // on to the next argument. - for (Argument &A : F->args()) { - LLVM_DEBUG(dbgs() << "FnSpecialization: Analysing arg: " << A.getName() - << "\n"); - // True if this will be a partial specialization. We will need to keep - // the original function around in addition to the added specializations. - bool IsPartial = true; - - // Determine if this argument is interesting. If we know the argument can - // take on any constant values, they are collected in Constants. If the - // argument can only ever equal a constant value in Constants, the - // function will be completely specialized, and the IsPartial flag will - // be set to false by isArgumentInteresting (that function only adds - // values to the Constants list that are deemed profitable). 
- SmallVector<Constant *, 4> Constants; - if (!isArgumentInteresting(&A, Constants, FnSpecCost, IsPartial)) { - LLVM_DEBUG(dbgs() << "FnSpecialization: Argument is not interesting\n"); - continue; - } - - assert(!Constants.empty() && "No constants on which to specialize"); - LLVM_DEBUG(dbgs() << "FnSpecialization: Argument is interesting!\n" - << "FnSpecialization: Specializing '" << F->getName() - << "' on argument: " << A << "\n" - << "FnSpecialization: Constants are:\n\n"; - for (unsigned I = 0; I < Constants.size(); ++I) dbgs() - << *Constants[I] << "\n"; - dbgs() << "FnSpecialization: End of constants\n\n"); - - // Create a version of the function in which the argument is marked - // constant with the given value. - for (auto *C : Constants) { - // Clone the function. We leave the ValueToValueMap empty to allow - // IPSCCP to propagate the constant arguments. - Function *Clone = cloneCandidateFunction(F); - Argument *ClonedArg = Clone->arg_begin() + A.getArgNo(); - - // Rewrite calls to the function so that they call the clone instead. - rewriteCallSites(F, Clone, *ClonedArg, C); + void specializeFunction(ArgInfo &AI, FuncList &Specializations) { + Function *Clone = cloneCandidateFunction(AI.Fn); + Argument *ClonedArg = Clone->getArg(AI.Arg->getArgNo()); - // Initialize the lattice state of the arguments of the function clone, - // marking the argument on which we specialized the function constant - // with the given value. - Solver.markArgInFuncSpecialization(F, ClonedArg, C); + // Rewrite calls to the function so that they call the clone instead. + rewriteCallSites(AI.Fn, Clone, *ClonedArg, AI.Const); - // Mark all the specialized functions - Specializations.push_back(Clone); - NbFunctionsSpecialized++; - } + // Initialize the lattice state of the arguments of the function clone, + // marking the argument on which we specialized the function constant + // with the given value. 
+ Solver.markArgInFuncSpecialization(AI.Fn, ClonedArg, AI.Const); - // If the function has been completely specialized, the original function - // is no longer needed. Mark it unreachable. - if (!IsPartial) - Solver.markFunctionUnreachable(F); + // Mark all the specialized functions + Specializations.push_back(Clone); + NbFunctionsSpecialized++; - // FIXME: Only one argument per function. - return true; - } - - return false; + // If the function has been completely specialized, the original function + // is no longer needed. Mark it unreachable. + if (!AI.Partial) + Solver.markFunctionUnreachable(AI.Fn); } - /// Compute the cost of specializing function \p F. + /// Compute and return the cost of specializing function \p F. InstructionCost getSpecializationCost(Function *F) { // Compute the code metrics for the function. SmallPtrSet<const Value *, 32> EphValues; @@ -578,9 +609,7 @@ private: /// /// \returns true if the function should be specialized on the given /// argument. - bool isArgumentInteresting(Argument *A, - SmallVectorImpl<Constant *> &Constants, - const InstructionCost &FnSpecCost, + bool isArgumentInteresting(Argument *A, ConstList &Constants, bool &IsPartial) { // For now, don't attempt to specialize functions based on the values of // composite types. @@ -608,42 +637,8 @@ private: // // TODO 2: this currently does not support constants, i.e. integer ranges. 
// - SmallVector<Constant *, 4> PossibleConstants; - bool AllConstant = getPossibleConstants(A, PossibleConstants); - if (PossibleConstants.empty()) { - LLVM_DEBUG(dbgs() << "FnSpecialization: no possible constants found\n"); - return false; - } - if (PossibleConstants.size() > MaxConstantsThreshold) { - LLVM_DEBUG(dbgs() << "FnSpecialization: number of constants found exceed " - << "the maximum number of constants threshold.\n"); - return false; - } - - for (auto *C : PossibleConstants) { - LLVM_DEBUG(dbgs() << "FnSpecialization: Constant: " << *C << "\n"); - if (ForceFunctionSpecialization) { - LLVM_DEBUG(dbgs() << "FnSpecialization: Forced!\n"); - Constants.push_back(C); - continue; - } - if (getSpecializationBonus(A, C) > FnSpecCost) { - LLVM_DEBUG(dbgs() << "FnSpecialization: profitable!\n"); - Constants.push_back(C); - } else { - LLVM_DEBUG(dbgs() << "FnSpecialization: not profitable\n"); - } - } - - // None of the constant values the argument can take on were deemed good - // candidates on which to specialize the function. - if (Constants.empty()) - return false; - - // This will be a partial specialization if some of the constants were - // rejected due to their profitability. - IsPartial = !AllConstant || PossibleConstants.size() != Constants.size(); - + IsPartial = !getPossibleConstants(A, Constants); + LLVM_DEBUG(dbgs() << "FnSpecialization: interesting arg: " << *A << "\n"); return true; } @@ -653,8 +648,7 @@ private: /// \returns true if all of the values the argument can take on are constant /// (e.g., the argument's parent function cannot be called with an /// overdefined value). - bool getPossibleConstants(Argument *A, - SmallVectorImpl<Constant *> &Constants) { + bool getPossibleConstants(Argument *A, ConstList &Constants) { Function *F = A->getParent(); bool AllConstant = true; @@ -681,7 +675,7 @@ private: // For now, constant expressions are fine but only if they are function // calls. 
- if (auto *CE = dyn_cast<ConstantExpr>(V)) + if (auto *CE = dyn_cast<ConstantExpr>(V)) if (!isa<Function>(CE->getOperand(0))) return false; @@ -737,6 +731,29 @@ private: } } } + + void updateSpecializedFuncs(FuncList &FuncDecls, + FuncList &CurrentSpecializations) { + for (auto *SpecializedFunc : CurrentSpecializations) { + SpecializedFuncs.insert(SpecializedFunc); + + // Initialize the state of the newly created functions, marking them + // argument-tracked and executable. + if (SpecializedFunc->hasExactDefinition() && + !SpecializedFunc->hasFnAttribute(Attribute::Naked)) + Solver.addTrackedFunction(SpecializedFunc); + + Solver.addArgumentTrackedFunction(SpecializedFunc); + FuncDecls.push_back(SpecializedFunc); + Solver.markBlockExecutable(&SpecializedFunc->front()); + + // Replace the function arguments for the specialized functions. + for (Argument &Arg : SpecializedFunc->args()) + if (!Arg.use_empty() && tryToReplaceWithConstant(&Arg)) + LLVM_DEBUG(dbgs() << "FnSpecialization: Replaced constant argument: " + << Arg.getName() << "\n"); + } + } }; } // namespace diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index ba7589c2bf60..b1f3ff15c97b 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -305,8 +305,9 @@ static bool CleanupConstantGlobalUsers(GlobalVariable *GV, else if (auto *LI = dyn_cast<LoadInst>(U)) { // A load from zeroinitializer is always zeroinitializer, regardless of // any applied offset. 
- if (Init->isNullValue()) { - LI->replaceAllUsesWith(Constant::getNullValue(LI->getType())); + Type *Ty = LI->getType(); + if (Init->isNullValue() && !Ty->isX86_MMXTy() && !Ty->isX86_AMXTy()) { + LI->replaceAllUsesWith(Constant::getNullValue(Ty)); EraseFromParent(LI); continue; } @@ -316,8 +317,7 @@ static bool CleanupConstantGlobalUsers(GlobalVariable *GV, PtrOp = PtrOp->stripAndAccumulateConstantOffsets( DL, Offset, /* AllowNonInbounds */ true); if (PtrOp == GV) { - if (auto *Value = ConstantFoldLoadFromConst(Init, LI->getType(), - Offset, DL)) { + if (auto *Value = ConstantFoldLoadFromConst(Init, Ty, Offset, DL)) { LI->replaceAllUsesWith(Value); EraseFromParent(LI); } @@ -368,8 +368,7 @@ static bool isSafeSROAGEP(User *U) { return false; } - return llvm::all_of(U->users(), - [](User *UU) { return isSafeSROAElementUse(UU); }); + return llvm::all_of(U->users(), isSafeSROAElementUse); } /// Return true if the specified instruction is a safe user of a derived diff --git a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp index 833049d6896f..a964fcde0396 100644 --- a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp +++ b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp @@ -294,7 +294,7 @@ static int getOutliningPenalty(ArrayRef<BasicBlock *> Region, // Find all incoming values from the outlining region. 
int NumIncomingVals = 0; for (unsigned i = 0; i < PN.getNumIncomingValues(); ++i) - if (find(Region, PN.getIncomingBlock(i)) != Region.end()) { + if (llvm::is_contained(Region, PN.getIncomingBlock(i))) { ++NumIncomingVals; if (NumIncomingVals > 1) { ++NumSplitExitPhis; diff --git a/llvm/lib/Transforms/IPO/Inliner.cpp b/llvm/lib/Transforms/IPO/Inliner.cpp index 992c2b292e1e..4e3689f09536 100644 --- a/llvm/lib/Transforms/IPO/Inliner.cpp +++ b/llvm/lib/Transforms/IPO/Inliner.cpp @@ -856,6 +856,8 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, if (InlineHistoryID != -1 && inlineHistoryIncludes(&Callee, InlineHistoryID, InlineHistory)) { + LLVM_DEBUG(dbgs() << "Skipping inlining due to history: " + << F.getName() << " -> " << Callee.getName() << "\n"); setInlineRemark(*CB, "recursive"); continue; } diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index f78971f0e586..c0bb19e184d6 100644 --- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -1774,8 +1774,9 @@ void LowerTypeTestsModule::replaceCfiUses(Function *Old, Value *New, bool IsJumpTableCanonical) { SmallSetVector<Constant *, 4> Constants; for (Use &U : llvm::make_early_inc_range(Old->uses())) { - // Skip block addresses - if (isa<BlockAddress>(U.getUser())) + // Skip block addresses and no_cfi values, which refer to the function + // body instead of the jump table. 
+ if (isa<BlockAddress, NoCFIValue>(U.getUser())) continue; // Skip direct calls to externally defined or non-dso_local functions @@ -1802,7 +1803,7 @@ void LowerTypeTestsModule::replaceCfiUses(Function *Old, Value *New, } void LowerTypeTestsModule::replaceDirectCalls(Value *Old, Value *New) { - Old->replaceUsesWithIf(New, [](Use &U) { return isDirectCall(U); }); + Old->replaceUsesWithIf(New, isDirectCall); } bool LowerTypeTestsModule::lower() { diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index 055ee6b50296..f289e3ecc979 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -3964,6 +3964,9 @@ struct AAKernelInfoCallSite : AAKernelInfo { case OMPRTL___kmpc_master: case OMPRTL___kmpc_end_master: case OMPRTL___kmpc_barrier: + case OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2: + case OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2: + case OMPRTL___kmpc_nvptx_end_reduce_nowait: break; case OMPRTL___kmpc_distribute_static_init_4: case OMPRTL___kmpc_distribute_static_init_4u: @@ -4010,6 +4013,7 @@ struct AAKernelInfoCallSite : AAKernelInfo { break; case OMPRTL___kmpc_omp_task: // We do not look into tasks right now, just give up. + SPMDCompatibilityTracker.indicatePessimisticFixpoint(); SPMDCompatibilityTracker.insert(&CB); ReachedUnknownParallelRegions.insert(&CB); break; @@ -4020,6 +4024,7 @@ struct AAKernelInfoCallSite : AAKernelInfo { default: // Unknown OpenMP runtime calls cannot be executed in SPMD-mode, // generally. However, they do not hide parallel regions. 
+ SPMDCompatibilityTracker.indicatePessimisticFixpoint(); SPMDCompatibilityTracker.insert(&CB); break; } @@ -4079,6 +4084,7 @@ struct AAKernelInfoCallSite : AAKernelInfo { SPMDCompatibilityTracker.insert(&CB); break; default: + SPMDCompatibilityTracker.indicatePessimisticFixpoint(); SPMDCompatibilityTracker.insert(&CB); } diff --git a/llvm/lib/Transforms/IPO/SampleContextTracker.cpp b/llvm/lib/Transforms/IPO/SampleContextTracker.cpp index bae9a1e27e75..7334bf695b67 100644 --- a/llvm/lib/Transforms/IPO/SampleContextTracker.cpp +++ b/llvm/lib/Transforms/IPO/SampleContextTracker.cpp @@ -32,7 +32,7 @@ ContextTrieNode *ContextTrieNode::getChildContext(const LineLocation &CallSite, if (CalleeName.empty()) return getHottestChildContext(CallSite); - uint64_t Hash = nodeHash(CalleeName, CallSite); + uint64_t Hash = FunctionSamples::getCallSiteHash(CalleeName, CallSite); auto It = AllChildContext.find(Hash); if (It != AllChildContext.end()) return &It->second; @@ -65,7 +65,8 @@ ContextTrieNode::getHottestChildContext(const LineLocation &CallSite) { ContextTrieNode &ContextTrieNode::moveToChildContext( const LineLocation &CallSite, ContextTrieNode &&NodeToMove, uint32_t ContextFramesToRemove, bool DeleteNode) { - uint64_t Hash = nodeHash(NodeToMove.getFuncName(), CallSite); + uint64_t Hash = + FunctionSamples::getCallSiteHash(NodeToMove.getFuncName(), CallSite); assert(!AllChildContext.count(Hash) && "Node to remove must exist"); LineLocation OldCallSite = NodeToMove.CallSiteLoc; ContextTrieNode &OldParentContext = *NodeToMove.getParentContext(); @@ -108,7 +109,7 @@ ContextTrieNode &ContextTrieNode::moveToChildContext( void ContextTrieNode::removeChildContext(const LineLocation &CallSite, StringRef CalleeName) { - uint64_t Hash = nodeHash(CalleeName, CallSite); + uint64_t Hash = FunctionSamples::getCallSiteHash(CalleeName, CallSite); // Note this essentially calls dtor and destroys that child context AllChildContext.erase(Hash); } @@ -174,21 +175,9 @@ void 
ContextTrieNode::dumpTree() { } } -uint64_t ContextTrieNode::nodeHash(StringRef ChildName, - const LineLocation &Callsite) { - // We still use child's name for child hash, this is - // because for children of root node, we don't have - // different line/discriminator, and we'll rely on name - // to differentiate children. - uint64_t NameHash = std::hash<std::string>{}(ChildName.str()); - uint64_t LocId = - (((uint64_t)Callsite.LineOffset) << 32) | Callsite.Discriminator; - return NameHash + (LocId << 5) + LocId; -} - ContextTrieNode *ContextTrieNode::getOrCreateChildContext( const LineLocation &CallSite, StringRef CalleeName, bool AllowCreate) { - uint64_t Hash = nodeHash(CalleeName, CallSite); + uint64_t Hash = FunctionSamples::getCallSiteHash(CalleeName, CallSite); auto It = AllChildContext.find(Hash); if (It != AllChildContext.end()) { assert(It->second.getFuncName() == CalleeName && diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp index b8fac9d47763..bc6051de90c4 100644 --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -467,6 +467,9 @@ protected: void emitOptimizationRemarksForInlineCandidates( const SmallVectorImpl<CallBase *> &Candidates, const Function &F, bool Hot); + void promoteMergeNotInlinedContextSamples( + DenseMap<CallBase *, const FunctionSamples *> NonInlinedCallSites, + const Function &F); std::vector<Function *> buildFunctionOrder(Module &M, CallGraph *CG); std::unique_ptr<ProfiledCallGraph> buildProfiledCallGraph(CallGraph &CG); void generateMDProfMetadata(Function &F); @@ -485,7 +488,7 @@ protected: std::unique_ptr<SampleContextTracker> ContextTracker; /// Flag indicating whether input profile is context-sensitive - bool ProfileIsCS = false; + bool ProfileIsCSFlat = false; /// Flag indicating which LTO/ThinLTO phase the pass is invoked in. 
/// @@ -602,7 +605,7 @@ ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) { // call instruction should have 0 count. // For CS profile, the callsite count of previously inlined callees is // populated with the entry count of the callees. - if (!ProfileIsCS) + if (!ProfileIsCSFlat) if (const auto *CB = dyn_cast<CallBase>(&Inst)) if (!CB->isIndirectCall() && findCalleeFunctionSamples(*CB)) return 0; @@ -641,7 +644,7 @@ ErrorOr<uint64_t> SampleProfileLoader::getProbeWeight(const Instruction &Inst) { // call instruction should have 0 count. // For CS profile, the callsite count of previously inlined callees is // populated with the entry count of the callees. - if (!ProfileIsCS) + if (!ProfileIsCSFlat) if (const auto *CB = dyn_cast<CallBase>(&Inst)) if (!CB->isIndirectCall() && findCalleeFunctionSamples(*CB)) return 0; @@ -695,7 +698,7 @@ SampleProfileLoader::findCalleeFunctionSamples(const CallBase &Inst) const { if (Function *Callee = Inst.getCalledFunction()) CalleeName = Callee->getName(); - if (ProfileIsCS) + if (ProfileIsCSFlat) return ContextTracker->getCalleeContextSamplesFor(Inst, CalleeName); const FunctionSamples *FS = findFunctionSamples(Inst); @@ -727,7 +730,7 @@ SampleProfileLoader::findIndirectCallFunctionSamples( FunctionSamples::getGUID(R->getName()); }; - if (ProfileIsCS) { + if (ProfileIsCSFlat) { auto CalleeSamples = ContextTracker->getIndirectCalleeContextSamplesFor(DIL); if (CalleeSamples.empty()) @@ -780,7 +783,7 @@ SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const { auto it = DILocation2SampleMap.try_emplace(DIL,nullptr); if (it.second) { - if (ProfileIsCS) + if (ProfileIsCSFlat) it.first->second = ContextTracker->getContextSamplesFor(DIL); else it.first->second = @@ -1039,7 +1042,7 @@ void SampleProfileLoader::findExternalInlineCandidate( // For AutoFDO profile, retrieve candidate profiles by walking over // the nested inlinee profiles. 
- if (!ProfileIsCS) { + if (!ProfileIsCSFlat) { Samples->findInlinedFunctions(InlinedGUIDs, SymbolMap, Threshold); return; } @@ -1134,7 +1137,7 @@ bool SampleProfileLoader::inlineHotFunctions( assert((!FunctionSamples::UseMD5 || FS->GUIDToFuncNameMap) && "GUIDToFuncNameMap has to be populated"); AllCandidates.push_back(CB); - if (FS->getEntrySamples() > 0 || ProfileIsCS) + if (FS->getEntrySamples() > 0 || ProfileIsCSFlat) LocalNotInlinedCallSites.try_emplace(CB, FS); if (callsiteIsHot(FS, PSI, ProfAccForSymsInList)) Hot = true; @@ -1156,11 +1159,9 @@ bool SampleProfileLoader::inlineHotFunctions( } for (CallBase *I : CIS) { Function *CalledFunction = I->getCalledFunction(); - InlineCandidate Candidate = { - I, - LocalNotInlinedCallSites.count(I) ? LocalNotInlinedCallSites[I] - : nullptr, - 0 /* dummy count */, 1.0 /* dummy distribution factor */}; + InlineCandidate Candidate = {I, LocalNotInlinedCallSites.lookup(I), + 0 /* dummy count */, + 1.0 /* dummy distribution factor */}; // Do not inline recursive calls. 
if (CalledFunction == &F) continue; @@ -1198,53 +1199,9 @@ bool SampleProfileLoader::inlineHotFunctions( } // For CS profile, profile for not inlined context will be merged when - // base profile is being trieved - if (ProfileIsCS) - return Changed; - - // Accumulate not inlined callsite information into notInlinedSamples - for (const auto &Pair : LocalNotInlinedCallSites) { - CallBase *I = Pair.getFirst(); - Function *Callee = I->getCalledFunction(); - if (!Callee || Callee->isDeclaration()) - continue; - - ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "NotInline", - I->getDebugLoc(), I->getParent()) - << "previous inlining not repeated: '" - << ore::NV("Callee", Callee) << "' into '" - << ore::NV("Caller", &F) << "'"); - - ++NumCSNotInlined; - const FunctionSamples *FS = Pair.getSecond(); - if (FS->getTotalSamples() == 0 && FS->getEntrySamples() == 0) { - continue; - } - - if (ProfileMergeInlinee) { - // A function call can be replicated by optimizations like callsite - // splitting or jump threading and the replicates end up sharing the - // sample nested callee profile instead of slicing the original inlinee's - // profile. We want to do merge exactly once by filtering out callee - // profiles with a non-zero head sample count. - if (FS->getHeadSamples() == 0) { - // Use entry samples as head samples during the merge, as inlinees - // don't have head samples. - const_cast<FunctionSamples *>(FS)->addHeadSamples( - FS->getEntrySamples()); - - // Note that we have to do the merge right after processing function. - // This allows OutlineFS's profile to be used for annotation during - // top-down processing of functions' annotation. - FunctionSamples *OutlineFS = Reader->getOrCreateSamplesFor(*Callee); - OutlineFS->merge(*FS); - } - } else { - auto pair = - notInlinedCallInfo.try_emplace(Callee, NotInlinedProfileInfo{0}); - pair.first->second.entryCount += FS->getEntrySamples(); - } - } + // base profile is being retrieved. 
+ if (!FunctionSamples::ProfileIsCSFlat) + promoteMergeNotInlinedContextSamples(LocalNotInlinedCallSites, F); return Changed; } @@ -1285,7 +1242,7 @@ bool SampleProfileLoader::tryInlineCandidate( InlinedCallSites->push_back(I); } - if (ProfileIsCS) + if (ProfileIsCSFlat) ContextTracker->markContextSamplesInlined(Candidate.CalleeSamples); ++NumCSInlined; @@ -1430,7 +1387,6 @@ SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) { bool SampleProfileLoader::inlineHotFunctionsWithPriority( Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) { - assert(ProfileIsCS && "Prioritiy based inliner only works with CSSPGO now"); // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure // Profile symbol list is ignored when profile-sample-accurate is on. @@ -1467,6 +1423,8 @@ bool SampleProfileLoader::inlineHotFunctionsWithPriority( if (ExternalInlineAdvisor) SizeLimit = std::numeric_limits<unsigned>::max(); + DenseMap<CallBase *, const FunctionSamples *> LocalNotInlinedCallSites; + // Perform iterative BFS call site prioritized inlining bool Changed = false; while (!CQueue.empty() && F.getInstructionCount() < SizeLimit) { @@ -1521,6 +1479,8 @@ bool SampleProfileLoader::inlineHotFunctionsWithPriority( } ICPCount++; Changed = true; + } else if (!ContextTracker) { + LocalNotInlinedCallSites.try_emplace(I, FS); } } } else if (CalledFunction && CalledFunction->getSubprogram() && @@ -1532,6 +1492,8 @@ bool SampleProfileLoader::inlineHotFunctionsWithPriority( CQueue.emplace(NewCandidate); } Changed = true; + } else if (!ContextTracker) { + LocalNotInlinedCallSites.try_emplace(I, Candidate.CalleeSamples); } } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) { findExternalInlineCandidate(I, findCalleeFunctionSamples(*I), @@ -1549,9 +1511,63 @@ bool SampleProfileLoader::inlineHotFunctionsWithPriority( ++NumCSInlinedHitGrowthLimit; } + // For CS profile, profile for not inlined context will be merged when + // base profile is being 
retrieved. + if (!FunctionSamples::ProfileIsCSFlat) + promoteMergeNotInlinedContextSamples(LocalNotInlinedCallSites, F); return Changed; } +void SampleProfileLoader::promoteMergeNotInlinedContextSamples( + DenseMap<CallBase *, const FunctionSamples *> NonInlinedCallSites, + const Function &F) { + // Accumulate not inlined callsite information into notInlinedSamples + for (const auto &Pair : NonInlinedCallSites) { + CallBase *I = Pair.getFirst(); + Function *Callee = I->getCalledFunction(); + if (!Callee || Callee->isDeclaration()) + continue; + + ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "NotInline", + I->getDebugLoc(), I->getParent()) + << "previous inlining not repeated: '" + << ore::NV("Callee", Callee) << "' into '" + << ore::NV("Caller", &F) << "'"); + + ++NumCSNotInlined; + const FunctionSamples *FS = Pair.getSecond(); + if (FS->getTotalSamples() == 0 && FS->getEntrySamples() == 0) { + continue; + } + + if (ProfileMergeInlinee) { + // A function call can be replicated by optimizations like callsite + // splitting or jump threading and the replicates end up sharing the + // sample nested callee profile instead of slicing the original + // inlinee's profile. We want to do merge exactly once by filtering out + // callee profiles with a non-zero head sample count. + if (FS->getHeadSamples() == 0) { + // Use entry samples as head samples during the merge, as inlinees + // don't have head samples. + const_cast<FunctionSamples *>(FS)->addHeadSamples( + FS->getEntrySamples()); + + // Note that we have to do the merge right after processing function. + // This allows OutlineFS's profile to be used for annotation during + // top-down processing of functions' annotation. + FunctionSamples *OutlineFS = Reader->getOrCreateSamplesFor(*Callee); + OutlineFS->merge(*FS, 1); + // Set outlined profile to be synthetic to not bias the inliner. 
+ OutlineFS->SetContextSynthetic(); + } + } else { + auto pair = + notInlinedCallInfo.try_emplace(Callee, NotInlinedProfileInfo{0}); + pair.first->second.entryCount += FS->getEntrySamples(); + } + } +} + /// Returns the sorted CallTargetMap \p M by count in descending order. static SmallVector<InstrProfValueData, 2> GetSortedValueDataFromCallTargets(const SampleRecord::CallTargetMap &M) { @@ -1607,7 +1623,7 @@ void SampleProfileLoader::generateMDProfMetadata(Function &F) { // With CSSPGO all indirect call targets are counted torwards the // original indirect call site in the profile, including both // inlined and non-inlined targets. - if (!FunctionSamples::ProfileIsCS) { + if (!FunctionSamples::ProfileIsCSFlat) { if (const FunctionSamplesMap *M = FS->findFunctionSamplesMapAt(CallSite)) { for (const auto &NameFS : *M) @@ -1754,7 +1770,7 @@ bool SampleProfileLoader::emitAnnotations(Function &F) { } DenseSet<GlobalValue::GUID> InlinedGUIDs; - if (ProfileIsCS && CallsitePrioritizedInline) + if (CallsitePrioritizedInline) Changed |= inlineHotFunctionsWithPriority(F, InlinedGUIDs); else Changed |= inlineHotFunctions(F, InlinedGUIDs); @@ -1782,7 +1798,7 @@ INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass, "sample-profile", std::unique_ptr<ProfiledCallGraph> SampleProfileLoader::buildProfiledCallGraph(CallGraph &CG) { std::unique_ptr<ProfiledCallGraph> ProfiledCG; - if (ProfileIsCS) + if (ProfileIsCSFlat) ProfiledCG = std::make_unique<ProfiledCallGraph>(*ContextTracker); else ProfiledCG = std::make_unique<ProfiledCallGraph>(Reader->getProfiles()); @@ -1828,7 +1844,7 @@ SampleProfileLoader::buildFunctionOrder(Module &M, CallGraph *CG) { assert(&CG->getModule() == &M); if (UseProfiledCallGraph || - (ProfileIsCS && !UseProfiledCallGraph.getNumOccurrences())) { + (ProfileIsCSFlat && !UseProfiledCallGraph.getNumOccurrences())) { // Use profiled call edges to augment the top-down order. 
There are cases // that the top-down order computed based on the static call graph doesn't // reflect real execution order. For example @@ -1961,10 +1977,8 @@ bool SampleProfileLoader::doInitialization(Module &M, } // Apply tweaks if context-sensitive profile is available. - if (Reader->profileIsCS()) { - ProfileIsCS = true; - FunctionSamples::ProfileIsCS = true; - + if (Reader->profileIsCSFlat() || Reader->profileIsCSNested()) { + ProfileIsCSFlat = Reader->profileIsCSFlat(); // Enable priority-base inliner and size inline by default for CSSPGO. if (!ProfileSizeInline.getNumOccurrences()) ProfileSizeInline = true; @@ -1982,10 +1996,15 @@ bool SampleProfileLoader::doInitialization(Module &M, // Enable iterative-BFI by default for CSSPGO. if (!UseIterativeBFIInference.getNumOccurrences()) UseIterativeBFIInference = true; + // Enable Profi by default for CSSPGO. + if (!SampleProfileUseProfi.getNumOccurrences()) + SampleProfileUseProfi = true; - // Tracker for profiles under different context - ContextTracker = std::make_unique<SampleContextTracker>( - Reader->getProfiles(), &GUIDToFuncNameMap); + if (FunctionSamples::ProfileIsCSFlat) { + // Tracker for profiles under different context + ContextTracker = std::make_unique<SampleContextTracker>( + Reader->getProfiles(), &GUIDToFuncNameMap); + } } // Load pseudo probe descriptors for probe-based function samples. @@ -1994,7 +2013,8 @@ bool SampleProfileLoader::doInitialization(Module &M, if (!ProbeManager->moduleIsProbed(M)) { const char *Msg = "Pseudo-probe-based profile requires SampleProfileProbePass"; - Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg)); + Ctx.diagnose(DiagnosticInfoSampleProfile(M.getModuleIdentifier(), Msg, + DS_Warning)); return false; } } @@ -2062,7 +2082,7 @@ bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM, } // Account for cold calls not inlined.... 
- if (!ProfileIsCS) + if (!ProfileIsCSFlat) for (const std::pair<Function *, NotInlinedProfileInfo> &pair : notInlinedCallInfo) updateProfileCallee(pair.first, pair.second.entryCount); @@ -2138,7 +2158,7 @@ bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) ORE = OwnedORE.get(); } - if (ProfileIsCS) + if (ProfileIsCSFlat) Samples = ContextTracker->getBaseSamplesFor(F); else Samples = Reader->getSamplesFor(F); diff --git a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp index 0cc1b37844f6..daaf6cbeb3fd 100644 --- a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp +++ b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp @@ -87,7 +87,8 @@ void promoteInternals(Module &ExportM, Module &ImportM, StringRef ModuleId, if (isa<Function>(&ExportGV) && allowPromotionAlias(OldName)) { // Create a local alias with the original name to avoid breaking // references from inline assembly. - std::string Alias = ".set " + OldName + "," + NewName + "\n"; + std::string Alias = + ".lto_set_conditional " + OldName + "," + NewName + "\n"; ExportM.appendModuleInlineAsm(Alias); } } diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index 61054e7ae46f..6acace1d9fd4 100644 --- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -359,6 +359,36 @@ template <> struct DenseMapInfo<VTableSlotSummary> { namespace { +// Returns true if the function must be unreachable based on ValueInfo. +// +// In particular, identifies a function as unreachable in the following +// conditions +// 1) All summaries are live. +// 2) All function summaries indicate it's unreachable +bool mustBeUnreachableFunction(ValueInfo TheFnVI) { + if ((!TheFnVI) || TheFnVI.getSummaryList().empty()) { + // Returns false if ValueInfo is absent, or the summary list is empty + // (e.g., function declarations). 
+ return false; + } + + for (auto &Summary : TheFnVI.getSummaryList()) { + // Conservatively returns false if any non-live functions are seen. + // In general either all summaries should be live or all should be dead. + if (!Summary->isLive()) + return false; + if (auto *FS = dyn_cast<FunctionSummary>(Summary.get())) { + if (!FS->fflags().MustBeUnreachable) + return false; + } + // Do nothing if a non-function has the same GUID (which is rare). + // This is correct since non-function summaries are not relevant. + } + // All function summaries are live and all of them agree that the function is + // unreachble. + return true; +} + // A virtual call site. VTable is the loaded virtual table pointer, and CS is // the indirect virtual call. struct VirtualCallSite { @@ -562,10 +592,12 @@ struct DevirtModule { void buildTypeIdentifierMap( std::vector<VTableBits> &Bits, DenseMap<Metadata *, std::set<TypeMemberInfo>> &TypeIdMap); + bool tryFindVirtualCallTargets(std::vector<VirtualCallTarget> &TargetsForSlot, const std::set<TypeMemberInfo> &TypeMemberInfos, - uint64_t ByteOffset); + uint64_t ByteOffset, + ModuleSummaryIndex *ExportSummary); void applySingleImplDevirt(VTableSlotInfo &SlotInfo, Constant *TheFn, bool &IsExported); @@ -640,6 +672,23 @@ struct DevirtModule { bool run(); + // Look up the corresponding ValueInfo entry of `TheFn` in `ExportSummary`. + // + // Caller guarantees that `ExportSummary` is not nullptr. + static ValueInfo lookUpFunctionValueInfo(Function *TheFn, + ModuleSummaryIndex *ExportSummary); + + // Returns true if the function definition must be unreachable. + // + // Note if this helper function returns true, `F` is guaranteed + // to be unreachable; if it returns false, `F` might still + // be unreachable but not covered by this helper function. + // + // Implementation-wise, if function definition is present, IR is analyzed; if + // not, look up function flags from ExportSummary as a fallback. 
+ static bool mustBeUnreachableFunction(Function *const F, + ModuleSummaryIndex *ExportSummary); + // Lower the module using the action and summary passed as command line // arguments. For testing purposes only. static bool @@ -969,7 +1018,8 @@ void DevirtModule::buildTypeIdentifierMap( bool DevirtModule::tryFindVirtualCallTargets( std::vector<VirtualCallTarget> &TargetsForSlot, - const std::set<TypeMemberInfo> &TypeMemberInfos, uint64_t ByteOffset) { + const std::set<TypeMemberInfo> &TypeMemberInfos, uint64_t ByteOffset, + ModuleSummaryIndex *ExportSummary) { for (const TypeMemberInfo &TM : TypeMemberInfos) { if (!TM.Bits->GV->isConstant()) return false; @@ -997,6 +1047,11 @@ bool DevirtModule::tryFindVirtualCallTargets( if (Fn->getName() == "__cxa_pure_virtual") continue; + // We can disregard unreachable functions as possible call targets, as + // unreachable functions shouldn't be called. + if (mustBeUnreachableFunction(Fn, ExportSummary)) + continue; + TargetsForSlot.push_back({Fn, &TM}); } @@ -1053,6 +1108,9 @@ bool DevirtIndex::tryFindVirtualCallTargets( if (VTP.VTableOffset != P.AddressPointOffset + ByteOffset) continue; + if (mustBeUnreachableFunction(VTP.FuncVI)) + continue; + TargetsForSlot.push_back(VTP.FuncVI); } } @@ -1744,7 +1802,7 @@ void DevirtModule::rebuildGlobal(VTableBits &B) { GlobalVariable::PrivateLinkage, NewInit, "", B.GV); NewGV->setSection(B.GV->getSection()); NewGV->setComdat(B.GV->getComdat()); - NewGV->setAlignment(MaybeAlign(B.GV->getAlignment())); + NewGV->setAlignment(B.GV->getAlign()); // Copy the original vtable's metadata to the anonymous global, adjusting // offsets as required. 
@@ -2014,6 +2072,44 @@ void DevirtModule::removeRedundantTypeTests() { } } +ValueInfo +DevirtModule::lookUpFunctionValueInfo(Function *TheFn, + ModuleSummaryIndex *ExportSummary) { + assert((ExportSummary != nullptr) && + "Caller guarantees ExportSummary is not nullptr"); + + const auto TheFnGUID = TheFn->getGUID(); + const auto TheFnGUIDWithExportedName = GlobalValue::getGUID(TheFn->getName()); + // Look up ValueInfo with the GUID in the current linkage. + ValueInfo TheFnVI = ExportSummary->getValueInfo(TheFnGUID); + // If no entry is found and GUID is different from GUID computed using + // exported name, look up ValueInfo with the exported name unconditionally. + // This is a fallback. + // + // The reason to have a fallback: + // 1. LTO could enable global value internalization via + // `enable-lto-internalization`. + // 2. The GUID in ExportedSummary is computed using exported name. + if ((!TheFnVI) && (TheFnGUID != TheFnGUIDWithExportedName)) { + TheFnVI = ExportSummary->getValueInfo(TheFnGUIDWithExportedName); + } + return TheFnVI; +} + +bool DevirtModule::mustBeUnreachableFunction( + Function *const F, ModuleSummaryIndex *ExportSummary) { + // First, learn unreachability by analyzing function IR. + if (!F->isDeclaration()) { + // A function must be unreachable if its entry block ends with an + // 'unreachable'. + return isa<UnreachableInst>(F->getEntryBlock().getTerminator()); + } + // Learn unreachability from ExportSummary if ExportSummary is present. + return ExportSummary && + ::mustBeUnreachableFunction( + DevirtModule::lookUpFunctionValueInfo(F, ExportSummary)); +} + bool DevirtModule::run() { // If only some of the modules were split, we cannot correctly perform // this transformation. 
We already checked for the presense of type tests @@ -2137,7 +2233,7 @@ bool DevirtModule::run() { cast<MDString>(S.first.TypeID)->getString()) .WPDRes[S.first.ByteOffset]; if (tryFindVirtualCallTargets(TargetsForSlot, TypeMemberInfos, - S.first.ByteOffset)) { + S.first.ByteOffset, ExportSummary)) { if (!trySingleImplDevirt(ExportSummary, TargetsForSlot, S.second, Res)) { DidVirtualConstProp |= diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index dc55b5a31596..de1034c910d5 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -1795,6 +1795,55 @@ static Instruction *foldComplexAndOrPatterns(BinaryOperator &I, } } + // (~A & B & C) | ... --> ... + // (~A | B | C) | ... --> ... + // TODO: One use checks are conservative. We just need to check that a total + // number of multiple used values does not exceed reduction + // in operations. + if (match(Op0, + m_OneUse(m_c_BinOp(FlippedOpcode, + m_BinOp(FlippedOpcode, m_Value(B), m_Value(C)), + m_CombineAnd(m_Value(X), m_Not(m_Value(A)))))) || + match(Op0, m_OneUse(m_c_BinOp( + FlippedOpcode, + m_c_BinOp(FlippedOpcode, m_Value(C), + m_CombineAnd(m_Value(X), m_Not(m_Value(A)))), + m_Value(B))))) { + // X = ~A + // (~A & B & C) | ~(A | B | C) --> ~(A | (B ^ C)) + // (~A | B | C) & ~(A & B & C) --> (~A | (B ^ C)) + if (match(Op1, m_OneUse(m_Not(m_c_BinOp( + Opcode, m_c_BinOp(Opcode, m_Specific(A), m_Specific(B)), + m_Specific(C))))) || + match(Op1, m_OneUse(m_Not(m_c_BinOp( + Opcode, m_c_BinOp(Opcode, m_Specific(B), m_Specific(C)), + m_Specific(A))))) || + match(Op1, m_OneUse(m_Not(m_c_BinOp( + Opcode, m_c_BinOp(Opcode, m_Specific(A), m_Specific(C)), + m_Specific(B)))))) { + Value *Xor = Builder.CreateXor(B, C); + return (Opcode == Instruction::Or) + ? 
BinaryOperator::CreateNot(Builder.CreateOr(Xor, A)) + : BinaryOperator::CreateOr(Xor, X); + } + + // (~A & B & C) | ~(A | B) --> (C | ~B) & ~A + // (~A | B | C) & ~(A & B) --> (C & ~B) | ~A + if (match(Op1, m_OneUse(m_Not(m_OneUse( + m_c_BinOp(Opcode, m_Specific(A), m_Specific(B))))))) + return BinaryOperator::Create( + FlippedOpcode, Builder.CreateBinOp(Opcode, C, Builder.CreateNot(B)), + X); + + // (~A & B & C) | ~(A | C) --> (B | ~C) & ~A + // (~A | B | C) & ~(A & C) --> (B & ~C) | ~A + if (match(Op1, m_OneUse(m_Not(m_OneUse( + m_c_BinOp(Opcode, m_Specific(A), m_Specific(C))))))) + return BinaryOperator::Create( + FlippedOpcode, Builder.CreateBinOp(Opcode, B, Builder.CreateNot(C)), + X); + } + return nullptr; } @@ -2102,6 +2151,15 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) { Value *Cmp = Builder.CreateICmpSLT(X, Zero, "isneg"); return SelectInst::Create(Cmp, Y, Zero); } + // If there's a 'not' of the shifted value, swap the select operands: + // ~(iN X s>> (N-1)) & Y --> (X s< 0) ? 0 : Y + if (match(&I, m_c_And(m_OneUse(m_Not( + m_AShr(m_Value(X), m_SpecificInt(FullShift)))), + m_Value(Y)))) { + Constant *Zero = ConstantInt::getNullValue(Ty); + Value *Cmp = Builder.CreateICmpSLT(X, Zero, "isneg"); + return SelectInst::Create(Cmp, Zero, Y); + } // (~x) & y --> ~(x | (~y)) iff that gets rid of inversions if (sinkNotIntoOtherHandOfAndOrOr(I)) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 7da2669e1d13..14427bd1f2f4 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -2472,6 +2472,12 @@ static bool isSafeToEliminateVarargsCast(const CallBase &Call, Instruction *InstCombinerImpl::tryOptimizeCall(CallInst *CI) { if (!CI->getCalledFunction()) return nullptr; + // Skip optimizing notail and musttail calls so + // LibCallSimplifier::optimizeCall doesn't have to preserve those invariants. 
+ // LibCallSimplifier::optimizeCall should try to preseve tail calls though. + if (CI->isMustTailCall() || CI->isNoTailCall()) + return nullptr; + auto InstCombineRAUW = [this](Instruction *From, Value *With) { replaceInstUsesWith(*From, With); }; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 33f217659c01..8df4a4529f47 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -157,7 +157,7 @@ Instruction *InstCombinerImpl::PromoteCastOfAllocation(BitCastInst &CI, Amt = Builder.CreateAdd(Amt, Off); } - AllocaInst *New = Builder.CreateAlloca(CastElTy, Amt); + AllocaInst *New = Builder.CreateAlloca(CastElTy, AI.getAddressSpace(), Amt); New->setAlignment(AI.getAlign()); New->takeName(&AI); New->setUsedWithInAlloca(AI.isUsedWithInAlloca()); @@ -965,13 +965,13 @@ Instruction *InstCombinerImpl::visitTrunc(TruncInst &Trunc) { if (match(Src, m_VScale(DL))) { if (Trunc.getFunction() && Trunc.getFunction()->hasFnAttribute(Attribute::VScaleRange)) { - unsigned MaxVScale = Trunc.getFunction() - ->getFnAttribute(Attribute::VScaleRange) - .getVScaleRangeArgs() - .second; - if (MaxVScale > 0 && Log2_32(MaxVScale) < DestWidth) { - Value *VScale = Builder.CreateVScale(ConstantInt::get(DestTy, 1)); - return replaceInstUsesWith(Trunc, VScale); + Attribute Attr = + Trunc.getFunction()->getFnAttribute(Attribute::VScaleRange); + if (Optional<unsigned> MaxVScale = Attr.getVScaleRangeMax()) { + if (Log2_32(MaxVScale.getValue()) < DestWidth) { + Value *VScale = Builder.CreateVScale(ConstantInt::get(DestTy, 1)); + return replaceInstUsesWith(Trunc, VScale); + } } } } @@ -1337,14 +1337,13 @@ Instruction *InstCombinerImpl::visitZExt(ZExtInst &CI) { if (match(Src, m_VScale(DL))) { if (CI.getFunction() && CI.getFunction()->hasFnAttribute(Attribute::VScaleRange)) { - unsigned MaxVScale = CI.getFunction() - ->getFnAttribute(Attribute::VScaleRange) - 
.getVScaleRangeArgs() - .second; - unsigned TypeWidth = Src->getType()->getScalarSizeInBits(); - if (MaxVScale > 0 && Log2_32(MaxVScale) < TypeWidth) { - Value *VScale = Builder.CreateVScale(ConstantInt::get(DestTy, 1)); - return replaceInstUsesWith(CI, VScale); + Attribute Attr = CI.getFunction()->getFnAttribute(Attribute::VScaleRange); + if (Optional<unsigned> MaxVScale = Attr.getVScaleRangeMax()) { + unsigned TypeWidth = Src->getType()->getScalarSizeInBits(); + if (Log2_32(MaxVScale.getValue()) < TypeWidth) { + Value *VScale = Builder.CreateVScale(ConstantInt::get(DestTy, 1)); + return replaceInstUsesWith(CI, VScale); + } } } } @@ -1608,13 +1607,12 @@ Instruction *InstCombinerImpl::visitSExt(SExtInst &CI) { if (match(Src, m_VScale(DL))) { if (CI.getFunction() && CI.getFunction()->hasFnAttribute(Attribute::VScaleRange)) { - unsigned MaxVScale = CI.getFunction() - ->getFnAttribute(Attribute::VScaleRange) - .getVScaleRangeArgs() - .second; - if (MaxVScale > 0 && Log2_32(MaxVScale) < (SrcBitSize - 1)) { - Value *VScale = Builder.CreateVScale(ConstantInt::get(DestTy, 1)); - return replaceInstUsesWith(CI, VScale); + Attribute Attr = CI.getFunction()->getFnAttribute(Attribute::VScaleRange); + if (Optional<unsigned> MaxVScale = Attr.getVScaleRangeMax()) { + if (Log2_32(MaxVScale.getValue()) < (SrcBitSize - 1)) { + Value *VScale = Builder.CreateVScale(ConstantInt::get(DestTy, 1)); + return replaceInstUsesWith(CI, VScale); + } } } } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 20c75188ec9f..39b55b028110 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -600,6 +600,7 @@ public: /// Canonicalize the position of binops relative to shufflevector. 
Instruction *foldVectorBinop(BinaryOperator &Inst); Instruction *foldVectorSelect(SelectInst &Sel); + Instruction *foldSelectShuffle(ShuffleVectorInst &Shuf); /// Given a binary operator, cast instruction, or select which has a PHI node /// as operand #0, see if we can fold the instruction into the PHI (which is diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 79a8a065d02a..0dbfdba353c4 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -163,7 +163,7 @@ static bool isDereferenceableForAllocaSize(const Value *V, const AllocaInst *AI, uint64_t AllocaSize = DL.getTypeStoreSize(AI->getAllocatedType()); if (!AllocaSize) return false; - return isDereferenceableAndAlignedPointer(V, Align(AI->getAlignment()), + return isDereferenceableAndAlignedPointer(V, AI->getAlign(), APInt(64, AllocaSize), DL); } @@ -183,7 +183,8 @@ static Instruction *simplifyAllocaArraySize(InstCombinerImpl &IC, if (const ConstantInt *C = dyn_cast<ConstantInt>(AI.getArraySize())) { if (C->getValue().getActiveBits() <= 64) { Type *NewTy = ArrayType::get(AI.getAllocatedType(), C->getZExtValue()); - AllocaInst *New = IC.Builder.CreateAlloca(NewTy, nullptr, AI.getName()); + AllocaInst *New = IC.Builder.CreateAlloca(NewTy, AI.getAddressSpace(), + nullptr, AI.getName()); New->setAlignment(AI.getAlign()); // Scan to the end of the allocation instructions, to skip over a block of @@ -199,21 +200,13 @@ static Instruction *simplifyAllocaArraySize(InstCombinerImpl &IC, Type *IdxTy = IC.getDataLayout().getIntPtrType(AI.getType()); Value *NullIdx = Constant::getNullValue(IdxTy); Value *Idx[2] = {NullIdx, NullIdx}; - Instruction *NewI = GetElementPtrInst::CreateInBounds( + Instruction *GEP = GetElementPtrInst::CreateInBounds( NewTy, New, Idx, New->getName() + ".sub"); - IC.InsertNewInstBefore(NewI, *It); - - // Gracefully 
handle allocas in other address spaces. - if (AI.getType()->getPointerAddressSpace() != - NewI->getType()->getPointerAddressSpace()) { - NewI = - CastInst::CreatePointerBitCastOrAddrSpaceCast(NewI, AI.getType()); - IC.InsertNewInstBefore(NewI, *It); - } + IC.InsertNewInstBefore(GEP, *It); // Now make everything use the getelementptr instead of the original // allocation. - return IC.replaceInstUsesWith(AI, NewI); + return IC.replaceInstUsesWith(AI, GEP); } } @@ -640,7 +633,6 @@ static Instruction *unpackLoadToAggregate(InstCombinerImpl &IC, LoadInst &LI) { return nullptr; StringRef Name = LI.getName(); - assert(LI.getAlignment() && "Alignment must be set at this point"); if (auto *ST = dyn_cast<StructType>(T)) { // If the struct only have one element, we unpack. diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index 779d298da7a4..aca7ec8d7325 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -755,6 +755,15 @@ Instruction *InstCombinerImpl::commonIDivTransforms(BinaryOperator &I) { if (simplifyDivRemOfSelectWithZeroOp(I)) return &I; + // If the divisor is a select-of-constants, try to constant fold all div ops: + // C / (select Cond, TrueC, FalseC) --> select Cond, (C / TrueC), (C / FalseC) + // TODO: Adapt simplifyDivRemOfSelectWithZeroOp to allow this and other folds. 
+ if (match(Op0, m_ImmConstant()) && + match(Op1, m_Select(m_Value(), m_ImmConstant(), m_ImmConstant()))) { + if (Instruction *R = FoldOpIntoSelect(I, cast<SelectInst>(Op1))) + return R; + } + const APInt *C2; if (match(Op1, m_APInt(C2))) { Value *X; @@ -1461,6 +1470,15 @@ Instruction *InstCombinerImpl::commonIRemTransforms(BinaryOperator &I) { if (simplifyDivRemOfSelectWithZeroOp(I)) return &I; + // If the divisor is a select-of-constants, try to constant fold all rem ops: + // C % (select Cond, TrueC, FalseC) --> select Cond, (C % TrueC), (C % FalseC) + // TODO: Adapt simplifyDivRemOfSelectWithZeroOp to allow this and other folds. + if (match(Op0, m_ImmConstant()) && + match(Op1, m_Select(m_Value(), m_ImmConstant(), m_ImmConstant()))) { + if (Instruction *R = FoldOpIntoSelect(I, cast<SelectInst>(Op1))) + return R; + } + if (isa<Constant>(Op1)) { if (Instruction *Op0I = dyn_cast<Instruction>(Op0)) { if (SelectInst *SI = dyn_cast<SelectInst>(Op0I)) { diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp index 35739c3b9a21..30f6aab2114b 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -664,10 +664,7 @@ Instruction *InstCombinerImpl::foldPHIArgLoadIntoPHI(PHINode &PN) { return nullptr; // When processing loads, we need to propagate two bits of information to the - // sunk load: whether it is volatile, and what its alignment is. We currently - // don't sink loads when some have their alignment specified and some don't. - // visitLoadInst will propagate an alignment onto the load when TD is around, - // and if TD isn't around, we can't handle the mixed case. + // sunk load: whether it is volatile, and what its alignment is. 
bool isVolatile = FirstLI->isVolatile(); Align LoadAlignment = FirstLI->getAlign(); unsigned LoadAddrSpace = FirstLI->getPointerAddressSpace(); @@ -699,7 +696,7 @@ Instruction *InstCombinerImpl::foldPHIArgLoadIntoPHI(PHINode &PN) { !isSafeAndProfitableToSinkLoad(LI)) return nullptr; - LoadAlignment = std::min(LoadAlignment, Align(LI->getAlign())); + LoadAlignment = std::min(LoadAlignment, LI->getAlign()); // If the PHI is of volatile loads and the load block has multiple // successors, sinking it would remove a load of the volatile value from diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 518d3952dce5..a6d6b5199105 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -1482,7 +1482,12 @@ tryToReuseConstantFromSelectInComparison(SelectInst &Sel, ICmpInst &Cmp, if (C0->getType() != Sel.getType()) return nullptr; - // FIXME: are there any magic icmp predicate+constant pairs we must not touch? + // ULT with 'add' of a constant is canonical. See foldICmpAddConstant(). + // FIXME: Are there more magic icmp predicate+constant pairs we must avoid? + // Or should we just abandon this transform entirely? + if (Pred == CmpInst::ICMP_ULT && match(X, m_Add(m_Value(), m_Constant()))) + return nullptr; + Value *SelVal0, *SelVal1; // We do not care which one is from where. 
match(&Sel, m_Select(m_Value(), m_Value(SelVal0), m_Value(SelVal1))); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index e357a9da8b12..4dc712f32536 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -1595,12 +1595,6 @@ Value *InstCombinerImpl::SimplifyDemandedVectorElts(Value *V, simplifyAndSetOp(I, 0, DemandedElts, UndefElts); simplifyAndSetOp(I, 1, DemandedElts, UndefElts2); - // Any change to an instruction with potential poison must clear those flags - // because we can not guarantee those constraints now. Other analysis may - // determine that it is safe to re-apply the flags. - if (MadeChange) - BO->dropPoisonGeneratingFlags(); - // Output elements are undefined if both are undefined. Consider things // like undef & 0. The result is known zero, not undef. UndefElts &= UndefElts2; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index 32e537897140..c6a4602e59e3 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -363,6 +363,18 @@ static APInt findDemandedEltsByAllUsers(Value *V) { return UnionUsedElts; } +/// Given a constant index for a extractelement or insertelement instruction, +/// return it with the canonical type if it isn't already canonical. We +/// arbitrarily pick 64 bit as our canonical type. The actual bitwidth doesn't +/// matter, we just want a consistent type to simplify CSE. 
+ConstantInt *getPreferredVectorIndex(ConstantInt *IndexC) { + const unsigned IndexBW = IndexC->getType()->getBitWidth(); + if (IndexBW == 64 || IndexC->getValue().getActiveBits() > 64) + return nullptr; + return ConstantInt::get(IndexC->getContext(), + IndexC->getValue().zextOrTrunc(64)); +} + Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) { Value *SrcVec = EI.getVectorOperand(); Value *Index = EI.getIndexOperand(); @@ -374,6 +386,10 @@ Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) { // find a previously computed scalar that was inserted into the vector. auto *IndexC = dyn_cast<ConstantInt>(Index); if (IndexC) { + // Canonicalize type of constant indices to i64 to simplify CSE + if (auto *NewIdx = getPreferredVectorIndex(IndexC)) + return replaceOperand(EI, 1, NewIdx); + ElementCount EC = EI.getVectorOperandType()->getElementCount(); unsigned NumElts = EC.getKnownMinValue(); @@ -401,37 +417,6 @@ Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) { if (!EC.isScalable() && IndexC->getValue().uge(NumElts)) return nullptr; - // This instruction only demands the single element from the input vector. - // Skip for scalable type, the number of elements is unknown at - // compile-time. - if (!EC.isScalable() && NumElts != 1) { - // If the input vector has a single use, simplify it based on this use - // property. - if (SrcVec->hasOneUse()) { - APInt UndefElts(NumElts, 0); - APInt DemandedElts(NumElts, 0); - DemandedElts.setBit(IndexC->getZExtValue()); - if (Value *V = - SimplifyDemandedVectorElts(SrcVec, DemandedElts, UndefElts)) - return replaceOperand(EI, 0, V); - } else { - // If the input vector has multiple uses, simplify it based on a union - // of all elements used. 
- APInt DemandedElts = findDemandedEltsByAllUsers(SrcVec); - if (!DemandedElts.isAllOnes()) { - APInt UndefElts(NumElts, 0); - if (Value *V = SimplifyDemandedVectorElts( - SrcVec, DemandedElts, UndefElts, 0 /* Depth */, - true /* AllowMultipleUsers */)) { - if (V != SrcVec) { - SrcVec->replaceAllUsesWith(V); - return &EI; - } - } - } - } - } - if (Instruction *I = foldBitcastExtElt(EI)) return I; @@ -473,11 +458,9 @@ Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) { if (auto *I = dyn_cast<Instruction>(SrcVec)) { if (auto *IE = dyn_cast<InsertElementInst>(I)) { - // Extracting the inserted element? - if (IE->getOperand(2) == Index) - return replaceInstUsesWith(EI, IE->getOperand(1)); - // If the inserted and extracted elements are constants, they must not - // be the same value, extract from the pre-inserted value instead. + // instsimplify already handled the case where the indices are constants + // and equal by value, if both are constants, they must not be the same + // value, extract from the pre-inserted value instead. 
if (isa<Constant>(IE->getOperand(2)) && IndexC) return replaceOperand(EI, 0, IE->getOperand(0)); } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) { @@ -497,30 +480,27 @@ Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) { llvm::count_if(GEP->operands(), [](const Value *V) { return isa<VectorType>(V->getType()); }); - if (VectorOps > 1) - return nullptr; - assert(VectorOps == 1 && "Expected exactly one vector GEP operand!"); + if (VectorOps == 1) { + Value *NewPtr = GEP->getPointerOperand(); + if (isa<VectorType>(NewPtr->getType())) + NewPtr = Builder.CreateExtractElement(NewPtr, IndexC); - Value *NewPtr = GEP->getPointerOperand(); - if (isa<VectorType>(NewPtr->getType())) - NewPtr = Builder.CreateExtractElement(NewPtr, IndexC); + SmallVector<Value *> NewOps; + for (unsigned I = 1; I != GEP->getNumOperands(); ++I) { + Value *Op = GEP->getOperand(I); + if (isa<VectorType>(Op->getType())) + NewOps.push_back(Builder.CreateExtractElement(Op, IndexC)); + else + NewOps.push_back(Op); + } - SmallVector<Value *> NewOps; - for (unsigned I = 1; I != GEP->getNumOperands(); ++I) { - Value *Op = GEP->getOperand(I); - if (isa<VectorType>(Op->getType())) - NewOps.push_back(Builder.CreateExtractElement(Op, IndexC)); - else - NewOps.push_back(Op); + GetElementPtrInst *NewGEP = GetElementPtrInst::Create( + cast<PointerType>(NewPtr->getType())->getElementType(), NewPtr, + NewOps); + NewGEP->setIsInBounds(GEP->isInBounds()); + return NewGEP; } - - GetElementPtrInst *NewGEP = GetElementPtrInst::Create( - cast<PointerType>(NewPtr->getType())->getElementType(), NewPtr, - NewOps); - NewGEP->setIsInBounds(GEP->isInBounds()); - return NewGEP; } - return nullptr; } else if (auto *SVI = dyn_cast<ShuffleVectorInst>(I)) { // If this is extracting an element from a shufflevector, figure out where // it came from and extract from the appropriate input element instead. 
@@ -554,6 +534,44 @@ Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) { } } } + + // Run demanded elements after other transforms as this can drop flags on + // binops. If there's two paths to the same final result, we prefer the + // one which doesn't force us to drop flags. + if (IndexC) { + ElementCount EC = EI.getVectorOperandType()->getElementCount(); + unsigned NumElts = EC.getKnownMinValue(); + // This instruction only demands the single element from the input vector. + // Skip for scalable type, the number of elements is unknown at + // compile-time. + if (!EC.isScalable() && NumElts != 1) { + // If the input vector has a single use, simplify it based on this use + // property. + if (SrcVec->hasOneUse()) { + APInt UndefElts(NumElts, 0); + APInt DemandedElts(NumElts, 0); + DemandedElts.setBit(IndexC->getZExtValue()); + if (Value *V = + SimplifyDemandedVectorElts(SrcVec, DemandedElts, UndefElts)) + return replaceOperand(EI, 0, V); + } else { + // If the input vector has multiple uses, simplify it based on a union + // of all elements used. + APInt DemandedElts = findDemandedEltsByAllUsers(SrcVec); + if (!DemandedElts.isAllOnes()) { + APInt UndefElts(NumElts, 0); + if (Value *V = SimplifyDemandedVectorElts( + SrcVec, DemandedElts, UndefElts, 0 /* Depth */, + true /* AllowMultipleUsers */)) { + if (V != SrcVec) { + SrcVec->replaceAllUsesWith(V); + return &EI; + } + } + } + } + } + } return nullptr; } @@ -1476,6 +1494,11 @@ Instruction *InstCombinerImpl::visitInsertElementInst(InsertElementInst &IE) { VecOp, ScalarOp, IdxOp, SQ.getWithInstruction(&IE))) return replaceInstUsesWith(IE, V); + // Canonicalize type of constant indices to i64 to simplify CSE + if (auto *IndexC = dyn_cast<ConstantInt>(IdxOp)) + if (auto *NewIdx = getPreferredVectorIndex(IndexC)) + return replaceOperand(IE, 2, NewIdx); + // If the scalar is bitcast and inserted into undef, do the insert in the // source type followed by bitcast. 
// TODO: Generalize for insert into any constant, not just undef? @@ -2008,9 +2031,7 @@ static Instruction *canonicalizeInsertSplat(ShuffleVectorInst &Shuf, } /// Try to fold shuffles that are the equivalent of a vector select. -static Instruction *foldSelectShuffle(ShuffleVectorInst &Shuf, - InstCombiner::BuilderTy &Builder, - const DataLayout &DL) { +Instruction *InstCombinerImpl::foldSelectShuffle(ShuffleVectorInst &Shuf) { if (!Shuf.isSelect()) return nullptr; @@ -2118,21 +2139,23 @@ static Instruction *foldSelectShuffle(ShuffleVectorInst &Shuf, V = Builder.CreateShuffleVector(X, Y, Mask); } - Instruction *NewBO = ConstantsAreOp1 ? BinaryOperator::Create(BOpc, V, NewC) : - BinaryOperator::Create(BOpc, NewC, V); + Value *NewBO = ConstantsAreOp1 ? Builder.CreateBinOp(BOpc, V, NewC) : + Builder.CreateBinOp(BOpc, NewC, V); // Flags are intersected from the 2 source binops. But there are 2 exceptions: // 1. If we changed an opcode, poison conditions might have changed. // 2. If the shuffle had undef mask elements, the new binop might have undefs // where the original code did not. But if we already made a safe constant, // then there's no danger. - NewBO->copyIRFlags(B0); - NewBO->andIRFlags(B1); - if (DropNSW) - NewBO->setHasNoSignedWrap(false); - if (is_contained(Mask, UndefMaskElem) && !MightCreatePoisonOrUB) - NewBO->dropPoisonGeneratingFlags(); - return NewBO; + if (auto *NewI = dyn_cast<Instruction>(NewBO)) { + NewI->copyIRFlags(B0); + NewI->andIRFlags(B1); + if (DropNSW) + NewI->setHasNoSignedWrap(false); + if (is_contained(Mask, UndefMaskElem) && !MightCreatePoisonOrUB) + NewI->dropPoisonGeneratingFlags(); + } + return replaceInstUsesWith(Shuf, NewBO); } /// Convert a narrowing shuffle of a bitcasted vector into a vector truncate. 
@@ -2497,7 +2520,7 @@ Instruction *InstCombinerImpl::visitShuffleVectorInst(ShuffleVectorInst &SVI) { if (Instruction *I = canonicalizeInsertSplat(SVI, Builder)) return I; - if (Instruction *I = foldSelectShuffle(SVI, Builder, DL)) + if (Instruction *I = foldSelectShuffle(SVI)) return I; if (Instruction *I = foldTruncShuffle(SVI, DL.isBigEndian())) diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 1f81624f79e7..eb5eadba194d 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2546,7 +2546,7 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) { return nullptr; } -static bool isNeverEqualToUnescapedAlloc(Value *V, const TargetLibraryInfo *TLI, +static bool isNeverEqualToUnescapedAlloc(Value *V, const TargetLibraryInfo &TLI, Instruction *AI) { if (isa<ConstantPointerNull>(V)) return true; @@ -2557,12 +2557,34 @@ static bool isNeverEqualToUnescapedAlloc(Value *V, const TargetLibraryInfo *TLI, // through bitcasts of V can cause // the result statement below to be true, even when AI and V (ex: // i8* ->i32* ->i8* of AI) are the same allocations. - return isAllocLikeFn(V, TLI) && V != AI; + return isAllocLikeFn(V, &TLI) && V != AI; +} + +/// Given a call CB which uses an address UsedV, return true if we can prove the +/// call's only possible effect is storing to V. 
+static bool isRemovableWrite(CallBase &CB, Value *UsedV, + const TargetLibraryInfo &TLI) { + if (!CB.use_empty()) + // TODO: add recursion if returned attribute is present + return false; + + if (CB.isTerminator()) + // TODO: remove implementation restriction + return false; + + if (!CB.willReturn() || !CB.doesNotThrow()) + return false; + + // If the only possible side effect of the call is writing to the alloca, + // and the result isn't used, we can safely remove any reads implied by the + // call including those which might read the alloca itself. + Optional<MemoryLocation> Dest = MemoryLocation::getForDest(&CB, TLI); + return Dest && Dest->Ptr == UsedV; } static bool isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<WeakTrackingVH> &Users, - const TargetLibraryInfo *TLI) { + const TargetLibraryInfo &TLI) { SmallVector<Instruction*, 4> Worklist; Worklist.push_back(AI); @@ -2627,12 +2649,17 @@ static bool isAllocSiteRemovable(Instruction *AI, } } - if (isFreeCall(I, TLI)) { + if (isRemovableWrite(*cast<CallBase>(I), PI, TLI)) { + Users.emplace_back(I); + continue; + } + + if (isFreeCall(I, &TLI)) { Users.emplace_back(I); continue; } - if (isReallocLikeFn(I, TLI, true)) { + if (isReallocLikeFn(I, &TLI, true)) { Users.emplace_back(I); Worklist.push_back(I); continue; @@ -2676,7 +2703,7 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) { DIB.reset(new DIBuilder(*MI.getModule(), /*AllowUnresolved=*/false)); } - if (isAllocSiteRemovable(&MI, Users, &TLI)) { + if (isAllocSiteRemovable(&MI, Users, TLI)) { for (unsigned i = 0, e = Users.size(); i != e; ++i) { // Lowering all @llvm.objectsize calls first because they may // use a bitcast/GEP of the alloca we are removing. 
diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp index 38c219ce3465..9f26b37bbc79 100644 --- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp @@ -232,6 +232,12 @@ static cl::opt<int> ClTrackOrigins("dfsan-track-origins", cl::desc("Track origins of labels"), cl::Hidden, cl::init(0)); +static cl::opt<bool> ClIgnorePersonalityRoutine( + "dfsan-ignore-personality-routine", + cl::desc("If a personality routine is marked uninstrumented from the ABI " + "list, do not create a wrapper for it."), + cl::Hidden, cl::init(false)); + static StringRef getGlobalTypeString(const GlobalValue &G) { // Types of GlobalVariables are always pointer types. Type *GType = G.getValueType(); @@ -1115,7 +1121,7 @@ DataFlowSanitizer::buildWrapperFunction(Function *F, StringRef NewFName, BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", NewF); if (F->isVarArg()) { - NewF->removeFnAttrs(AttrBuilder().addAttribute("split-stack")); + NewF->removeFnAttr("split-stack"); CallInst::Create(DFSanVarargWrapperFn, IRBuilder<>(BB).CreateGlobalStringPtr(F->getName()), "", BB); @@ -1357,9 +1363,24 @@ bool DataFlowSanitizer::runImpl(Module &M) { std::vector<Function *> FnsToInstrument; SmallPtrSet<Function *, 2> FnsWithNativeABI; SmallPtrSet<Function *, 2> FnsWithForceZeroLabel; + SmallPtrSet<Constant *, 1> PersonalityFns; for (Function &F : M) - if (!F.isIntrinsic() && !DFSanRuntimeFunctions.contains(&F)) + if (!F.isIntrinsic() && !DFSanRuntimeFunctions.contains(&F)) { FnsToInstrument.push_back(&F); + if (F.hasPersonalityFn()) + PersonalityFns.insert(F.getPersonalityFn()->stripPointerCasts()); + } + + if (ClIgnorePersonalityRoutine) { + for (auto *C : PersonalityFns) { + assert(isa<Function>(C) && "Personality routine is not a function!"); + Function *F = cast<Function>(C); + if (!isInstrumented(F)) + FnsToInstrument.erase( + 
std::remove(FnsToInstrument.begin(), FnsToInstrument.end(), F), + FnsToInstrument.end()); + } + } // Give function aliases prefixes when necessary, and build wrappers where the // instrumentedness is inconsistent. diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index d1d3b8ffdf7a..de34348606ef 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -26,7 +26,9 @@ #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DIBuilder.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" @@ -40,6 +42,7 @@ #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/ProfileData/InstrProf.h" +#include "llvm/ProfileData/InstrProfCorrelator.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Error.h" @@ -57,6 +60,13 @@ using namespace llvm; #define DEBUG_TYPE "instrprof" +namespace llvm { +cl::opt<bool> + DebugInfoCorrelate("debug-info-correlate", cl::ZeroOrMore, + cl::desc("Use debug info to correlate profiles."), + cl::init(false)); +} // namespace llvm + namespace { cl::opt<bool> DoHashBasedCounterSplit( @@ -641,6 +651,12 @@ void InstrProfiling::computeNumValueSiteCounts(InstrProfValueProfileInst *Ind) { } void InstrProfiling::lowerValueProfileInst(InstrProfValueProfileInst *Ind) { + // TODO: Value profiling heavily depends on the data section which is omitted + // in lightweight mode. We need to move the value profile pointer to the + // Counter struct to get this working. 
+ assert( + !DebugInfoCorrelate && + "Value profiling is not yet supported with lightweight instrumentation"); GlobalVariable *Name = Ind->getName(); auto It = ProfileDataMap.find(Name); assert(It != ProfileDataMap.end() && It->second.DataVar && @@ -855,6 +871,12 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { GlobalValue::LinkageTypes Linkage = NamePtr->getLinkage(); GlobalValue::VisibilityTypes Visibility = NamePtr->getVisibility(); + // Use internal rather than private linkage so the counter variable shows up + // in the symbol table when using debug info for correlation. + if (DebugInfoCorrelate && TT.isOSBinFormatMachO() && + Linkage == GlobalValue::PrivateLinkage) + Linkage = GlobalValue::InternalLinkage; + // Due to the limitation of binder as of 2021/09/28, the duplicate weak // symbols in the same csect won't be discarded. When there are duplicate weak // symbols, we can NOT guarantee that the relocations get resolved to the @@ -916,6 +938,42 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { MaybeSetComdat(CounterPtr); CounterPtr->setLinkage(Linkage); PD.RegionCounters = CounterPtr; + if (DebugInfoCorrelate) { + if (auto *SP = Fn->getSubprogram()) { + DIBuilder DB(*M, true, SP->getUnit()); + Metadata *FunctionNameAnnotation[] = { + MDString::get(Ctx, InstrProfCorrelator::FunctionNameAttributeName), + MDString::get(Ctx, getPGOFuncNameVarInitializer(NamePtr)), + }; + Metadata *CFGHashAnnotation[] = { + MDString::get(Ctx, InstrProfCorrelator::CFGHashAttributeName), + ConstantAsMetadata::get(Inc->getHash()), + }; + Metadata *NumCountersAnnotation[] = { + MDString::get(Ctx, InstrProfCorrelator::NumCountersAttributeName), + ConstantAsMetadata::get(Inc->getNumCounters()), + }; + auto Annotations = DB.getOrCreateArray({ + MDNode::get(Ctx, FunctionNameAnnotation), + MDNode::get(Ctx, CFGHashAnnotation), + MDNode::get(Ctx, NumCountersAnnotation), + }); + auto *DICounter = DB.createGlobalVariableExpression( + SP, 
CounterPtr->getName(), /*LinkageName=*/StringRef(), SP->getFile(), + /*LineNo=*/0, DB.createUnspecifiedType("Profile Data Type"), + CounterPtr->hasLocalLinkage(), /*IsDefined=*/true, /*Expr=*/nullptr, + /*Decl=*/nullptr, /*TemplateParams=*/nullptr, /*AlignInBits=*/0, + Annotations); + CounterPtr->addDebugInfo(DICounter); + DB.finalize(); + } else { + std::string Msg = ("Missing debug info for function " + Fn->getName() + + "; required for profile correlation.") + .str(); + Ctx.diagnose( + DiagnosticInfoPGOProfile(M->getName().data(), Msg, DS_Warning)); + } + } auto *Int8PtrTy = Type::getInt8PtrTy(Ctx); // Allocate statically the array of pointers to value profile nodes for @@ -939,6 +997,9 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { ConstantExpr::getBitCast(ValuesVar, Type::getInt8PtrTy(Ctx)); } + if (DebugInfoCorrelate) + return PD.RegionCounters; + // Create data variable. auto *IntPtrTy = M->getDataLayout().getIntPtrType(M->getContext()); auto *Int16Ty = Type::getInt16Ty(Ctx); diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 4d15b784f486..446e601cd4d7 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -307,6 +307,11 @@ static cl::opt<bool> cl::desc("Enable KernelMemorySanitizer instrumentation"), cl::Hidden, cl::init(false)); +static cl::opt<bool> + ClDisableChecks("msan-disable-checks", + cl::desc("Apply no_sanitize to the whole file"), cl::Hidden, + cl::init(false)); + // This is an experiment to enable handling of cases where shadow is a non-zero // compile-time constant. For some unexplainable reason they were silently // ignored in the instrumentation. 
@@ -1095,7 +1100,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { MemorySanitizerVisitor(Function &F, MemorySanitizer &MS, const TargetLibraryInfo &TLI) : F(F), MS(MS), VAHelper(CreateVarArgHelper(F, MS, *this)), TLI(&TLI) { - bool SanitizeFunction = F.hasFnAttribute(Attribute::SanitizeMemory); + bool SanitizeFunction = + F.hasFnAttribute(Attribute::SanitizeMemory) && !ClDisableChecks; InsertChecks = SanitizeFunction; PropagateShadow = SanitizeFunction; PoisonStack = SanitizeFunction && ClPoisonStack; @@ -1214,7 +1220,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Value *Shadow = SI->isAtomic() ? getCleanShadow(Val) : getShadow(Val); Value *ShadowPtr, *OriginPtr; Type *ShadowTy = Shadow->getType(); - const Align Alignment = assumeAligned(SI->getAlignment()); + const Align Alignment = SI->getAlign(); const Align OriginAlignment = std::max(kMinOriginAlignment, Alignment); std::tie(ShadowPtr, OriginPtr) = getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ true); @@ -3887,8 +3893,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { &I, IRB, IRB.getInt8Ty(), Align(1), /*isStore*/ true); Value *PoisonValue = IRB.getInt8(PoisonStack ? 
ClPoisonStackPattern : 0); - IRB.CreateMemSet(ShadowBase, PoisonValue, Len, - MaybeAlign(I.getAlignment())); + IRB.CreateMemSet(ShadowBase, PoisonValue, Len, I.getAlign()); } if (PoisonStack && MS.TrackOrigins) { diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index af5946325bbb..b6ba1fc2132c 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -273,14 +273,14 @@ static cl::opt<bool> PGOVerifyBFI( "internal option -pass-remakrs-analysis=pgo.")); static cl::opt<unsigned> PGOVerifyBFIRatio( - "pgo-verify-bfi-ratio", cl::init(5), cl::Hidden, - cl::desc("Set the threshold for pgo-verify-big -- only print out " + "pgo-verify-bfi-ratio", cl::init(2), cl::Hidden, + cl::desc("Set the threshold for pgo-verify-bfi: only print out " "mismatched BFI if the difference percentage is greater than " "this value (in percentage).")); static cl::opt<unsigned> PGOVerifyBFICutoff( - "pgo-verify-bfi-cutoff", cl::init(1), cl::Hidden, - cl::desc("Set the threshold for pgo-verify-bfi -- skip the counts whose " + "pgo-verify-bfi-cutoff", cl::init(5), cl::Hidden, + cl::desc("Set the threshold for pgo-verify-bfi: skip the counts whose " "profile count value is below.")); namespace llvm { @@ -291,6 +291,8 @@ extern cl::opt<PGOViewCountsType> PGOViewCounts; // Command line option to specify the name of the function for CFG dump // Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name= extern cl::opt<std::string> ViewBlockFreqFuncName; + +extern cl::opt<bool> DebugInfoCorrelate; } // namespace llvm static cl::opt<bool> @@ -467,8 +469,9 @@ private: createProfileFileNameVar(M, InstrProfileOutput); // The variable in a comdat may be discarded by LTO. Ensure the // declaration will be retained. 
- appendToCompilerUsed( - M, createIRLevelProfileFlagVar(M, /*IsCS=*/true, PGOInstrumentEntry)); + appendToCompilerUsed(M, createIRLevelProfileFlagVar(M, /*IsCS=*/true, + PGOInstrumentEntry, + DebugInfoCorrelate)); return false; } std::string InstrProfileOutput; @@ -1616,7 +1619,8 @@ static bool InstrumentAllFunctions( // For the context-sensitve instrumentation, we should have a separated pass // (before LTO/ThinLTO linking) to create these variables. if (!IsCS) - createIRLevelProfileFlagVar(M, /*IsCS=*/false, PGOInstrumentEntry); + createIRLevelProfileFlagVar(M, /*IsCS=*/false, PGOInstrumentEntry, + DebugInfoCorrelate); std::unordered_multimap<Comdat *, GlobalValue *> ComdatMembers; collectComdatMembers(M, ComdatMembers); @@ -1638,8 +1642,9 @@ PGOInstrumentationGenCreateVar::run(Module &M, ModuleAnalysisManager &AM) { createProfileFileNameVar(M, CSInstrName); // The variable in a comdat may be discarded by LTO. Ensure the declaration // will be retained. - appendToCompilerUsed( - M, createIRLevelProfileFlagVar(M, /*IsCS=*/true, PGOInstrumentEntry)); + appendToCompilerUsed(M, createIRLevelProfileFlagVar(M, /*IsCS=*/true, + PGOInstrumentEntry, + DebugInfoCorrelate)); return PreservedAnalyses::all(); } @@ -1774,7 +1779,7 @@ static void verifyFuncBFI(PGOUseFunc &Func, LoopInfo &LI, uint64_t Diff = (BFICountValue >= CountValue) ? BFICountValue - CountValue : CountValue - BFICountValue; - if (Diff < CountValue / 100 * PGOVerifyBFIRatio) + if (Diff <= CountValue / 100 * PGOVerifyBFIRatio) continue; } BBMisMatchNum++; diff --git a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp index 27f54f8026e1..37a7053d778e 100644 --- a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -271,8 +271,7 @@ static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI, // subtree of BB (subtree not including the BB itself). 
DenseMap<BasicBlock *, InsertPtsCostPair> InsertPtsMap; InsertPtsMap.reserve(Orders.size() + 1); - for (auto RIt = Orders.rbegin(); RIt != Orders.rend(); RIt++) { - BasicBlock *Node = *RIt; + for (BasicBlock *Node : llvm::reverse(Orders)) { bool NodeInBBs = BBs.count(Node); auto &InsertPts = InsertPtsMap[Node].first; BlockFrequency &InsertPtsFreq = InsertPtsMap[Node].second; diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp index 8c4523206070..dda1a2f08076 100644 --- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp @@ -588,7 +588,7 @@ struct AllSwitchPaths { PrevBB = BB; } - if (TPath.isExitValueSet()) + if (TPath.isExitValueSet() && isSupported(TPath)) TPaths.push_back(TPath); } } @@ -683,6 +683,62 @@ private: return Res; } + /// The determinator BB should precede the switch-defining BB. + /// + /// Otherwise, it is possible that the state defined in the determinator block + /// defines the state for the next iteration of the loop, rather than for the + /// current one. 
+ /// + /// Currently supported paths: + /// \code + /// < switch bb1 determ def > [ 42, determ ] + /// < switch_and_def bb1 determ > [ 42, determ ] + /// < switch_and_def_and_determ bb1 > [ 42, switch_and_def_and_determ ] + /// \endcode + /// + /// Unsupported paths: + /// \code + /// < switch bb1 def determ > [ 43, determ ] + /// < switch_and_determ bb1 def > [ 43, switch_and_determ ] + /// \endcode + bool isSupported(const ThreadingPath &TPath) { + Instruction *SwitchCondI = dyn_cast<Instruction>(Switch->getCondition()); + assert(SwitchCondI); + if (!SwitchCondI) + return false; + + const BasicBlock *SwitchCondDefBB = SwitchCondI->getParent(); + const BasicBlock *SwitchCondUseBB = Switch->getParent(); + const BasicBlock *DeterminatorBB = TPath.getDeterminatorBB(); + + assert( + SwitchCondUseBB == TPath.getPath().front() && + "The first BB in a threading path should have the switch instruction"); + if (SwitchCondUseBB != TPath.getPath().front()) + return false; + + // Make DeterminatorBB the first element in Path. 
+ PathType Path = TPath.getPath(); + auto ItDet = std::find(Path.begin(), Path.end(), DeterminatorBB); + std::rotate(Path.begin(), ItDet, Path.end()); + + bool IsDetBBSeen = false; + bool IsDefBBSeen = false; + bool IsUseBBSeen = false; + for (BasicBlock *BB : Path) { + if (BB == DeterminatorBB) + IsDetBBSeen = true; + if (BB == SwitchCondDefBB) + IsDefBBSeen = true; + if (BB == SwitchCondUseBB) + IsUseBBSeen = true; + if (IsDetBBSeen && IsUseBBSeen && !IsDefBBSeen) + return false; + } + + return true; + } + SwitchInst *Switch; BasicBlock *SwitchBlock; OptimizationRemarkEmitter *ORE; diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index e0d3a6accadd..eadbb4293539 100644 --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -175,44 +175,6 @@ static cl::opt<bool> using OverlapIntervalsTy = std::map<int64_t, int64_t>; using InstOverlapIntervalsTy = DenseMap<Instruction *, OverlapIntervalsTy>; -/// If the value of this instruction and the memory it writes to is unused, may -/// we delete this instruction? -static bool isRemovable(Instruction *I) { - // Don't remove volatile/atomic stores. - if (StoreInst *SI = dyn_cast<StoreInst>(I)) - return SI->isUnordered(); - - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { - switch (II->getIntrinsicID()) { - default: llvm_unreachable("Does not have LocForWrite"); - case Intrinsic::lifetime_end: - // Never remove dead lifetime_end's, e.g. because it is followed by a - // free. - return false; - case Intrinsic::init_trampoline: - // Always safe to remove init_trampoline. - return true; - case Intrinsic::memset: - case Intrinsic::memmove: - case Intrinsic::memcpy: - case Intrinsic::memcpy_inline: - // Don't remove volatile memory intrinsics. 
- return !cast<MemIntrinsic>(II)->isVolatile(); - case Intrinsic::memcpy_element_unordered_atomic: - case Intrinsic::memmove_element_unordered_atomic: - case Intrinsic::memset_element_unordered_atomic: - case Intrinsic::masked_store: - return true; - } - } - - // note: only get here for calls with analyzable writes - i.e. libcalls - if (auto *CB = dyn_cast<CallBase>(I)) - return CB->use_empty(); - - return false; -} - /// Returns true if the end of this instruction can be safely shortened in /// length. static bool isShortenableAtTheEnd(Instruction *I) { @@ -835,7 +797,7 @@ struct DSEState { auto *MD = dyn_cast_or_null<MemoryDef>(MA); if (MD && MemDefs.size() < MemorySSADefsPerBlockLimit && - (getLocForWriteEx(&I) || isMemTerminatorInst(&I))) + (getLocForWrite(&I) || isMemTerminatorInst(&I))) MemDefs.push_back(MD); } } @@ -1022,48 +984,39 @@ struct DSEState { return I.first->second; } - Optional<MemoryLocation> getLocForWriteEx(Instruction *I) const { + Optional<MemoryLocation> getLocForWrite(Instruction *I) const { if (!I->mayWriteToMemory()) return None; - if (auto *MTI = dyn_cast<AnyMemIntrinsic>(I)) - return {MemoryLocation::getForDest(MTI)}; + if (auto *CB = dyn_cast<CallBase>(I)) + return MemoryLocation::getForDest(CB, TLI); + + return MemoryLocation::getOrNone(I); + } + + /// Assuming this instruction has a dead analyzable write, can we delete + /// this instruction? + bool isRemovable(Instruction *I) { + assert(getLocForWrite(I) && "Must have analyzable write"); + + // Don't remove volatile/atomic stores. + if (StoreInst *SI = dyn_cast<StoreInst>(I)) + return SI->isUnordered(); if (auto *CB = dyn_cast<CallBase>(I)) { - // If the functions may write to memory we do not know about, bail out. - if (!CB->onlyAccessesArgMemory() && - !CB->onlyAccessesInaccessibleMemOrArgMem()) - return None; + // Don't remove volatile memory intrinsics. 
+ if (auto *MI = dyn_cast<MemIntrinsic>(CB)) + return !MI->isVolatile(); - LibFunc LF; - if (TLI.getLibFunc(*CB, LF) && TLI.has(LF)) { - switch (LF) { - case LibFunc_strncpy: - if (const auto *Len = dyn_cast<ConstantInt>(CB->getArgOperand(2))) - return MemoryLocation(CB->getArgOperand(0), - LocationSize::precise(Len->getZExtValue()), - CB->getAAMetadata()); - LLVM_FALLTHROUGH; - case LibFunc_strcpy: - case LibFunc_strcat: - case LibFunc_strncat: - return {MemoryLocation::getAfter(CB->getArgOperand(0))}; - default: - break; - } - } - switch (CB->getIntrinsicID()) { - case Intrinsic::init_trampoline: - return {MemoryLocation::getAfter(CB->getArgOperand(0))}; - case Intrinsic::masked_store: - return {MemoryLocation::getForArgument(CB, 1, TLI)}; - default: - break; - } - return None; + // Never remove dead lifetime intrinsics, e.g. because they are followed + // by a free. + if (CB->isLifetimeStartOrEnd()) + return false; + + return CB->use_empty() && CB->willReturn() && CB->doesNotThrow(); } - return MemoryLocation::getOrNone(I); + return false; } /// Returns true if \p UseInst completely overwrites \p DefLoc @@ -1081,7 +1034,7 @@ struct DSEState { return false; int64_t InstWriteOffset, DepWriteOffset; - if (auto CC = getLocForWriteEx(UseInst)) + if (auto CC = getLocForWrite(UseInst)) return isOverwrite(UseInst, DefInst, *CC, DefLoc, InstWriteOffset, DepWriteOffset) == OW_Complete; return false; @@ -1093,7 +1046,7 @@ struct DSEState { << *Def->getMemoryInst() << ") is at the end the function \n"); - auto MaybeLoc = getLocForWriteEx(Def->getMemoryInst()); + auto MaybeLoc = getLocForWrite(Def->getMemoryInst()); if (!MaybeLoc) { LLVM_DEBUG(dbgs() << " ... could not get location for write.\n"); return false; @@ -1237,30 +1190,14 @@ struct DSEState { /// loop. In particular, this guarantees that it only references a single /// MemoryLocation during execution of the containing function. 
bool isGuaranteedLoopInvariant(const Value *Ptr) { - auto IsGuaranteedLoopInvariantBase = [this](const Value *Ptr) { - Ptr = Ptr->stripPointerCasts(); - if (auto *I = dyn_cast<Instruction>(Ptr)) { - if (isa<AllocaInst>(Ptr)) - return true; - - if (isAllocLikeFn(I, &TLI)) - return true; - - return false; - } - return true; - }; - Ptr = Ptr->stripPointerCasts(); - if (auto *I = dyn_cast<Instruction>(Ptr)) { - if (I->getParent()->isEntryBlock()) - return true; - } - if (auto *GEP = dyn_cast<GEPOperator>(Ptr)) { - return IsGuaranteedLoopInvariantBase(GEP->getPointerOperand()) && - GEP->hasAllConstantIndices(); - } - return IsGuaranteedLoopInvariantBase(Ptr); + if (auto *GEP = dyn_cast<GEPOperator>(Ptr)) + if (GEP->hasAllConstantIndices()) + Ptr = GEP->getPointerOperand()->stripPointerCasts(); + + if (auto *I = dyn_cast<Instruction>(Ptr)) + return I->getParent()->isEntryBlock(); + return true; } // Find a MemoryDef writing to \p KillingLoc and dominating \p StartAccess, @@ -1372,7 +1309,7 @@ struct DSEState { // If Current does not have an analyzable write location or is not // removable, skip it. 
- CurrentLoc = getLocForWriteEx(CurrentI); + CurrentLoc = getLocForWrite(CurrentI); if (!CurrentLoc || !isRemovable(CurrentI)) { CanOptimize = false; continue; @@ -1729,14 +1666,13 @@ struct DSEState { LLVM_DEBUG( dbgs() << "Trying to eliminate MemoryDefs at the end of the function\n"); - for (int I = MemDefs.size() - 1; I >= 0; I--) { - MemoryDef *Def = MemDefs[I]; - if (SkipStores.contains(Def) || !isRemovable(Def->getMemoryInst())) + for (MemoryDef *Def : llvm::reverse(MemDefs)) { + if (SkipStores.contains(Def)) continue; Instruction *DefI = Def->getMemoryInst(); - auto DefLoc = getLocForWriteEx(DefI); - if (!DefLoc) + auto DefLoc = getLocForWrite(DefI); + if (!DefLoc || !isRemovable(DefI)) continue; // NOTE: Currently eliminating writes at the end of a function is limited @@ -1763,13 +1699,19 @@ struct DSEState { /// \returns true if \p Def is a no-op store, either because it /// directly stores back a loaded value or stores zero to a calloced object. bool storeIsNoop(MemoryDef *Def, const Value *DefUO) { - StoreInst *Store = dyn_cast<StoreInst>(Def->getMemoryInst()); - MemSetInst *MemSet = dyn_cast<MemSetInst>(Def->getMemoryInst()); + Instruction *DefI = Def->getMemoryInst(); + StoreInst *Store = dyn_cast<StoreInst>(DefI); + MemSetInst *MemSet = dyn_cast<MemSetInst>(DefI); Constant *StoredConstant = nullptr; if (Store) StoredConstant = dyn_cast<Constant>(Store->getOperand(0)); - if (MemSet) + else if (MemSet) StoredConstant = dyn_cast<Constant>(MemSet->getValue()); + else + return false; + + if (!isRemovable(DefI)) + return false; if (StoredConstant && StoredConstant->isNullValue()) { auto *DefUOInst = dyn_cast<Instruction>(DefUO); @@ -1902,7 +1844,7 @@ struct DSEState { bool Changed = false; for (auto OI : IOL) { Instruction *DeadI = OI.first; - MemoryLocation Loc = *getLocForWriteEx(DeadI); + MemoryLocation Loc = *getLocForWrite(DeadI); assert(isRemovable(DeadI) && "Expect only removable instruction"); const Value *Ptr = Loc.Ptr->stripPointerCasts(); @@ 
-1925,9 +1867,14 @@ struct DSEState { LLVM_DEBUG(dbgs() << "Trying to eliminate MemoryDefs that write the " "already existing value\n"); for (auto *Def : MemDefs) { - if (SkipStores.contains(Def) || MSSA.isLiveOnEntryDef(Def) || - !isRemovable(Def->getMemoryInst())) + if (SkipStores.contains(Def) || MSSA.isLiveOnEntryDef(Def)) continue; + + Instruction *DefInst = Def->getMemoryInst(); + auto MaybeDefLoc = getLocForWrite(DefInst); + if (!MaybeDefLoc || !isRemovable(DefInst)) + continue; + MemoryDef *UpperDef; // To conserve compile-time, we avoid walking to the next clobbering def. // Instead, we just try to get the optimized access, if it exists. DSE @@ -1939,17 +1886,14 @@ struct DSEState { if (!UpperDef || MSSA.isLiveOnEntryDef(UpperDef)) continue; - Instruction *DefInst = Def->getMemoryInst(); Instruction *UpperInst = UpperDef->getMemoryInst(); - auto IsRedundantStore = [this, DefInst, - UpperInst](MemoryLocation UpperLoc) { + auto IsRedundantStore = [&]() { if (DefInst->isIdenticalTo(UpperInst)) return true; if (auto *MemSetI = dyn_cast<MemSetInst>(UpperInst)) { if (auto *SI = dyn_cast<StoreInst>(DefInst)) { - auto MaybeDefLoc = getLocForWriteEx(DefInst); - if (!MaybeDefLoc) - return false; + // MemSetInst must have a write location. 
+ MemoryLocation UpperLoc = *getLocForWrite(UpperInst); int64_t InstWriteOffset = 0; int64_t DepWriteOffset = 0; auto OR = isOverwrite(UpperInst, DefInst, UpperLoc, *MaybeDefLoc, @@ -1962,9 +1906,7 @@ struct DSEState { return false; }; - auto MaybeUpperLoc = getLocForWriteEx(UpperInst); - if (!MaybeUpperLoc || !IsRedundantStore(*MaybeUpperLoc) || - isReadClobber(*MaybeUpperLoc, DefInst)) + if (!IsRedundantStore() || isReadClobber(*MaybeDefLoc, DefInst)) continue; LLVM_DEBUG(dbgs() << "DSE: Remove No-Op Store:\n DEAD: " << *DefInst << '\n'); @@ -1995,7 +1937,7 @@ static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, MaybeKillingLoc = State.getLocForTerminator(KillingI).map( [](const std::pair<MemoryLocation, bool> &P) { return P.first; }); else - MaybeKillingLoc = State.getLocForWriteEx(KillingI); + MaybeKillingLoc = State.getLocForWrite(KillingI); if (!MaybeKillingLoc) { LLVM_DEBUG(dbgs() << "Failed to find analyzable write location for " @@ -2059,7 +2001,7 @@ static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, if (!DebugCounter::shouldExecute(MemorySSACounter)) continue; - MemoryLocation DeadLoc = *State.getLocForWriteEx(DeadI); + MemoryLocation DeadLoc = *State.getLocForWrite(DeadI); if (IsMemTerm) { const Value *DeadUndObj = getUnderlyingObject(DeadLoc.Ptr); @@ -2124,8 +2066,7 @@ static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, } // Check if the store is a no-op. 
- if (!Shortend && isRemovable(KillingI) && - State.storeIsNoop(KillingDef, KillingUndObj)) { + if (!Shortend && State.storeIsNoop(KillingDef, KillingUndObj)) { LLVM_DEBUG(dbgs() << "DSE: Remove No-Op Store:\n DEAD: " << *KillingI << '\n'); State.deleteDeadInstruction(KillingI); diff --git a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp index 90f71f7729a7..a24997dd3fd4 100644 --- a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp @@ -1366,8 +1366,16 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n"); continue; } - if (auto *I = dyn_cast<Instruction>(V)) - I->andIRFlags(&Inst); + if (auto *I = dyn_cast<Instruction>(V)) { + // If I being poison triggers UB, there is no need to drop those + // flags. Otherwise, only retain flags present on both I and Inst. + // TODO: Currently some fast-math flags are not treated as + // poison-generating even though they should. Until this is fixed, + // always retain flags present on both I and Inst for floating point + // instructions. 
+ if (isa<FPMathOperator>(I) || (I->hasPoisonGeneratingFlags() && !programUndefinedIfPoison(I))) + I->andIRFlags(&Inst); + } Inst.replaceAllUsesWith(V); salvageKnowledge(&Inst, &AC); removeMSSA(Inst); diff --git a/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp b/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp index e54a270fb276..44017b555769 100644 --- a/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp +++ b/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp @@ -13,10 +13,12 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/IR/CFG.h" #include "llvm/IR/InstrTypes.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/ValueHandle.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/FlattenCFG.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -24,11 +26,11 @@ using namespace llvm; #define DEBUG_TYPE "flattencfg" namespace { -struct FlattenCFGPass : public FunctionPass { +struct FlattenCFGLegacyPass : public FunctionPass { static char ID; // Pass identification, replacement for typeid public: - FlattenCFGPass() : FunctionPass(ID) { - initializeFlattenCFGPassPass(*PassRegistry::getPassRegistry()); + FlattenCFGLegacyPass() : FunctionPass(ID) { + initializeFlattenCFGLegacyPassPass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F) override; @@ -39,21 +41,10 @@ public: private: AliasAnalysis *AA; }; -} - -char FlattenCFGPass::ID = 0; -INITIALIZE_PASS_BEGIN(FlattenCFGPass, "flattencfg", "Flatten the CFG", false, - false) -INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_END(FlattenCFGPass, "flattencfg", "Flatten the CFG", false, - false) - -// Public interface to the FlattenCFG pass -FunctionPass *llvm::createFlattenCFGPass() { return new FlattenCFGPass(); } /// iterativelyFlattenCFG - Call FlattenCFG on all the blocks in the function, /// iterating until no more changes are made. 
-static bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) { +bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) { bool Changed = false; bool LocalChange = true; @@ -78,8 +69,22 @@ static bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) { } return Changed; } +} // namespace -bool FlattenCFGPass::runOnFunction(Function &F) { +char FlattenCFGLegacyPass::ID = 0; + +INITIALIZE_PASS_BEGIN(FlattenCFGLegacyPass, "flattencfg", "Flatten the CFG", + false, false) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_END(FlattenCFGLegacyPass, "flattencfg", "Flatten the CFG", + false, false) + +// Public interface to the FlattenCFG pass +FunctionPass *llvm::createFlattenCFGPass() { + return new FlattenCFGLegacyPass(); +} + +bool FlattenCFGLegacyPass::runOnFunction(Function &F) { AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); bool EverChanged = false; // iterativelyFlattenCFG can make some blocks dead. @@ -89,3 +94,15 @@ bool FlattenCFGPass::runOnFunction(Function &F) { } return EverChanged; } + +PreservedAnalyses FlattenCFGPass::run(Function &F, + FunctionAnalysisManager &AM) { + bool EverChanged = false; + AliasAnalysis *AA = &AM.getResult<AAManager>(F); + // iterativelyFlattenCFG can make some blocks dead. + while (iterativelyFlattenCFG(F, AA)) { + removeUnreachableBlocks(F); + EverChanged = true; + } + return EverChanged ? 
PreservedAnalyses::none() : PreservedAnalyses::all(); +} diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index 6f97f3e93123..bc792ca3d8da 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -107,11 +107,6 @@ static cl::opt<bool> ControlFlowHoisting( "licm-control-flow-hoisting", cl::Hidden, cl::init(false), cl::desc("Enable control flow (and PHI) hoisting in LICM")); -static cl::opt<unsigned> HoistSinkColdnessThreshold( - "licm-coldness-threshold", cl::Hidden, cl::init(4), - cl::desc("Relative coldness Threshold of hoisting/sinking destination " - "block for LICM to be considered beneficial")); - static cl::opt<uint32_t> MaxNumUsesTraversed( "licm-max-num-uses-traversed", cl::Hidden, cl::init(8), cl::desc("Max num uses visited for identifying load " @@ -819,35 +814,6 @@ public: }; } // namespace -// Hoisting/sinking instruction out of a loop isn't always beneficial. It's only -// only worthwhile if the destination block is actually colder than current -// block. -static bool worthSinkOrHoistInst(Instruction &I, BasicBlock *DstBlock, - OptimizationRemarkEmitter *ORE, - BlockFrequencyInfo *BFI) { - // Check block frequency only when runtime profile is available - // to avoid pathological cases. With static profile, lean towards - // hosting because it helps canonicalize the loop for vectorizer. 
- if (!DstBlock->getParent()->hasProfileData()) - return true; - - if (!HoistSinkColdnessThreshold || !BFI) - return true; - - BasicBlock *SrcBlock = I.getParent(); - if (BFI->getBlockFreq(DstBlock).getFrequency() / HoistSinkColdnessThreshold > - BFI->getBlockFreq(SrcBlock).getFrequency()) { - ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "SinkHoistInst", &I) - << "failed to sink or hoist instruction because containing block " - "has lower frequency than destination block"; - }); - return false; - } - - return true; -} - /// Walk the specified region of the CFG (defined by all blocks dominated by /// the specified block, and that are in the current loop) in depth first /// order w.r.t the DominatorTree. This allows us to visit definitions before @@ -909,7 +875,6 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, if (CurLoop->hasLoopInvariantOperands(&I) && canSinkOrHoistInst(I, AA, DT, CurLoop, /*CurAST*/ nullptr, MSSAU, true, &Flags, ORE) && - worthSinkOrHoistInst(I, CurLoop->getLoopPreheader(), ORE, BFI) && isSafeToExecuteUnconditionally( I, DT, TLI, CurLoop, SafetyInfo, ORE, CurLoop->getLoopPreheader()->getTerminator())) { @@ -1741,7 +1706,6 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, // First check if I is worth sinking for all uses. Sink only when it is worth // across all uses. SmallSetVector<User*, 8> Users(I.user_begin(), I.user_end()); - SmallVector<PHINode *, 8> ExitPNs; for (auto *UI : Users) { auto *User = cast<Instruction>(UI); @@ -1751,14 +1715,6 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, PHINode *PN = cast<PHINode>(User); assert(ExitBlockSet.count(PN->getParent()) && "The LCSSA PHI is not in an exit block!"); - if (!worthSinkOrHoistInst(I, PN->getParent(), ORE, BFI)) { - return Changed; - } - - ExitPNs.push_back(PN); - } - - for (auto *PN : ExitPNs) { // The PHI must be trivially replaceable. 
Instruction *New = sinkThroughTriviallyReplaceablePHI( diff --git a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp index 77d76609c926..57e36e5b9b90 100644 --- a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp @@ -224,8 +224,8 @@ bool LoopDataPrefetch::run() { bool MadeChange = false; for (Loop *I : *LI) - for (auto L = df_begin(I), LE = df_end(I); L != LE; ++L) - MadeChange |= runOnLoop(*L); + for (Loop *L : depth_first(I)) + MadeChange |= runOnLoop(L); return MadeChange; } diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 42da86a9ecf5..5d00fa56e888 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -786,9 +786,9 @@ bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL, Type *IntIdxTy = DL->getIndexType(StorePtr->getType()); const SCEV *StoreSizeSCEV = SE->getConstant(IntIdxTy, StoreSize); if (processLoopStridedStore(StorePtr, StoreSizeSCEV, - MaybeAlign(HeadStore->getAlignment()), - StoredVal, HeadStore, AdjacentStores, StoreEv, - BECount, IsNegStride)) { + MaybeAlign(HeadStore->getAlign()), StoredVal, + HeadStore, AdjacentStores, StoreEv, BECount, + IsNegStride)) { TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end()); Changed = true; } @@ -967,12 +967,22 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI, << "\n"); if (PositiveStrideSCEV != MemsetSizeSCEV) { - // TODO: folding can be done to the SCEVs - // The folding is to fold expressions that is covered by the loop guard - // at loop entry. After the folding, compare again and proceed - // optimization if equal. - LLVM_DEBUG(dbgs() << " SCEV don't match, abort\n"); - return false; + // If an expression is covered by the loop guard, compare again and + // proceed with optimization if equal. 
+ const SCEV *FoldedPositiveStride = + SE->applyLoopGuards(PositiveStrideSCEV, CurLoop); + const SCEV *FoldedMemsetSize = + SE->applyLoopGuards(MemsetSizeSCEV, CurLoop); + + LLVM_DEBUG(dbgs() << " Try to fold SCEV based on loop guard\n" + << " FoldedMemsetSize: " << *FoldedMemsetSize << "\n" + << " FoldedPositiveStride: " << *FoldedPositiveStride + << "\n"); + + if (FoldedPositiveStride != FoldedMemsetSize) { + LLVM_DEBUG(dbgs() << " SCEV don't match, abort\n"); + return false; + } } } diff --git a/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp b/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp index 56d66b93dd69..9d22eceb987f 100644 --- a/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp @@ -1456,16 +1456,12 @@ void LoopReroll::DAGRootTracker::replace(const SCEV *BackedgeTakenCount) { } // Remove instructions associated with non-base iterations. - for (BasicBlock::reverse_iterator J = Header->rbegin(), JE = Header->rend(); - J != JE;) { - unsigned I = Uses[&*J].find_first(); + for (Instruction &Inst : llvm::make_early_inc_range(llvm::reverse(*Header))) { + unsigned I = Uses[&Inst].find_first(); if (I > 0 && I < IL_All) { - LLVM_DEBUG(dbgs() << "LRR: removing: " << *J << "\n"); - J++->eraseFromParent(); - continue; + LLVM_DEBUG(dbgs() << "LRR: removing: " << Inst << "\n"); + Inst.eraseFromParent(); } - - ++J; } // Rewrite each BaseInst using SCEV. diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index a9a2266e1196..798af48c2337 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -6011,7 +6011,7 @@ struct SCEVDbgValueBuilder { // See setFinalExpression: prepend our opcodes on the start of any old // expression opcodes. 
assert(!DI.hasArgList()); - llvm::SmallVector<uint64_t, 6> FinalExpr(Expr.begin() + 2, Expr.end()); + llvm::SmallVector<uint64_t, 6> FinalExpr(llvm::drop_begin(Expr, 2)); auto *NewExpr = DIExpression::prependOpcodes(OldExpr, FinalExpr, /*StackValue*/ true); DI.setExpression(NewExpr); diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 39c8b65968aa..893928fb0560 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -1136,6 +1136,31 @@ static LoopUnrollResult tryToUnrollLoop( TransformationMode TM = hasUnrollTransformation(L); if (TM & TM_Disable) return LoopUnrollResult::Unmodified; + + // If this loop isn't forced to be unrolled, avoid unrolling it when the + // parent loop has an explicit unroll-and-jam pragma. This is to prevent + // automatic unrolling from interfering with the user requested + // transformation. + Loop *ParentL = L->getParentLoop(); + if (ParentL != NULL && + hasUnrollAndJamTransformation(ParentL) == TM_ForcedByUser && + hasUnrollTransformation(L) != TM_ForcedByUser) { + LLVM_DEBUG(dbgs() << "Not unrolling loop since parent loop has" + << " llvm.loop.unroll_and_jam.\n"); + return LoopUnrollResult::Unmodified; + } + + // If this loop isn't forced to be unrolled, avoid unrolling it when the + // loop has an explicit unroll-and-jam pragma. This is to prevent automatic + // unrolling from interfering with the user requested transformation. 
+ if (hasUnrollAndJamTransformation(L) == TM_ForcedByUser && + hasUnrollTransformation(L) != TM_ForcedByUser) { + LLVM_DEBUG( + dbgs() + << " Not unrolling loop since it has llvm.loop.unroll_and_jam.\n"); + return LoopUnrollResult::Unmodified; + } + if (!L->isLoopSimplifyForm()) { LLVM_DEBUG( dbgs() << " Not unrolling loop which is not in loop-simplify form.\n"); diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp index 91215cd19e2b..10a8742940b1 100644 --- a/llvm/lib/Transforms/Scalar/NewGVN.cpp +++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp @@ -638,6 +638,7 @@ class NewGVN { BitVector TouchedInstructions; DenseMap<const BasicBlock *, std::pair<unsigned, unsigned>> BlockInstRange; + mutable DenseMap<const IntrinsicInst *, const Value *> IntrinsicInstPred; #ifndef NDEBUG // Debugging for how many times each block and instruction got processed. @@ -794,7 +795,7 @@ private: BasicBlock *PHIBlock) const; const Expression *performSymbolicAggrValueEvaluation(Instruction *) const; ExprResult performSymbolicCmpEvaluation(Instruction *) const; - ExprResult performSymbolicPredicateInfoEvaluation(Instruction *) const; + ExprResult performSymbolicPredicateInfoEvaluation(IntrinsicInst *) const; // Congruence finding. bool someEquivalentDominates(const Instruction *, const Instruction *) const; @@ -815,6 +816,8 @@ private: // Ranking unsigned int getRank(const Value *) const; bool shouldSwapOperands(const Value *, const Value *) const; + bool shouldSwapOperandsForIntrinsic(const Value *, const Value *, + const IntrinsicInst *I) const; // Reachability handling. 
void updateReachableEdge(BasicBlock *, BasicBlock *); @@ -1552,7 +1555,7 @@ const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) const { } NewGVN::ExprResult -NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const { +NewGVN::performSymbolicPredicateInfoEvaluation(IntrinsicInst *I) const { auto *PI = PredInfo->getPredicateInfoFor(I); if (!PI) return ExprResult::none(); @@ -1572,7 +1575,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const { Value *AdditionallyUsedValue = CmpOp0; // Sort the ops. - if (shouldSwapOperands(FirstOp, SecondOp)) { + if (shouldSwapOperandsForIntrinsic(FirstOp, SecondOp, I)) { std::swap(FirstOp, SecondOp); Predicate = CmpInst::getSwappedPredicate(Predicate); AdditionallyUsedValue = CmpOp1; @@ -1598,7 +1601,7 @@ NewGVN::ExprResult NewGVN::performSymbolicCallEvaluation(Instruction *I) const { // Intrinsics with the returned attribute are copies of arguments. if (auto *ReturnedValue = II->getReturnedArgOperand()) { if (II->getIntrinsicID() == Intrinsic::ssa_copy) - if (auto Res = performSymbolicPredicateInfoEvaluation(I)) + if (auto Res = performSymbolicPredicateInfoEvaluation(II)) return Res; return ExprResult::some(createVariableOrConstant(ReturnedValue)); } @@ -2951,6 +2954,7 @@ void NewGVN::cleanupTables() { PredicateToUsers.clear(); MemoryToUsers.clear(); RevisitOnReachabilityChange.clear(); + IntrinsicInstPred.clear(); } // Assign local DFS number mapping to instructions, and leave space for Value @@ -4152,6 +4156,29 @@ bool NewGVN::shouldSwapOperands(const Value *A, const Value *B) const { return std::make_pair(getRank(A), A) > std::make_pair(getRank(B), B); } +bool NewGVN::shouldSwapOperandsForIntrinsic(const Value *A, const Value *B, + const IntrinsicInst *I) const { + auto LookupResult = IntrinsicInstPred.find(I); + if (shouldSwapOperands(A, B)) { + if (LookupResult == IntrinsicInstPred.end()) + IntrinsicInstPred.insert({I, B}); + else + LookupResult->second = B; + return true; + } 
+ + if (LookupResult != IntrinsicInstPred.end()) { + auto *SeenPredicate = LookupResult->second; + if (SeenPredicate) { + if (SeenPredicate == B) + return true; + else + LookupResult->second = nullptr; + } + } + return false; +} + namespace { class NewGVNLegacyPass : public FunctionPass { diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index 2d3490b2d29e..e12eca0ed287 100644 --- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -1359,16 +1359,6 @@ static constexpr Attribute::AttrKind FnAttrsToStrip[] = Attribute::InaccessibleMemOrArgMemOnly, Attribute::NoSync, Attribute::NoFree}; -// List of all parameter and return attributes which must be stripped when -// lowering from the abstract machine model. Note that we list attributes -// here which aren't valid as return attributes, that is okay. There are -// also some additional attributes with arguments which are handled -// explicitly and are not in this list. -static constexpr Attribute::AttrKind ParamAttrsToStrip[] = - {Attribute::ReadNone, Attribute::ReadOnly, Attribute::WriteOnly, - Attribute::NoAlias, Attribute::NoFree}; - - // Create new attribute set containing only attributes which can be transferred // from original call to the safepoint. static AttributeList legalizeCallAttributes(LLVMContext &Ctx, @@ -2650,24 +2640,19 @@ static bool insertParsePoints(Function &F, DominatorTree &DT, return !Records.empty(); } -// Handles both return values and arguments for Functions and calls. -template <typename AttrHolder> -static void RemoveNonValidAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH, - unsigned Index) { +// List of all parameter and return attributes which must be stripped when +// lowering from the abstract machine model. Note that we list attributes +// here which aren't valid as return attributes, that is okay. 
+static AttrBuilder getParamAndReturnAttributesToRemove() { AttrBuilder R; - AttributeSet AS = AH.getAttributes().getAttributes(Index); - if (AS.getDereferenceableBytes()) - R.addAttribute(Attribute::get(Ctx, Attribute::Dereferenceable, - AS.getDereferenceableBytes())); - if (AS.getDereferenceableOrNullBytes()) - R.addAttribute(Attribute::get(Ctx, Attribute::DereferenceableOrNull, - AS.getDereferenceableOrNullBytes())); - for (auto Attr : ParamAttrsToStrip) - if (AS.hasAttribute(Attr)) - R.addAttribute(Attr); - - if (!R.empty()) - AH.setAttributes(AH.getAttributes().removeAttributesAtIndex(Ctx, Index, R)); + R.addDereferenceableAttr(1); + R.addDereferenceableOrNullAttr(1); + R.addAttribute(Attribute::ReadNone); + R.addAttribute(Attribute::ReadOnly); + R.addAttribute(Attribute::WriteOnly); + R.addAttribute(Attribute::NoAlias); + R.addAttribute(Attribute::NoFree); + return R; } static void stripNonValidAttributesFromPrototype(Function &F) { @@ -2683,13 +2668,13 @@ static void stripNonValidAttributesFromPrototype(Function &F) { return; } + AttrBuilder R = getParamAndReturnAttributesToRemove(); for (Argument &A : F.args()) if (isa<PointerType>(A.getType())) - RemoveNonValidAttrAtIndex(Ctx, F, - A.getArgNo() + AttributeList::FirstArgIndex); + F.removeParamAttrs(A.getArgNo(), R); if (isa<PointerType>(F.getReturnType())) - RemoveNonValidAttrAtIndex(Ctx, F, AttributeList::ReturnIndex); + F.removeRetAttrs(R); for (auto Attr : FnAttrsToStrip) F.removeFnAttr(Attr); @@ -2757,13 +2742,13 @@ static void stripNonValidDataFromBody(Function &F) { stripInvalidMetadataFromInstruction(I); + AttrBuilder R = getParamAndReturnAttributesToRemove(); if (auto *Call = dyn_cast<CallBase>(&I)) { for (int i = 0, e = Call->arg_size(); i != e; i++) if (isa<PointerType>(Call->getArgOperand(i)->getType())) - RemoveNonValidAttrAtIndex(Ctx, *Call, - i + AttributeList::FirstArgIndex); + Call->removeParamAttrs(i, R); if (isa<PointerType>(Call->getType())) - RemoveNonValidAttrAtIndex(Ctx, *Call, 
AttributeList::ReturnIndex); + Call->removeRetAttrs(R); } } diff --git a/llvm/lib/Transforms/Scalar/SCCP.cpp b/llvm/lib/Transforms/Scalar/SCCP.cpp index 28e00c873361..ff2f8a25f379 100644 --- a/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -101,8 +101,7 @@ static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) { Constant *Const = nullptr; if (V->getType()->isStructTy()) { std::vector<ValueLatticeElement> IVs = Solver.getStructLatticeValueFor(V); - if (any_of(IVs, - [](const ValueLatticeElement &LV) { return isOverdefined(LV); })) + if (llvm::any_of(IVs, isOverdefined)) return false; std::vector<Constant *> ConstVals; auto *ST = cast<StructType>(V->getType()); diff --git a/llvm/lib/Transforms/Scalar/Scalar.cpp b/llvm/lib/Transforms/Scalar/Scalar.cpp index a041af0d70d0..f9650efc051f 100644 --- a/llvm/lib/Transforms/Scalar/Scalar.cpp +++ b/llvm/lib/Transforms/Scalar/Scalar.cpp @@ -54,7 +54,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeMakeGuardsExplicitLegacyPassPass(Registry); initializeGVNHoistLegacyPassPass(Registry); initializeGVNSinkLegacyPassPass(Registry); - initializeFlattenCFGPassPass(Registry); + initializeFlattenCFGLegacyPassPass(Registry); initializeIRCELegacyPassPass(Registry); initializeIndVarSimplifyLegacyPassPass(Registry); initializeInferAddressSpacesPass(Registry); diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index ffa2f9adb978..d23925042b0a 100644 --- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -648,13 +648,13 @@ Value *ConstantOffsetExtractor::applyExts(Value *V) { Value *Current = V; // ExtInsts is built in the use-def order. Therefore, we apply them to V // in the reversed order. 
- for (auto I = ExtInsts.rbegin(), E = ExtInsts.rend(); I != E; ++I) { + for (CastInst *I : llvm::reverse(ExtInsts)) { if (Constant *C = dyn_cast<Constant>(Current)) { // If Current is a constant, apply s/zext using ConstantExpr::getCast. // ConstantExpr::getCast emits a ConstantInt if C is a ConstantInt. - Current = ConstantExpr::getCast((*I)->getOpcode(), C, (*I)->getType()); + Current = ConstantExpr::getCast(I->getOpcode(), C, I->getType()); } else { - Instruction *Ext = (*I)->clone(); + Instruction *Ext = I->clone(); Ext->setOperand(0, Current); Ext->insertBefore(IP); Current = Ext; diff --git a/llvm/lib/Transforms/Utils/CodeLayout.cpp b/llvm/lib/Transforms/Utils/CodeLayout.cpp new file mode 100644 index 000000000000..dfb9f608eab2 --- /dev/null +++ b/llvm/lib/Transforms/Utils/CodeLayout.cpp @@ -0,0 +1,942 @@ +//===- CodeLayout.cpp - Implementation of code layout algorithms ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// ExtTSP - layout of basic blocks with i-cache optimization. +// +// The algorithm tries to find a layout of nodes (basic blocks) of a given CFG +// optimizing jump locality and thus processor I-cache utilization. This is +// achieved via increasing the number of fall-through jumps and co-locating +// frequently executed nodes together. The name follows the underlying +// optimization problem, Extended-TSP, which is a generalization of classical +// (maximum) Traveling Salesmen Problem. +// +// The algorithm is a greedy heuristic that works with chains (ordered lists) +// of basic blocks. Initially all chains are isolated basic blocks. 
On every +// iteration, we pick a pair of chains whose merging yields the biggest increase +// in the ExtTSP score, which models how i-cache "friendly" a specific chain is. +// A pair of chains giving the maximum gain is merged into a new chain. The +// procedure stops when there is only one chain left, or when merging does not +// increase ExtTSP. In the latter case, the remaining chains are sorted by +// density in the decreasing order. +// +// An important aspect is the way two chains are merged. Unlike earlier +// algorithms (e.g., based on the approach of Pettis-Hansen), two +// chains, X and Y, are first split into three, X1, X2, and Y. Then we +// consider all possible ways of gluing the three chains (e.g., X1YX2, X1X2Y, +// X2X1Y, X2YX1, YX1X2, YX2X1) and choose the one producing the largest score. +// This improves the quality of the final result (the search space is larger) +// while keeping the implementation sufficiently fast. +// +// Reference: +// * A. Newell and S. Pupyrev, Improved Basic Block Reordering, +// IEEE Transactions on Computers, 2020 +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/CodeLayout.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; +#define DEBUG_TYPE "code-layout" + +// Algorithm-specific constants. The values are tuned for the best performance +// of large-scale front-end bound binaries. 
+static cl::opt<double>
+    ForwardWeight("ext-tsp-forward-weight", cl::Hidden, cl::init(0.1),
+                  cl::desc("The weight of forward jumps for ExtTSP value"));
+
+static cl::opt<double>
+    BackwardWeight("ext-tsp-backward-weight", cl::Hidden, cl::init(0.1),
+                   cl::desc("The weight of backward jumps for ExtTSP value"));
+
+static cl::opt<unsigned> ForwardDistance(
+    "ext-tsp-forward-distance", cl::Hidden, cl::init(1024),
+    cl::desc("The maximum distance (in bytes) of a forward jump for ExtTSP"));
+
+static cl::opt<unsigned> BackwardDistance(
+    "ext-tsp-backward-distance", cl::Hidden, cl::init(640),
+    cl::desc("The maximum distance (in bytes) of a backward jump for ExtTSP"));
+
+// The maximum size of a chain for splitting. Larger values of the threshold
+// may yield better quality at the cost of worse run-time.
+static cl::opt<unsigned> ChainSplitThreshold(
+    "ext-tsp-chain-split-threshold", cl::Hidden, cl::init(128),
+    cl::desc("The maximum size of a chain to apply splitting"));
+
+// The option enables splitting (large) chains along in-coming and out-going
+// jumps. This typically results in a better quality.
+static cl::opt<bool> EnableChainSplitAlongJumps(
+    "ext-tsp-enable-chain-split-along-jumps", cl::Hidden, cl::init(true),
+    cl::desc("The maximum size of a chain to apply splitting"));
+
+namespace {
+
+// Epsilon for comparison of doubles.
+constexpr double EPS = 1e-8;
+
+// Compute the Ext-TSP score for a jump between a given pair of blocks,
+// using their sizes, (estimated) addresses and the jump execution count.
+double extTSPScore(uint64_t SrcAddr, uint64_t SrcSize, uint64_t DstAddr,
+                   uint64_t Count) {
+  // Fallthrough
+  if (SrcAddr + SrcSize == DstAddr) {
+    // Assume that FallthroughWeight = 1.0 after normalization
+    return static_cast<double>(Count);
+  }
+  // Forward
+  if (SrcAddr + SrcSize < DstAddr) {
+    const auto Dist = DstAddr - (SrcAddr + SrcSize);
+    if (Dist <= ForwardDistance) {
+      double Prob = 1.0 - static_cast<double>(Dist) / ForwardDistance;
+      return ForwardWeight * Prob * Count;
+    }
+    return 0;
+  }
+  // Backward
+  const auto Dist = SrcAddr + SrcSize - DstAddr;
+  if (Dist <= BackwardDistance) {
+    double Prob = 1.0 - static_cast<double>(Dist) / BackwardDistance;
+    return BackwardWeight * Prob * Count;
+  }
+  return 0;
+}
+
+/// A type of merging two chains, X and Y. The former chain is split into
+/// X1 and X2 and then concatenated with Y in the order specified by the type.
+enum class MergeTypeTy : int { X_Y, X1_Y_X2, Y_X2_X1, X2_X1_Y };
+
+/// The gain of merging two chains, that is, the Ext-TSP score of the merge
+/// together with the corresponding merge 'type' and 'offset'.
+class MergeGainTy {
+public:
+  explicit MergeGainTy() {}
+  explicit MergeGainTy(double Score, size_t MergeOffset, MergeTypeTy MergeType)
+      : Score(Score), MergeOffset(MergeOffset), MergeType(MergeType) {}
+
+  double score() const { return Score; }
+
+  size_t mergeOffset() const { return MergeOffset; }
+
+  MergeTypeTy mergeType() const { return MergeType; }
+
+  // Returns 'true' iff Other is preferred over this.
+  bool operator<(const MergeGainTy &Other) const {
+    return (Other.Score > EPS && Other.Score > Score + EPS);
+  }
+
+  // Update the current gain if Other is preferred over this.
+ void updateIfLessThan(const MergeGainTy &Other) { + if (*this < Other) + *this = Other; + } + +private: + double Score{-1.0}; + size_t MergeOffset{0}; + MergeTypeTy MergeType{MergeTypeTy::X_Y}; +}; + +class Block; +class Jump; +class Chain; +class ChainEdge; + +/// A node in the graph, typically corresponding to a basic block in CFG. +class Block { +public: + Block(const Block &) = delete; + Block(Block &&) = default; + Block &operator=(const Block &) = delete; + Block &operator=(Block &&) = default; + + // The original index of the block in CFG. + size_t Index{0}; + // The index of the block in the current chain. + size_t CurIndex{0}; + // Size of the block in the binary. + uint64_t Size{0}; + // Execution count of the block in the profile data. + uint64_t ExecutionCount{0}; + // Current chain of the node. + Chain *CurChain{nullptr}; + // An offset of the block in the current chain. + mutable uint64_t EstimatedAddr{0}; + // Forced successor of the block in CFG. + Block *ForcedSucc{nullptr}; + // Forced predecessor of the block in CFG. + Block *ForcedPred{nullptr}; + // Outgoing jumps from the block. + std::vector<Jump *> OutJumps; + // Incoming jumps to the block. + std::vector<Jump *> InJumps; + +public: + explicit Block(size_t Index, uint64_t Size_, uint64_t EC) + : Index(Index), Size(Size_), ExecutionCount(EC) {} + bool isEntry() const { return Index == 0; } +}; + +/// An arc in the graph, typically corresponding to a jump between two blocks. +class Jump { +public: + Jump(const Jump &) = delete; + Jump(Jump &&) = default; + Jump &operator=(const Jump &) = delete; + Jump &operator=(Jump &&) = default; + + // Source block of the jump. + Block *Source; + // Target block of the jump. + Block *Target; + // Execution count of the arc in the profile data. 
+ uint64_t ExecutionCount{0}; + +public: + explicit Jump(Block *Source, Block *Target, uint64_t ExecutionCount) + : Source(Source), Target(Target), ExecutionCount(ExecutionCount) {} +}; + +/// A chain (ordered sequence) of blocks. +class Chain { +public: + Chain(const Chain &) = delete; + Chain(Chain &&) = default; + Chain &operator=(const Chain &) = delete; + Chain &operator=(Chain &&) = default; + + explicit Chain(uint64_t Id, Block *Block) + : Id(Id), Score(0), Blocks(1, Block) {} + + uint64_t id() const { return Id; } + + bool isEntry() const { return Blocks[0]->Index == 0; } + + double score() const { return Score; } + + void setScore(double NewScore) { Score = NewScore; } + + const std::vector<Block *> &blocks() const { return Blocks; } + + const std::vector<std::pair<Chain *, ChainEdge *>> &edges() const { + return Edges; + } + + ChainEdge *getEdge(Chain *Other) const { + for (auto It : Edges) { + if (It.first == Other) + return It.second; + } + return nullptr; + } + + void removeEdge(Chain *Other) { + auto It = Edges.begin(); + while (It != Edges.end()) { + if (It->first == Other) { + Edges.erase(It); + return; + } + It++; + } + } + + void addEdge(Chain *Other, ChainEdge *Edge) { + Edges.push_back(std::make_pair(Other, Edge)); + } + + void merge(Chain *Other, const std::vector<Block *> &MergedBlocks) { + Blocks = MergedBlocks; + // Update the block's chains + for (size_t Idx = 0; Idx < Blocks.size(); Idx++) { + Blocks[Idx]->CurChain = this; + Blocks[Idx]->CurIndex = Idx; + } + } + + void mergeEdges(Chain *Other); + + void clear() { + Blocks.clear(); + Blocks.shrink_to_fit(); + Edges.clear(); + Edges.shrink_to_fit(); + } + +private: + // Unique chain identifier. + uint64_t Id; + // Cached ext-tsp score for the chain. + double Score; + // Blocks of the chain. + std::vector<Block *> Blocks; + // Adjacent chains and corresponding edges (lists of jumps). 
+  std::vector<std::pair<Chain *, ChainEdge *>> Edges;
+};
+
+/// An edge in CFG representing jumps between two chains.
+/// When blocks are merged into chains, the edges are combined too so that
+/// there is always at most one edge between a pair of chains
+class ChainEdge {
+public:
+  ChainEdge(const ChainEdge &) = delete;
+  ChainEdge(ChainEdge &&) = default;
+  ChainEdge &operator=(const ChainEdge &) = delete;
+  ChainEdge &operator=(ChainEdge &&) = default;
+
+  explicit ChainEdge(Jump *Jump)
+      : SrcChain(Jump->Source->CurChain), DstChain(Jump->Target->CurChain),
+        Jumps(1, Jump) {}
+
+  const std::vector<Jump *> &jumps() const { return Jumps; }
+
+  void changeEndpoint(Chain *From, Chain *To) {
+    if (From == SrcChain)
+      SrcChain = To;
+    if (From == DstChain)
+      DstChain = To;
+  }
+
+  void appendJump(Jump *Jump) { Jumps.push_back(Jump); }
+
+  void moveJumps(ChainEdge *Other) {
+    Jumps.insert(Jumps.end(), Other->Jumps.begin(), Other->Jumps.end());
+    Other->Jumps.clear();
+    Other->Jumps.shrink_to_fit();
+  }
+
+  bool hasCachedMergeGain(Chain *Src, Chain *Dst) const {
+    return Src == SrcChain ? CacheValidForward : CacheValidBackward;
+  }
+
+  MergeGainTy getCachedMergeGain(Chain *Src, Chain *Dst) const {
+    return Src == SrcChain ? CachedGainForward : CachedGainBackward;
+  }
+
+  void setCachedMergeGain(Chain *Src, Chain *Dst, MergeGainTy MergeGain) {
+    if (Src == SrcChain) {
+      CachedGainForward = MergeGain;
+      CacheValidForward = true;
+    } else {
+      CachedGainBackward = MergeGain;
+      CacheValidBackward = true;
+    }
+  }
+
+  void invalidateCache() {
+    CacheValidForward = false;
+    CacheValidBackward = false;
+  }
+
+private:
+  // Source chain.
+  Chain *SrcChain{nullptr};
+  // Destination chain.
+  Chain *DstChain{nullptr};
+  // Original jumps in the binary with corresponding execution counts.
+  std::vector<Jump *> Jumps;
+  // Cached ext-tsp value for merging the pair of chains.
+ // Since the gain of merging (Src, Dst) and (Dst, Src) might be different, + // we store both values here. + MergeGainTy CachedGainForward; + MergeGainTy CachedGainBackward; + // Whether the cached value must be recomputed. + bool CacheValidForward{false}; + bool CacheValidBackward{false}; +}; + +void Chain::mergeEdges(Chain *Other) { + assert(this != Other && "cannot merge a chain with itself"); + + // Update edges adjacent to chain Other + for (auto EdgeIt : Other->Edges) { + const auto DstChain = EdgeIt.first; + const auto DstEdge = EdgeIt.second; + const auto TargetChain = DstChain == Other ? this : DstChain; + auto CurEdge = getEdge(TargetChain); + if (CurEdge == nullptr) { + DstEdge->changeEndpoint(Other, this); + this->addEdge(TargetChain, DstEdge); + if (DstChain != this && DstChain != Other) { + DstChain->addEdge(this, DstEdge); + } + } else { + CurEdge->moveJumps(DstEdge); + } + // Cleanup leftover edge + if (DstChain != Other) { + DstChain->removeEdge(Other); + } + } +} + +using BlockIter = std::vector<Block *>::const_iterator; + +/// A wrapper around three chains of blocks; it is used to avoid extra +/// instantiation of the vectors. 
+class MergedChain { +public: + MergedChain(BlockIter Begin1, BlockIter End1, BlockIter Begin2 = BlockIter(), + BlockIter End2 = BlockIter(), BlockIter Begin3 = BlockIter(), + BlockIter End3 = BlockIter()) + : Begin1(Begin1), End1(End1), Begin2(Begin2), End2(End2), Begin3(Begin3), + End3(End3) {} + + template <typename F> void forEach(const F &Func) const { + for (auto It = Begin1; It != End1; It++) + Func(*It); + for (auto It = Begin2; It != End2; It++) + Func(*It); + for (auto It = Begin3; It != End3; It++) + Func(*It); + } + + std::vector<Block *> getBlocks() const { + std::vector<Block *> Result; + Result.reserve(std::distance(Begin1, End1) + std::distance(Begin2, End2) + + std::distance(Begin3, End3)); + Result.insert(Result.end(), Begin1, End1); + Result.insert(Result.end(), Begin2, End2); + Result.insert(Result.end(), Begin3, End3); + return Result; + } + + const Block *getFirstBlock() const { return *Begin1; } + +private: + BlockIter Begin1; + BlockIter End1; + BlockIter Begin2; + BlockIter End2; + BlockIter Begin3; + BlockIter End3; +}; + +/// The implementation of the ExtTSP algorithm. +class ExtTSPImpl { + using EdgeT = std::pair<uint64_t, uint64_t>; + using EdgeCountMap = DenseMap<EdgeT, uint64_t>; + +public: + ExtTSPImpl(size_t NumNodes, const std::vector<uint64_t> &NodeSizes, + const std::vector<uint64_t> &NodeCounts, + const EdgeCountMap &EdgeCounts) + : NumNodes(NumNodes) { + initialize(NodeSizes, NodeCounts, EdgeCounts); + } + + /// Run the algorithm and return an optimized ordering of blocks. + void run(std::vector<uint64_t> &Result) { + // Pass 1: Merge blocks with their mutually forced successors + mergeForcedPairs(); + + // Pass 2: Merge pairs of chains while improving the ExtTSP objective + mergeChainPairs(); + + // Pass 3: Merge cold blocks to reduce code size + mergeColdChains(); + + // Collect blocks from all chains + concatChains(Result); + } + +private: + /// Initialize the algorithm's data structures. 
+ void initialize(const std::vector<uint64_t> &NodeSizes, + const std::vector<uint64_t> &NodeCounts, + const EdgeCountMap &EdgeCounts) { + // Initialize blocks + AllBlocks.reserve(NumNodes); + for (uint64_t Node = 0; Node < NumNodes; Node++) { + uint64_t Size = std::max<uint64_t>(NodeSizes[Node], 1ULL); + uint64_t ExecutionCount = NodeCounts[Node]; + // The execution count of the entry block is set to at least 1 + if (Node == 0 && ExecutionCount == 0) + ExecutionCount = 1; + AllBlocks.emplace_back(Node, Size, ExecutionCount); + } + + // Initialize jumps between blocks + SuccNodes = std::vector<std::vector<uint64_t>>(NumNodes); + PredNodes = std::vector<std::vector<uint64_t>>(NumNodes); + AllJumps.reserve(EdgeCounts.size()); + for (auto It : EdgeCounts) { + auto Pred = It.first.first; + auto Succ = It.first.second; + // Ignore self-edges + if (Pred == Succ) + continue; + + SuccNodes[Pred].push_back(Succ); + PredNodes[Succ].push_back(Pred); + auto ExecutionCount = It.second; + if (ExecutionCount > 0) { + auto &Block = AllBlocks[Pred]; + auto &SuccBlock = AllBlocks[Succ]; + AllJumps.emplace_back(&Block, &SuccBlock, ExecutionCount); + SuccBlock.InJumps.push_back(&AllJumps.back()); + Block.OutJumps.push_back(&AllJumps.back()); + } + } + + // Initialize chains + AllChains.reserve(NumNodes); + HotChains.reserve(NumNodes); + for (auto &Block : AllBlocks) { + AllChains.emplace_back(Block.Index, &Block); + Block.CurChain = &AllChains.back(); + if (Block.ExecutionCount > 0) { + HotChains.push_back(&AllChains.back()); + } + } + + // Initialize chain edges + AllEdges.reserve(AllJumps.size()); + for (auto &Block : AllBlocks) { + for (auto &Jump : Block.OutJumps) { + const auto SuccBlock = Jump->Target; + auto CurEdge = Block.CurChain->getEdge(SuccBlock->CurChain); + // this edge is already present in the graph + if (CurEdge != nullptr) { + assert(SuccBlock->CurChain->getEdge(Block.CurChain) != nullptr); + CurEdge->appendJump(Jump); + continue; + } + // this is a new edge + 
AllEdges.emplace_back(Jump); + Block.CurChain->addEdge(SuccBlock->CurChain, &AllEdges.back()); + SuccBlock->CurChain->addEdge(Block.CurChain, &AllEdges.back()); + } + } + } + + /// For a pair of blocks, A and B, block B is the forced successor of A, + /// if (i) all jumps (based on profile) from A goes to B and (ii) all jumps + /// to B are from A. Such blocks should be adjacent in the optimal ordering; + /// the method finds and merges such pairs of blocks. + void mergeForcedPairs() { + // Find fallthroughs based on edge weights + for (auto &Block : AllBlocks) { + if (SuccNodes[Block.Index].size() == 1 && + PredNodes[SuccNodes[Block.Index][0]].size() == 1 && + SuccNodes[Block.Index][0] != 0) { + size_t SuccIndex = SuccNodes[Block.Index][0]; + Block.ForcedSucc = &AllBlocks[SuccIndex]; + AllBlocks[SuccIndex].ForcedPred = &Block; + } + } + + // There might be 'cycles' in the forced dependencies, since profile + // data isn't 100% accurate. Typically this is observed in loops, when the + // loop edges are the hottest successors for the basic blocks of the loop. + // Break the cycles by choosing the block with the smallest index as the + // head. This helps to keep the original order of the loops, which likely + // have already been rotated in the optimized manner. 
+ for (auto &Block : AllBlocks) { + if (Block.ForcedSucc == nullptr || Block.ForcedPred == nullptr) + continue; + + auto SuccBlock = Block.ForcedSucc; + while (SuccBlock != nullptr && SuccBlock != &Block) { + SuccBlock = SuccBlock->ForcedSucc; + } + if (SuccBlock == nullptr) + continue; + // Break the cycle + AllBlocks[Block.ForcedPred->Index].ForcedSucc = nullptr; + Block.ForcedPred = nullptr; + } + + // Merge blocks with their fallthrough successors + for (auto &Block : AllBlocks) { + if (Block.ForcedPred == nullptr && Block.ForcedSucc != nullptr) { + auto CurBlock = &Block; + while (CurBlock->ForcedSucc != nullptr) { + const auto NextBlock = CurBlock->ForcedSucc; + mergeChains(Block.CurChain, NextBlock->CurChain, 0, MergeTypeTy::X_Y); + CurBlock = NextBlock; + } + } + } + } + + /// Merge pairs of chains while improving the ExtTSP objective. + void mergeChainPairs() { + /// Deterministically compare pairs of chains + auto compareChainPairs = [](const Chain *A1, const Chain *B1, + const Chain *A2, const Chain *B2) { + if (A1 != A2) + return A1->id() < A2->id(); + return B1->id() < B2->id(); + }; + + while (HotChains.size() > 1) { + Chain *BestChainPred = nullptr; + Chain *BestChainSucc = nullptr; + auto BestGain = MergeGainTy(); + // Iterate over all pairs of chains + for (auto ChainPred : HotChains) { + // Get candidates for merging with the current chain + for (auto EdgeIter : ChainPred->edges()) { + auto ChainSucc = EdgeIter.first; + auto ChainEdge = EdgeIter.second; + // Ignore loop edges + if (ChainPred == ChainSucc) + continue; + + // Compute the gain of merging the two chains + auto CurGain = getBestMergeGain(ChainPred, ChainSucc, ChainEdge); + if (CurGain.score() <= EPS) + continue; + + if (BestGain < CurGain || + (std::abs(CurGain.score() - BestGain.score()) < EPS && + compareChainPairs(ChainPred, ChainSucc, BestChainPred, + BestChainSucc))) { + BestGain = CurGain; + BestChainPred = ChainPred; + BestChainSucc = ChainSucc; + } + } + } + + // Stop merging 
when there is no improvement + if (BestGain.score() <= EPS) + break; + + // Merge the best pair of chains + mergeChains(BestChainPred, BestChainSucc, BestGain.mergeOffset(), + BestGain.mergeType()); + } + } + + /// Merge cold blocks to reduce code size. + void mergeColdChains() { + for (size_t SrcBB = 0; SrcBB < NumNodes; SrcBB++) { + // Iterating over neighbors in the reverse order to make sure original + // fallthrough jumps are merged first + size_t NumSuccs = SuccNodes[SrcBB].size(); + for (size_t Idx = 0; Idx < NumSuccs; Idx++) { + auto DstBB = SuccNodes[SrcBB][NumSuccs - Idx - 1]; + auto SrcChain = AllBlocks[SrcBB].CurChain; + auto DstChain = AllBlocks[DstBB].CurChain; + if (SrcChain != DstChain && !DstChain->isEntry() && + SrcChain->blocks().back()->Index == SrcBB && + DstChain->blocks().front()->Index == DstBB) { + mergeChains(SrcChain, DstChain, 0, MergeTypeTy::X_Y); + } + } + } + } + + /// Compute the Ext-TSP score for a given block order and a list of jumps. + double extTSPScore(const MergedChain &MergedBlocks, + const std::vector<Jump *> &Jumps) const { + if (Jumps.empty()) + return 0.0; + uint64_t CurAddr = 0; + MergedBlocks.forEach([&](const Block *BB) { + BB->EstimatedAddr = CurAddr; + CurAddr += BB->Size; + }); + + double Score = 0; + for (auto &Jump : Jumps) { + const auto SrcBlock = Jump->Source; + const auto DstBlock = Jump->Target; + Score += ::extTSPScore(SrcBlock->EstimatedAddr, SrcBlock->Size, + DstBlock->EstimatedAddr, Jump->ExecutionCount); + } + return Score; + } + + /// Compute the gain of merging two chains. + /// + /// The function considers all possible ways of merging two chains and + /// computes the one having the largest increase in ExtTSP objective. The + /// result is a pair with the first element being the gain and the second + /// element being the corresponding merging type. 
+ MergeGainTy getBestMergeGain(Chain *ChainPred, Chain *ChainSucc, + ChainEdge *Edge) const { + if (Edge->hasCachedMergeGain(ChainPred, ChainSucc)) { + return Edge->getCachedMergeGain(ChainPred, ChainSucc); + } + + // Precompute jumps between ChainPred and ChainSucc + auto Jumps = Edge->jumps(); + auto EdgePP = ChainPred->getEdge(ChainPred); + if (EdgePP != nullptr) { + Jumps.insert(Jumps.end(), EdgePP->jumps().begin(), EdgePP->jumps().end()); + } + assert(!Jumps.empty() && "trying to merge chains w/o jumps"); + + // The object holds the best currently chosen gain of merging the two chains + MergeGainTy Gain = MergeGainTy(); + + /// Given a merge offset and a list of merge types, try to merge two chains + /// and update Gain with a better alternative + auto tryChainMerging = [&](size_t Offset, + const std::vector<MergeTypeTy> &MergeTypes) { + // Skip merging corresponding to concatenation w/o splitting + if (Offset == 0 || Offset == ChainPred->blocks().size()) + return; + // Skip merging if it breaks Forced successors + auto BB = ChainPred->blocks()[Offset - 1]; + if (BB->ForcedSucc != nullptr) + return; + // Apply the merge, compute the corresponding gain, and update the best + // value, if the merge is beneficial + for (auto &MergeType : MergeTypes) { + Gain.updateIfLessThan( + computeMergeGain(ChainPred, ChainSucc, Jumps, Offset, MergeType)); + } + }; + + // Try to concatenate two chains w/o splitting + Gain.updateIfLessThan( + computeMergeGain(ChainPred, ChainSucc, Jumps, 0, MergeTypeTy::X_Y)); + + if (EnableChainSplitAlongJumps) { + // Attach (a part of) ChainPred before the first block of ChainSucc + for (auto &Jump : ChainSucc->blocks().front()->InJumps) { + const auto SrcBlock = Jump->Source; + if (SrcBlock->CurChain != ChainPred) + continue; + size_t Offset = SrcBlock->CurIndex + 1; + tryChainMerging(Offset, {MergeTypeTy::X1_Y_X2, MergeTypeTy::X2_X1_Y}); + } + + // Attach (a part of) ChainPred after the last block of ChainSucc + for (auto &Jump : 
ChainSucc->blocks().back()->OutJumps) {
+        const auto DstBlock = Jump->Source;
+        if (DstBlock->CurChain != ChainPred)
+          continue;
+        size_t Offset = DstBlock->CurIndex;
+        tryChainMerging(Offset, {MergeTypeTy::X1_Y_X2, MergeTypeTy::Y_X2_X1});
+      }
+    }
+
+    // Try to break ChainPred in various ways and concatenate with ChainSucc
+    if (ChainPred->blocks().size() <= ChainSplitThreshold) {
+      for (size_t Offset = 1; Offset < ChainPred->blocks().size(); Offset++) {
+        // Try to split the chain in different ways. In practice, applying
+        // X2_Y_X1 merging almost never provides benefits; thus, we exclude
+        // it from consideration to reduce the search space
+        tryChainMerging(Offset, {MergeTypeTy::X1_Y_X2, MergeTypeTy::Y_X2_X1,
+                                 MergeTypeTy::X2_X1_Y});
+      }
+    }
+    Edge->setCachedMergeGain(ChainPred, ChainSucc, Gain);
+    return Gain;
+  }
+
+  /// Compute the score gain of merging two chains, respecting a given
+  /// merge 'type' and 'offset'.
+  ///
+  /// The two chains are not modified in the method.
+  MergeGainTy computeMergeGain(const Chain *ChainPred, const Chain *ChainSucc,
+                               const std::vector<Jump *> &Jumps,
+                               size_t MergeOffset,
+                               MergeTypeTy MergeType) const {
+    auto MergedBlocks = mergeBlocks(ChainPred->blocks(), ChainSucc->blocks(),
+                                    MergeOffset, MergeType);
+
+    // Do not allow a merge that does not preserve the original entry block
+    if ((ChainPred->isEntry() || ChainSucc->isEntry()) &&
+        !MergedBlocks.getFirstBlock()->isEntry())
+      return MergeGainTy();
+
+    // The gain for the new chain
+    auto NewGainScore = extTSPScore(MergedBlocks, Jumps) - ChainPred->score();
+    return MergeGainTy(NewGainScore, MergeOffset, MergeType);
+  }
+
+  /// Merge two chains of blocks respecting a given merge 'type' and 'offset'.
+  ///
+  /// If MergeType == 0, then the result is a concatenation of two chains.
+  /// Otherwise, the first chain is cut into two sub-chains at the offset,
+  /// and merged using all possible ways of concatenating three chains.
+ MergedChain mergeBlocks(const std::vector<Block *> &X, + const std::vector<Block *> &Y, size_t MergeOffset, + MergeTypeTy MergeType) const { + // Split the first chain, X, into X1 and X2 + BlockIter BeginX1 = X.begin(); + BlockIter EndX1 = X.begin() + MergeOffset; + BlockIter BeginX2 = X.begin() + MergeOffset; + BlockIter EndX2 = X.end(); + BlockIter BeginY = Y.begin(); + BlockIter EndY = Y.end(); + + // Construct a new chain from the three existing ones + switch (MergeType) { + case MergeTypeTy::X_Y: + return MergedChain(BeginX1, EndX2, BeginY, EndY); + case MergeTypeTy::X1_Y_X2: + return MergedChain(BeginX1, EndX1, BeginY, EndY, BeginX2, EndX2); + case MergeTypeTy::Y_X2_X1: + return MergedChain(BeginY, EndY, BeginX2, EndX2, BeginX1, EndX1); + case MergeTypeTy::X2_X1_Y: + return MergedChain(BeginX2, EndX2, BeginX1, EndX1, BeginY, EndY); + } + llvm_unreachable("unexpected chain merge type"); + } + + /// Merge chain From into chain Into, update the list of active chains, + /// adjacency information, and the corresponding cached values. 
+  void mergeChains(Chain *Into, Chain *From, size_t MergeOffset,
+                   MergeTypeTy MergeType) {
+    assert(Into != From && "a chain cannot be merged with itself");
+
+    // Merge the blocks
+    auto MergedBlocks =
+        mergeBlocks(Into->blocks(), From->blocks(), MergeOffset, MergeType);
+    Into->merge(From, MergedBlocks.getBlocks());
+    Into->mergeEdges(From);
+    From->clear();
+
+    // Update cached ext-tsp score for the new chain
+    auto SelfEdge = Into->getEdge(Into);
+    if (SelfEdge != nullptr) {
+      MergedBlocks = MergedChain(Into->blocks().begin(), Into->blocks().end());
+      Into->setScore(extTSPScore(MergedBlocks, SelfEdge->jumps()));
+    }
+
+    // Remove chain From from the list of active chains
+    auto Iter = std::remove(HotChains.begin(), HotChains.end(), From);
+    HotChains.erase(Iter, HotChains.end());
+
+    // Invalidate caches
+    for (auto EdgeIter : Into->edges()) {
+      EdgeIter.second->invalidateCache();
+    }
+  }
+
+  /// Concatenate all chains into a final order of blocks.
+  void concatChains(std::vector<uint64_t> &Order) {
+    // Collect chains and calculate some stats for their sorting
+    std::vector<Chain *> SortedChains;
+    DenseMap<const Chain *, double> ChainDensity;
+    for (auto &Chain : AllChains) {
+      if (!Chain.blocks().empty()) {
+        SortedChains.push_back(&Chain);
+        // Using doubles to avoid overflow of ExecutionCount
+        double Size = 0;
+        double ExecutionCount = 0;
+        for (auto Block : Chain.blocks()) {
+          Size += static_cast<double>(Block->Size);
+          ExecutionCount += static_cast<double>(Block->ExecutionCount);
+        }
+        assert(Size > 0 && "a chain of zero size");
+        ChainDensity[&Chain] = ExecutionCount / Size;
+      }
+    }
+
+    // Sorting chains by density in the decreasing order
+    std::stable_sort(SortedChains.begin(), SortedChains.end(),
+                     [&](const Chain *C1, const Chain *C2) {
+                       // Make sure the original entry block is at the
+                       // beginning of the order
+                       if (C1->isEntry() != C2->isEntry()) {
+                         return C1->isEntry();
+                       }
+
+                       const double D1 = ChainDensity[C1];
+                       const double D2 = 
ChainDensity[C2]; + // Compare by density and break ties by chain identifiers + return (D1 != D2) ? (D1 > D2) : (C1->id() < C2->id()); + }); + + // Collect the blocks in the order specified by their chains + Order.reserve(NumNodes); + for (auto Chain : SortedChains) { + for (auto Block : Chain->blocks()) { + Order.push_back(Block->Index); + } + } + } + +private: + /// The number of nodes in the graph. + const size_t NumNodes; + + /// Successors of each node. + std::vector<std::vector<uint64_t>> SuccNodes; + + /// Predecessors of each node. + std::vector<std::vector<uint64_t>> PredNodes; + + /// All basic blocks. + std::vector<Block> AllBlocks; + + /// All jumps between blocks. + std::vector<Jump> AllJumps; + + /// All chains of basic blocks. + std::vector<Chain> AllChains; + + /// All edges between chains. + std::vector<ChainEdge> AllEdges; + + /// Active chains. The vector gets updated at runtime when chains are merged. + std::vector<Chain *> HotChains; +}; + +} // end of anonymous namespace + +std::vector<uint64_t> llvm::applyExtTspLayout( + const std::vector<uint64_t> &NodeSizes, + const std::vector<uint64_t> &NodeCounts, + const DenseMap<std::pair<uint64_t, uint64_t>, uint64_t> &EdgeCounts) { + size_t NumNodes = NodeSizes.size(); + + // Verify correctness of the input data. + assert(NodeCounts.size() == NodeSizes.size() && "Incorrect input"); + assert(NumNodes > 2 && "Incorrect input"); + + // Apply the reordering algorithm. + auto Alg = ExtTSPImpl(NumNodes, NodeSizes, NodeCounts, EdgeCounts); + std::vector<uint64_t> Result; + Alg.run(Result); + + // Verify correctness of the output. 
+ assert(Result.front() == 0 && "Original entry point is not preserved"); + assert(Result.size() == NumNodes && "Incorrect size of reordered layout"); + return Result; +} + +double llvm::calcExtTspScore( + const std::vector<uint64_t> &Order, const std::vector<uint64_t> &NodeSizes, + const std::vector<uint64_t> &NodeCounts, + const DenseMap<std::pair<uint64_t, uint64_t>, uint64_t> &EdgeCounts) { + // Estimate addresses of the blocks in memory + auto Addr = std::vector<uint64_t>(NodeSizes.size(), 0); + for (size_t Idx = 1; Idx < Order.size(); Idx++) { + Addr[Order[Idx]] = Addr[Order[Idx - 1]] + NodeSizes[Order[Idx - 1]]; + } + + // Increase the score for each jump + double Score = 0; + for (auto It : EdgeCounts) { + auto Pred = It.first.first; + auto Succ = It.first.second; + uint64_t Count = It.second; + Score += extTSPScore(Addr[Pred], NodeSizes[Pred], Addr[Succ], Count); + } + return Score; +} + +double llvm::calcExtTspScore( + const std::vector<uint64_t> &NodeSizes, + const std::vector<uint64_t> &NodeCounts, + const DenseMap<std::pair<uint64_t, uint64_t>, uint64_t> &EdgeCounts) { + auto Order = std::vector<uint64_t>(NodeSizes.size()); + for (size_t Idx = 0; Idx < NodeSizes.size(); Idx++) { + Order[Idx] = Idx; + } + return calcExtTspScore(Order, NodeSizes, NodeCounts, EdgeCounts); +} diff --git a/llvm/lib/Transforms/Utils/Debugify.cpp b/llvm/lib/Transforms/Utils/Debugify.cpp index fc7083b0c30d..589622d69578 100644 --- a/llvm/lib/Transforms/Utils/Debugify.cpp +++ b/llvm/lib/Transforms/Utils/Debugify.cpp @@ -596,7 +596,7 @@ bool llvm::checkDebugInfoMetadata(Module &M, auto DILocsBefore = DIPreservationMap[NameOfWrappedPass].DILocations; auto DILocsAfter = DIPreservationAfter[NameOfWrappedPass].DILocations; - auto InstToDelete = DIPreservationAfter[NameOfWrappedPass].InstToDelete; + auto InstToDelete = DIPreservationMap[NameOfWrappedPass].InstToDelete; auto DIVarsBefore = DIPreservationMap[NameOfWrappedPass].DIVariables; auto DIVarsAfter = 
DIPreservationAfter[NameOfWrappedPass].DIVariables; diff --git a/llvm/lib/Transforms/Utils/FunctionComparator.cpp b/llvm/lib/Transforms/Utils/FunctionComparator.cpp index 326864803d7c..06596f7b04e1 100644 --- a/llvm/lib/Transforms/Utils/FunctionComparator.cpp +++ b/llvm/lib/Transforms/Utils/FunctionComparator.cpp @@ -58,6 +58,14 @@ int FunctionComparator::cmpNumbers(uint64_t L, uint64_t R) const { return 0; } +int FunctionComparator::cmpAligns(Align L, Align R) const { + if (L.value() < R.value()) + return -1; + if (L.value() > R.value()) + return 1; + return 0; +} + int FunctionComparator::cmpOrderings(AtomicOrdering L, AtomicOrdering R) const { if ((int)L < (int)R) return -1; @@ -556,13 +564,12 @@ int FunctionComparator::cmpOperations(const Instruction *L, if (int Res = cmpTypes(AI->getAllocatedType(), cast<AllocaInst>(R)->getAllocatedType())) return Res; - return cmpNumbers(AI->getAlignment(), cast<AllocaInst>(R)->getAlignment()); + return cmpAligns(AI->getAlign(), cast<AllocaInst>(R)->getAlign()); } if (const LoadInst *LI = dyn_cast<LoadInst>(L)) { if (int Res = cmpNumbers(LI->isVolatile(), cast<LoadInst>(R)->isVolatile())) return Res; - if (int Res = - cmpNumbers(LI->getAlignment(), cast<LoadInst>(R)->getAlignment())) + if (int Res = cmpAligns(LI->getAlign(), cast<LoadInst>(R)->getAlign())) return Res; if (int Res = cmpOrderings(LI->getOrdering(), cast<LoadInst>(R)->getOrdering())) @@ -578,8 +585,7 @@ int FunctionComparator::cmpOperations(const Instruction *L, if (int Res = cmpNumbers(SI->isVolatile(), cast<StoreInst>(R)->isVolatile())) return Res; - if (int Res = - cmpNumbers(SI->getAlignment(), cast<StoreInst>(R)->getAlignment())) + if (int Res = cmpAligns(SI->getAlign(), cast<StoreInst>(R)->getAlign())) return Res; if (int Res = cmpOrderings(SI->getOrdering(), cast<StoreInst>(R)->getOrdering())) diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index ec926b1f5a94..ecad79b68185 100644 --- 
a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -402,6 +402,18 @@ bool llvm::isInstructionTriviallyDead(Instruction *I, return wouldInstructionBeTriviallyDead(I, TLI); } +bool llvm::wouldInstructionBeTriviallyDeadOnUnusedPaths( + Instruction *I, const TargetLibraryInfo *TLI) { + // Instructions that are "markers" and have implied meaning on code around + // them (without explicit uses), are not dead on unused paths. + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) + if (II->getIntrinsicID() == Intrinsic::stacksave || + II->getIntrinsicID() == Intrinsic::launder_invariant_group || + II->isLifetimeStartOrEnd()) + return false; + return wouldInstructionBeTriviallyDead(I, TLI); +} + bool llvm::wouldInstructionBeTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI) { if (I->isTerminator()) diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp index f3cf42be8ba1..69fd110dc3c2 100644 --- a/llvm/lib/Transforms/Utils/LoopPeel.cpp +++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp @@ -104,9 +104,7 @@ bool llvm::canPeel(Loop *L) { // note that LoopPeeling currently can only update the branch weights of latch // blocks and branch weights to blocks with deopt or unreachable do not need // updating. - return all_of(Exits, [](const BasicBlock *BB) { - return IsBlockFollowedByDeoptOrUnreachable(BB); - }); + return llvm::all_of(Exits, IsBlockFollowedByDeoptOrUnreachable); } // This function calculates the number of iterations after which the given Phi @@ -333,6 +331,31 @@ static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount, return DesiredPeelCount; } +/// This "heuristic" exactly matches implicit behavior which used to exist +/// inside getLoopEstimatedTripCount. It was added here to keep an +/// improvement inside that API from causing peeling to become more agressive. +/// This should probably be removed. 
+static bool violatesLegacyMultiExitLoopCheck(Loop *L) { + BasicBlock *Latch = L->getLoopLatch(); + if (!Latch) + return true; + + BranchInst *LatchBR = dyn_cast<BranchInst>(Latch->getTerminator()); + if (!LatchBR || LatchBR->getNumSuccessors() != 2 || !L->isLoopExiting(Latch)) + return true; + + assert((LatchBR->getSuccessor(0) == L->getHeader() || + LatchBR->getSuccessor(1) == L->getHeader()) && + "At least one edge out of the latch must go to the header"); + + SmallVector<BasicBlock *, 4> ExitBlocks; + L->getUniqueNonLatchExitBlocks(ExitBlocks); + return any_of(ExitBlocks, [](const BasicBlock *EB) { + return !EB->getTerminatingDeoptimizeCall(); + }); +} + + // Return the number of iterations we want to peel off. void llvm::computePeelCount(Loop *L, unsigned LoopSize, TargetTransformInfo::PeelingPreferences &PP, @@ -436,6 +459,8 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize, // We only do this in the presence of profile information, since otherwise // our estimates of the trip count are not reliable enough. if (L->getHeader()->getParent()->hasProfileData()) { + if (violatesLegacyMultiExitLoopCheck(L)) + return; Optional<unsigned> PeelCount = getLoopEstimatedTripCount(L); if (!PeelCount) return; diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index c8e42acdffb3..93157bd87c34 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -773,8 +773,8 @@ void llvm::breakLoopBackedge(Loop *L, DominatorTree &DT, ScalarEvolution &SE, } -/// Checks if \p L has single exit through latch block except possibly -/// "deoptimizing" exits. Returns branch instruction terminating the loop +/// Checks if \p L has an exiting latch branch. There may also be other +/// exiting blocks. Returns branch instruction terminating the loop /// latch if above check is successful, nullptr otherwise. 
static BranchInst *getExpectedExitLoopLatchBranch(Loop *L) { BasicBlock *Latch = L->getLoopLatch(); @@ -789,53 +789,61 @@ static BranchInst *getExpectedExitLoopLatchBranch(Loop *L) { LatchBR->getSuccessor(1) == L->getHeader()) && "At least one edge out of the latch must go to the header"); - SmallVector<BasicBlock *, 4> ExitBlocks; - L->getUniqueNonLatchExitBlocks(ExitBlocks); - if (any_of(ExitBlocks, [](const BasicBlock *EB) { - return !EB->getTerminatingDeoptimizeCall(); - })) - return nullptr; - return LatchBR; } -Optional<unsigned> -llvm::getLoopEstimatedTripCount(Loop *L, - unsigned *EstimatedLoopInvocationWeight) { - // Support loops with an exiting latch and other existing exists only - // deoptimize. - BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L); - if (!LatchBranch) - return None; - +/// Return the estimated trip count for any exiting branch which dominates +/// the loop latch. +static Optional<uint64_t> +getEstimatedTripCount(BranchInst *ExitingBranch, Loop *L, + uint64_t &OrigExitWeight) { // To estimate the number of times the loop body was executed, we want to // know the number of times the backedge was taken, vs. the number of times // we exited the loop. 
- uint64_t BackedgeTakenWeight, LatchExitWeight; - if (!LatchBranch->extractProfMetadata(BackedgeTakenWeight, LatchExitWeight)) + uint64_t LoopWeight, ExitWeight; + if (!ExitingBranch->extractProfMetadata(LoopWeight, ExitWeight)) return None; - if (LatchBranch->getSuccessor(0) != L->getHeader()) - std::swap(BackedgeTakenWeight, LatchExitWeight); + if (L->contains(ExitingBranch->getSuccessor(1))) + std::swap(LoopWeight, ExitWeight); - if (!LatchExitWeight) + if (!ExitWeight) + // Don't have a way to return predicated infinite return None; - if (EstimatedLoopInvocationWeight) - *EstimatedLoopInvocationWeight = LatchExitWeight; + OrigExitWeight = ExitWeight; - // Estimated backedge taken count is a ratio of the backedge taken weight by - // the weight of the edge exiting the loop, rounded to nearest. - uint64_t BackedgeTakenCount = - llvm::divideNearest(BackedgeTakenWeight, LatchExitWeight); - // Estimated trip count is one plus estimated backedge taken count. - return BackedgeTakenCount + 1; + // Estimated exit count is a ratio of the loop weight by the weight of the + // edge exiting the loop, rounded to nearest. + uint64_t ExitCount = llvm::divideNearest(LoopWeight, ExitWeight); + // Estimated trip count is one plus estimated exit count. + return ExitCount + 1; +} + +Optional<unsigned> +llvm::getLoopEstimatedTripCount(Loop *L, + unsigned *EstimatedLoopInvocationWeight) { + // Currently we take the estimate exit count only from the loop latch, + // ignoring other exiting blocks. This can overestimate the trip count + // if we exit through another exit, but can never underestimate it. 
+ // TODO: incorporate information from other exits + if (BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L)) { + uint64_t ExitWeight; + if (Optional<uint64_t> EstTripCount = + getEstimatedTripCount(LatchBranch, L, ExitWeight)) { + if (EstimatedLoopInvocationWeight) + *EstimatedLoopInvocationWeight = ExitWeight; + return *EstTripCount; + } + } + return None; } bool llvm::setLoopEstimatedTripCount(Loop *L, unsigned EstimatedTripCount, unsigned EstimatedloopInvocationWeight) { - // Support loops with an exiting latch and other existing exists only - // deoptimize. + // At the moment, we currently support changing the estimate trip count of + // the latch branch only. We could extend this API to manipulate estimated + // trip counts for any exit. BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L); if (!LatchBranch) return false; @@ -923,8 +931,7 @@ Value *llvm::createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left, // Helper to generate an ordered reduction. Value *llvm::getOrderedReduction(IRBuilderBase &Builder, Value *Acc, Value *Src, - unsigned Op, RecurKind RdxKind, - ArrayRef<Value *> RedOps) { + unsigned Op, RecurKind RdxKind) { unsigned VF = cast<FixedVectorType>(Src->getType())->getNumElements(); // Extract and apply reduction ops in ascending order: @@ -942,9 +949,6 @@ Value *llvm::getOrderedReduction(IRBuilderBase &Builder, Value *Acc, Value *Src, "Invalid min/max"); Result = createMinMaxOp(Builder, RdxKind, Result, Ext); } - - if (!RedOps.empty()) - propagateIRFlags(Result, RedOps); } return Result; @@ -952,14 +956,20 @@ Value *llvm::getOrderedReduction(IRBuilderBase &Builder, Value *Acc, Value *Src, // Helper to generate a log2 shuffle reduction. 
Value *llvm::getShuffleReduction(IRBuilderBase &Builder, Value *Src, - unsigned Op, RecurKind RdxKind, - ArrayRef<Value *> RedOps) { + unsigned Op, RecurKind RdxKind) { unsigned VF = cast<FixedVectorType>(Src->getType())->getNumElements(); // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles // and vector ops, reducing the set of values being computed by half each // round. assert(isPowerOf2_32(VF) && "Reduction emission only supported for pow2 vectors!"); + // Note: fast-math-flags flags are controlled by the builder configuration + // and are assumed to apply to all generated arithmetic instructions. Other + // poison generating flags (nsw/nuw/inbounds/inrange/exact) are not part + // of the builder configuration, and since they're not passed explicitly, + // will never be relevant here. Note that it would be generally unsound to + // propagate these from an intrinsic call to the expansion anyways as we/ + // change the order of operations. Value *TmpVec = Src; SmallVector<int, 32> ShuffleMask(VF); for (unsigned i = VF; i != 1; i >>= 1) { @@ -973,7 +983,6 @@ Value *llvm::getShuffleReduction(IRBuilderBase &Builder, Value *Src, Value *Shuf = Builder.CreateShuffleVector(TmpVec, ShuffleMask, "rdx.shuf"); if (Op != Instruction::ICmp && Op != Instruction::FCmp) { - // The builder propagates its fast-math-flags setting. TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx"); } else { @@ -981,13 +990,6 @@ Value *llvm::getShuffleReduction(IRBuilderBase &Builder, Value *Src, "Invalid min/max"); TmpVec = createMinMaxOp(Builder, RdxKind, TmpVec, Shuf); } - if (!RedOps.empty()) - propagateIRFlags(TmpVec, RedOps); - - // We may compute the reassociated scalar ops in a way that does not - // preserve nsw/nuw etc. Conservatively, drop those flags. - if (auto *ReductionInst = dyn_cast<Instruction>(TmpVec)) - ReductionInst->dropPoisonGeneratingFlags(); } // The result is in the first element of the vector. 
return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); @@ -1035,8 +1037,7 @@ Value *llvm::createSelectCmpTargetReduction(IRBuilderBase &Builder, Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder, const TargetTransformInfo *TTI, - Value *Src, RecurKind RdxKind, - ArrayRef<Value *> RedOps) { + Value *Src, RecurKind RdxKind) { auto *SrcVecEltTy = cast<VectorType>(Src->getType())->getElementType(); switch (RdxKind) { case RecurKind::Add: diff --git a/llvm/lib/Transforms/Utils/MetaRenamer.cpp b/llvm/lib/Transforms/Utils/MetaRenamer.cpp index 3ce10535d45f..9fba2f3f86b5 100644 --- a/llvm/lib/Transforms/Utils/MetaRenamer.cpp +++ b/llvm/lib/Transforms/Utils/MetaRenamer.cpp @@ -15,6 +15,7 @@ #include "llvm/Transforms/Utils/MetaRenamer.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -31,10 +32,36 @@ #include "llvm/IR/TypeFinder.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils.h" using namespace llvm; +static cl::opt<std::string> RenameExcludeFunctionPrefixes( + "rename-exclude-function-prefixes", + cl::desc("Prefixes for functions that don't need to be renamed, separated " + "by a comma"), + cl::Hidden); + +static cl::opt<std::string> RenameExcludeAliasPrefixes( + "rename-exclude-alias-prefixes", + cl::desc("Prefixes for aliases that don't need to be renamed, separated " + "by a comma"), + cl::Hidden); + +static cl::opt<std::string> RenameExcludeGlobalPrefixes( + "rename-exclude-global-prefixes", + cl::desc( + "Prefixes for global values that don't need to be renamed, separated " + "by a comma"), + cl::Hidden); + +static cl::opt<std::string> RenameExcludeStructPrefixes( + "rename-exclude-struct-prefixes", + cl::desc("Prefixes for structs that don't need to be renamed, separated " + "by a comma"), + 
cl::Hidden); + static const char *const metaNames[] = { // See http://en.wikipedia.org/wiki/Metasyntactic_variable "foo", "bar", "baz", "quux", "barney", "snork", "zot", "blam", "hoge", @@ -66,6 +93,18 @@ struct Renamer { PRNG prng; }; +static void +parseExcludedPrefixes(StringRef PrefixesStr, + SmallVectorImpl<StringRef> &ExcludedPrefixes) { + for (;;) { + auto PrefixesSplit = PrefixesStr.split(','); + if (PrefixesSplit.first.empty()) + break; + ExcludedPrefixes.push_back(PrefixesSplit.first); + PrefixesStr = PrefixesSplit.second; + } +} + void MetaRename(Function &F) { for (Argument &Arg : F.args()) if (!Arg.getType()->isVoidTy()) @@ -91,10 +130,26 @@ void MetaRename(Module &M, Renamer renamer(randSeed); + SmallVector<StringRef, 8> ExcludedAliasesPrefixes; + SmallVector<StringRef, 8> ExcludedGlobalsPrefixes; + SmallVector<StringRef, 8> ExcludedStructsPrefixes; + SmallVector<StringRef, 8> ExcludedFuncPrefixes; + parseExcludedPrefixes(RenameExcludeAliasPrefixes, ExcludedAliasesPrefixes); + parseExcludedPrefixes(RenameExcludeGlobalPrefixes, ExcludedGlobalsPrefixes); + parseExcludedPrefixes(RenameExcludeStructPrefixes, ExcludedStructsPrefixes); + parseExcludedPrefixes(RenameExcludeFunctionPrefixes, ExcludedFuncPrefixes); + + auto IsNameExcluded = [](StringRef &Name, + SmallVectorImpl<StringRef> &ExcludedPrefixes) { + return any_of(ExcludedPrefixes, + [&Name](auto &Prefix) { return Name.startswith(Prefix); }); + }; + // Rename all aliases for (GlobalAlias &GA : M.aliases()) { StringRef Name = GA.getName(); - if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1)) + if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1) || + IsNameExcluded(Name, ExcludedAliasesPrefixes)) continue; GA.setName("alias"); @@ -103,7 +158,8 @@ void MetaRename(Module &M, // Rename all global variables for (GlobalVariable &GV : M.globals()) { StringRef Name = GV.getName(); - if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1)) + if (Name.startswith("llvm.") || 
(!Name.empty() && Name[0] == 1) || + IsNameExcluded(Name, ExcludedGlobalsPrefixes)) continue; GV.setName("global"); @@ -113,7 +169,9 @@ void MetaRename(Module &M, TypeFinder StructTypes; StructTypes.run(M, true); for (StructType *STy : StructTypes) { - if (STy->isLiteral() || STy->getName().empty()) + StringRef Name = STy->getName(); + if (STy->isLiteral() || Name.empty() || + IsNameExcluded(Name, ExcludedStructsPrefixes)) continue; SmallString<128> NameStorage; @@ -128,7 +186,8 @@ void MetaRename(Module &M, // Leave library functions alone because their presence or absence could // affect the behavior of other passes. if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1) || - GetTLI(F).getLibFunc(F, Tmp)) + GetTLI(F).getLibFunc(F, Tmp) || + IsNameExcluded(Name, ExcludedFuncPrefixes)) continue; // Leave @main alone. The output of -metarenamer might be passed to diff --git a/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp b/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp index 3ebc89158173..65207056a3f4 100644 --- a/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp +++ b/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp @@ -144,6 +144,10 @@ static void convertToRelLookupTable(GlobalVariable &LookupTable) { Value *Offset = Builder.CreateShl(Index, ConstantInt::get(IntTy, 2), "reltable.shift"); + // Insert the call to load.relative instrinsic before LOAD. + // GEP might not be immediately followed by a LOAD, like it can be hoisted + // outside the loop or another instruction might be inserted them in between. 
+ Builder.SetInsertPoint(Load); Function *LoadRelIntrinsic = llvm::Intrinsic::getDeclaration( &M, Intrinsic::load_relative, {Index->getType()}); Value *Base = Builder.CreateBitCast(RelLookupTable, Builder.getInt8PtrTy()); diff --git a/llvm/lib/Transforms/Utils/SampleProfileInference.cpp b/llvm/lib/Transforms/Utils/SampleProfileInference.cpp index 9495e442e0bf..2f2dff6b5f0b 100644 --- a/llvm/lib/Transforms/Utils/SampleProfileInference.cpp +++ b/llvm/lib/Transforms/Utils/SampleProfileInference.cpp @@ -220,7 +220,7 @@ private: Now = Pred; } - assert(PathCapacity > 0 && "found incorrect augmenting path"); + assert(PathCapacity > 0 && "found an incorrect augmenting path"); // Update the flow along the path Now = Target; @@ -271,6 +271,352 @@ private: uint64_t Target; }; +/// A post-processing adjustment of control flow. It applies two steps by +/// rerouting some flow and making it more realistic: +/// +/// - First, it removes all isolated components ("islands") with a positive flow +/// that are unreachable from the entry block. For every such component, we +/// find the shortest from the entry to an exit passing through the component, +/// and increase the flow by one unit along the path. +/// +/// - Second, it identifies all "unknown subgraphs" consisting of basic blocks +/// with no sampled counts. Then it rebalnces the flow that goes through such +/// a subgraph so that each branch is taken with probability 50%. +/// An unknown subgraph is such that for every two nodes u and v: +/// - u dominates v and u is not unknown; +/// - v post-dominates u; and +/// - all inner-nodes of all (u,v)-paths are unknown. +/// +class FlowAdjuster { +public: + FlowAdjuster(FlowFunction &Func) : Func(Func) { + assert(Func.Blocks[Func.Entry].isEntry() && + "incorrect index of the entry block"); + } + + // Run the post-processing + void run() { + /// Adjust the flow to get rid of isolated components. + joinIsolatedComponents(); + + /// Rebalance the flow inside unknown subgraphs. 
+ rebalanceUnknownSubgraphs(); + } + + /// The probability for the first successor of a unknown subgraph + static constexpr double UnknownFirstSuccProbability = 0.5; + +private: + void joinIsolatedComponents() { + // Find blocks that are reachable from the source + auto Visited = std::vector<bool>(NumBlocks(), false); + findReachable(Func.Entry, Visited); + + // Iterate over all non-reachable blocks and adjust their weights + for (uint64_t I = 0; I < NumBlocks(); I++) { + auto &Block = Func.Blocks[I]; + if (Block.Flow > 0 && !Visited[I]) { + // Find a path from the entry to an exit passing through the block I + auto Path = findShortestPath(I); + // Increase the flow along the path + assert(Path.size() > 0 && Path[0]->Source == Func.Entry && + "incorrectly computed path adjusting control flow"); + Func.Blocks[Func.Entry].Flow += 1; + for (auto &Jump : Path) { + Jump->Flow += 1; + Func.Blocks[Jump->Target].Flow += 1; + // Update reachability + findReachable(Jump->Target, Visited); + } + } + } + } + + /// Run BFS from a given block along the jumps with a positive flow and mark + /// all reachable blocks. + void findReachable(uint64_t Src, std::vector<bool> &Visited) { + if (Visited[Src]) + return; + std::queue<uint64_t> Queue; + Queue.push(Src); + Visited[Src] = true; + while (!Queue.empty()) { + Src = Queue.front(); + Queue.pop(); + for (auto Jump : Func.Blocks[Src].SuccJumps) { + uint64_t Dst = Jump->Target; + if (Jump->Flow > 0 && !Visited[Dst]) { + Queue.push(Dst); + Visited[Dst] = true; + } + } + } + } + + /// Find the shortest path from the entry block to an exit block passing + /// through a given block. 
+ std::vector<FlowJump *> findShortestPath(uint64_t BlockIdx) { + // A path from the entry block to BlockIdx + auto ForwardPath = findShortestPath(Func.Entry, BlockIdx); + // A path from BlockIdx to an exit block + auto BackwardPath = findShortestPath(BlockIdx, AnyExitBlock); + + // Concatenate the two paths + std::vector<FlowJump *> Result; + Result.insert(Result.end(), ForwardPath.begin(), ForwardPath.end()); + Result.insert(Result.end(), BackwardPath.begin(), BackwardPath.end()); + return Result; + } + + /// Apply the Dijkstra algorithm to find the shortest path from a given + /// Source to a given Target block. + /// If Target == -1, then the path ends at an exit block. + std::vector<FlowJump *> findShortestPath(uint64_t Source, uint64_t Target) { + // Quit early, if possible + if (Source == Target) + return std::vector<FlowJump *>(); + if (Func.Blocks[Source].isExit() && Target == AnyExitBlock) + return std::vector<FlowJump *>(); + + // Initialize data structures + auto Distance = std::vector<int64_t>(NumBlocks(), INF); + auto Parent = std::vector<FlowJump *>(NumBlocks(), nullptr); + Distance[Source] = 0; + std::set<std::pair<uint64_t, uint64_t>> Queue; + Queue.insert(std::make_pair(Distance[Source], Source)); + + // Run the Dijkstra algorithm + while (!Queue.empty()) { + uint64_t Src = Queue.begin()->second; + Queue.erase(Queue.begin()); + // If we found a solution, quit early + if (Src == Target || + (Func.Blocks[Src].isExit() && Target == AnyExitBlock)) + break; + + for (auto Jump : Func.Blocks[Src].SuccJumps) { + uint64_t Dst = Jump->Target; + int64_t JumpDist = jumpDistance(Jump); + if (Distance[Dst] > Distance[Src] + JumpDist) { + Queue.erase(std::make_pair(Distance[Dst], Dst)); + + Distance[Dst] = Distance[Src] + JumpDist; + Parent[Dst] = Jump; + + Queue.insert(std::make_pair(Distance[Dst], Dst)); + } + } + } + // If Target is not provided, find the closest exit block + if (Target == AnyExitBlock) { + for (uint64_t I = 0; I < NumBlocks(); I++) { + if 
(Func.Blocks[I].isExit() && Parent[I] != nullptr) { + if (Target == AnyExitBlock || Distance[Target] > Distance[I]) { + Target = I; + } + } + } + } + assert(Parent[Target] != nullptr && "a path does not exist"); + + // Extract the constructed path + std::vector<FlowJump *> Result; + uint64_t Now = Target; + while (Now != Source) { + assert(Now == Parent[Now]->Target && "incorrect parent jump"); + Result.push_back(Parent[Now]); + Now = Parent[Now]->Source; + } + // Reverse the path, since it is extracted from Target to Source + std::reverse(Result.begin(), Result.end()); + return Result; + } + + /// A distance of a path for a given jump. + /// In order to incite the path to use blocks/jumps with large positive flow, + /// and avoid changing branch probability of outgoing edges drastically, + /// set the distance as follows: + /// if Jump.Flow > 0, then distance = max(100 - Jump->Flow, 0) + /// if Block.Weight > 0, then distance = 1 + /// otherwise distance >> 1 + int64_t jumpDistance(FlowJump *Jump) const { + int64_t BaseDistance = 100; + if (Jump->IsUnlikely) + return MinCostMaxFlow::AuxCostUnlikely; + if (Jump->Flow > 0) + return std::max(BaseDistance - (int64_t)Jump->Flow, (int64_t)0); + if (Func.Blocks[Jump->Target].Weight > 0) + return BaseDistance; + return BaseDistance * (NumBlocks() + 1); + }; + + uint64_t NumBlocks() const { return Func.Blocks.size(); } + + /// Rebalance unknown subgraphs so as each branch splits with probabilities + /// UnknownFirstSuccProbability and 1 - UnknownFirstSuccProbability + void rebalanceUnknownSubgraphs() { + assert(UnknownFirstSuccProbability >= 0.0 && + UnknownFirstSuccProbability <= 1.0 && + "the share of the unknown successor should be between 0 and 1"); + // Try to find unknown subgraphs from each non-unknown block + for (uint64_t I = 0; I < Func.Blocks.size(); I++) { + auto SrcBlock = &Func.Blocks[I]; + // Do not attempt to find unknown successors from a unknown or a + // zero-flow block + if (SrcBlock->UnknownWeight || 
SrcBlock->Flow == 0) + continue; + + std::vector<FlowBlock *> UnknownSuccs; + FlowBlock *DstBlock = nullptr; + // Find a unknown subgraphs starting at block SrcBlock + if (!findUnknownSubgraph(SrcBlock, DstBlock, UnknownSuccs)) + continue; + // At the moment, we do not rebalance subgraphs containing cycles among + // unknown blocks + if (!isAcyclicSubgraph(SrcBlock, DstBlock, UnknownSuccs)) + continue; + + // Rebalance the flow + rebalanceUnknownSubgraph(SrcBlock, DstBlock, UnknownSuccs); + } + } + + /// Find a unknown subgraph starting at block SrcBlock. + /// If the search is successful, the method sets DstBlock and UnknownSuccs. + bool findUnknownSubgraph(FlowBlock *SrcBlock, FlowBlock *&DstBlock, + std::vector<FlowBlock *> &UnknownSuccs) { + // Run BFS from SrcBlock and make sure all paths are going through unknown + // blocks and end at a non-unknown DstBlock + auto Visited = std::vector<bool>(NumBlocks(), false); + std::queue<uint64_t> Queue; + DstBlock = nullptr; + + Queue.push(SrcBlock->Index); + Visited[SrcBlock->Index] = true; + while (!Queue.empty()) { + auto &Block = Func.Blocks[Queue.front()]; + Queue.pop(); + // Process blocks reachable from Block + for (auto Jump : Block.SuccJumps) { + uint64_t Dst = Jump->Target; + if (Visited[Dst]) + continue; + Visited[Dst] = true; + if (!Func.Blocks[Dst].UnknownWeight) { + // If we see non-unique non-unknown block reachable from SrcBlock, + // stop processing and skip rebalancing + FlowBlock *CandidateDstBlock = &Func.Blocks[Dst]; + if (DstBlock != nullptr && DstBlock != CandidateDstBlock) + return false; + DstBlock = CandidateDstBlock; + } else { + Queue.push(Dst); + UnknownSuccs.push_back(&Func.Blocks[Dst]); + } + } + } + + // If the list of unknown blocks is empty, we don't need rebalancing + if (UnknownSuccs.empty()) + return false; + // If all reachable nodes from SrcBlock are unknown, skip rebalancing + if (DstBlock == nullptr) + return false; + // If any of the unknown blocks is an exit block, skip 
rebalancing + for (auto Block : UnknownSuccs) { + if (Block->isExit()) + return false; + } + + return true; + } + + /// Verify if the given unknown subgraph is acyclic, and if yes, reorder + /// UnknownSuccs in the topological order (so that all jumps are "forward"). + bool isAcyclicSubgraph(FlowBlock *SrcBlock, FlowBlock *DstBlock, + std::vector<FlowBlock *> &UnknownSuccs) { + // Extract local in-degrees in the considered subgraph + auto LocalInDegree = std::vector<uint64_t>(NumBlocks(), 0); + for (auto Jump : SrcBlock->SuccJumps) { + LocalInDegree[Jump->Target]++; + } + for (uint64_t I = 0; I < UnknownSuccs.size(); I++) { + for (auto Jump : UnknownSuccs[I]->SuccJumps) { + LocalInDegree[Jump->Target]++; + } + } + // A loop containing SrcBlock + if (LocalInDegree[SrcBlock->Index] > 0) + return false; + + std::vector<FlowBlock *> AcyclicOrder; + std::queue<uint64_t> Queue; + Queue.push(SrcBlock->Index); + while (!Queue.empty()) { + auto &Block = Func.Blocks[Queue.front()]; + Queue.pop(); + // Stop propagation once we reach DstBlock + if (Block.Index == DstBlock->Index) + break; + + AcyclicOrder.push_back(&Block); + // Add to the queue all successors with zero local in-degree + for (auto Jump : Block.SuccJumps) { + uint64_t Dst = Jump->Target; + LocalInDegree[Dst]--; + if (LocalInDegree[Dst] == 0) { + Queue.push(Dst); + } + } + } + + // If there is a cycle in the subgraph, AcyclicOrder contains only a subset + // of all blocks + if (UnknownSuccs.size() + 1 != AcyclicOrder.size()) + return false; + UnknownSuccs = AcyclicOrder; + return true; + } + + /// Rebalance a given subgraph. 
+ void rebalanceUnknownSubgraph(FlowBlock *SrcBlock, FlowBlock *DstBlock, + std::vector<FlowBlock *> &UnknownSuccs) { + assert(SrcBlock->Flow > 0 && "zero-flow block in unknown subgraph"); + assert(UnknownSuccs.front() == SrcBlock && "incorrect order of unknowns"); + + for (auto Block : UnknownSuccs) { + // Block's flow is the sum of incoming flows + uint64_t TotalFlow = 0; + if (Block == SrcBlock) { + TotalFlow = Block->Flow; + } else { + for (auto Jump : Block->PredJumps) { + TotalFlow += Jump->Flow; + } + Block->Flow = TotalFlow; + } + + // Process all successor jumps and update corresponding flow values + for (uint64_t I = 0; I < Block->SuccJumps.size(); I++) { + auto Jump = Block->SuccJumps[I]; + if (I + 1 == Block->SuccJumps.size()) { + Jump->Flow = TotalFlow; + continue; + } + uint64_t Flow = uint64_t(TotalFlow * UnknownFirstSuccProbability); + Jump->Flow = Flow; + TotalFlow -= Flow; + } + } + } + + /// A constant indicating an arbitrary exit block of a function. + static constexpr uint64_t AnyExitBlock = uint64_t(-1); + + /// The function. + FlowFunction &Func; +}; + /// Initializing flow network for a given function. 
/// /// Every block is split into three nodes that are responsible for (i) an @@ -440,6 +786,39 @@ void verifyWeights(const FlowFunction &Func) { } } assert(TotalInFlow == TotalOutFlow && "incorrectly computed control flow"); + + // Verify that there are no isolated flow components + // One could modify FlowFunction to hold edges indexed by the sources, which + // will avoid a creation of the object + auto PositiveFlowEdges = std::vector<std::vector<uint64_t>>(NumBlocks); + for (auto &Jump : Func.Jumps) { + if (Jump.Flow > 0) { + PositiveFlowEdges[Jump.Source].push_back(Jump.Target); + } + } + + // Run BFS from the source along edges with positive flow + std::queue<uint64_t> Queue; + auto Visited = std::vector<bool>(NumBlocks, false); + Queue.push(Func.Entry); + Visited[Func.Entry] = true; + while (!Queue.empty()) { + uint64_t Src = Queue.front(); + Queue.pop(); + for (uint64_t Dst : PositiveFlowEdges[Src]) { + if (!Visited[Dst]) { + Queue.push(Dst); + Visited[Dst] = true; + } + } + } + + // Verify that every block that has a positive flow is reached from the source + // along edges with a positive flow + for (uint64_t I = 0; I < NumBlocks; I++) { + auto &Block = Func.Blocks[I]; + assert((Visited[I] || Block.Flow == 0) && "an isolated flow component"); + } } #endif @@ -455,6 +834,10 @@ void llvm::applyFlowInference(FlowFunction &Func) { // Extract flow values for every block and every edge extractWeights(InferenceNetwork, Func); + // Post-processing adjustments to the flow + auto Adjuster = FlowAdjuster(Func); + Adjuster.run(); + #ifndef NDEBUG // Verify the result verifyWeights(Func); diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index 71c15d5c51fc..c840ee85795f 100644 --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -1047,9 +1047,9 @@ bool SCEVExpander::hoistIVInc(Instruction *IncV, Instruction *InsertPos) { if 
(SE.DT.dominates(IncV, InsertPos)) break; } - for (auto I = IVIncs.rbegin(), E = IVIncs.rend(); I != E; ++I) { - fixupInsertPoints(*I); - (*I)->moveBefore(InsertPos); + for (Instruction *I : llvm::reverse(IVIncs)) { + fixupInsertPoints(I); + I->moveBefore(InsertPos); } return true; } diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index afa3ecde77f9..1046998c26de 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -3629,7 +3629,7 @@ static bool tryWidenCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, return false; // TODO // Use lambda to lazily compute expensive condition after cheap ones. auto NoSideEffects = [](BasicBlock &BB) { - return !llvm::any_of(BB, [](const Instruction &I) { + return llvm::none_of(BB, [](const Instruction &I) { return I.mayWriteToMemory() || I.mayHaveSideEffects(); }); }; diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index e190a1294eb3..02727a3dbf9c 100644 --- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -193,6 +193,19 @@ static void annotateNonNullAndDereferenceable(CallInst *CI, ArrayRef<unsigned> A } } +// Copy CallInst "flags" like musttail, notail, and tail. Return New param for +// easier chaining. Calls to emit* and B.createCall should probably be wrapped +// in this function when New is created to replace Old. Callers should take +// care to check Old.isMustTailCall() if they aren't replacing Old directly +// with New. 
+static Value *copyFlags(const CallInst &Old, Value *New) { + assert(!Old.isMustTailCall() && "do not copy musttail call flags"); + assert(!Old.isNoTailCall() && "do not copy notail call flags"); + if (auto *NewCI = dyn_cast_or_null<CallInst>(New)) + NewCI->setTailCallKind(Old.getTailCallKind()); + return New; +} + //===----------------------------------------------------------------------===// // String and Memory Library Call Optimizations //===----------------------------------------------------------------------===// @@ -215,7 +228,7 @@ Value *LibCallSimplifier::optimizeStrCat(CallInst *CI, IRBuilderBase &B) { if (Len == 0) return Dst; - return emitStrLenMemCpy(Src, Dst, Len, B); + return copyFlags(*CI, emitStrLenMemCpy(Src, Dst, Len, B)); } Value *LibCallSimplifier::emitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len, @@ -279,7 +292,7 @@ Value *LibCallSimplifier::optimizeStrNCat(CallInst *CI, IRBuilderBase &B) { // strncat(x, s, c) -> strcat(x, s) // s is constant so the strcat can be optimized further. - return emitStrLenMemCpy(Src, Dst, SrcLen, B); + return copyFlags(*CI, emitStrLenMemCpy(Src, Dst, SrcLen, B)); } Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilderBase &B) { @@ -300,9 +313,11 @@ Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilderBase &B) { if (!FT->getParamType(1)->isIntegerTy(32)) // memchr needs i32. return nullptr; - return emitMemChr(SrcStr, CI->getArgOperand(1), // include nul. - ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len), - B, DL, TLI); + return copyFlags( + *CI, + emitMemChr(SrcStr, CI->getArgOperand(1), // include nul. 
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len), B, + DL, TLI)); } // Otherwise, the character is a constant, see if the first argument is @@ -340,7 +355,7 @@ Value *LibCallSimplifier::optimizeStrRChr(CallInst *CI, IRBuilderBase &B) { if (!getConstantStringInfo(SrcStr, Str)) { // strrchr(s, 0) -> strchr(s, 0) if (CharC->isZero()) - return emitStrChr(SrcStr, '\0', B, TLI); + return copyFlags(*CI, emitStrChr(SrcStr, '\0', B, TLI)); return nullptr; } @@ -385,25 +400,28 @@ Value *LibCallSimplifier::optimizeStrCmp(CallInst *CI, IRBuilderBase &B) { annotateDereferenceableBytes(CI, 1, Len2); if (Len1 && Len2) { - return emitMemCmp(Str1P, Str2P, - ConstantInt::get(DL.getIntPtrType(CI->getContext()), - std::min(Len1, Len2)), - B, DL, TLI); + return copyFlags( + *CI, emitMemCmp(Str1P, Str2P, + ConstantInt::get(DL.getIntPtrType(CI->getContext()), + std::min(Len1, Len2)), + B, DL, TLI)); } // strcmp to memcmp if (!HasStr1 && HasStr2) { if (canTransformToMemCmp(CI, Str1P, Len2, DL)) - return emitMemCmp( - Str1P, Str2P, - ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len2), B, DL, - TLI); + return copyFlags( + *CI, + emitMemCmp(Str1P, Str2P, + ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len2), + B, DL, TLI)); } else if (HasStr1 && !HasStr2) { if (canTransformToMemCmp(CI, Str2P, Len1, DL)) - return emitMemCmp( - Str1P, Str2P, - ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len1), B, DL, - TLI); + return copyFlags( + *CI, + emitMemCmp(Str1P, Str2P, + ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len1), + B, DL, TLI)); } annotateNonNullNoUndefBasedOnAccess(CI, {0, 1}); @@ -430,7 +448,7 @@ Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilderBase &B) { return ConstantInt::get(CI->getType(), 0); if (Length == 1) // strncmp(x,y,1) -> memcmp(x,y,1) - return emitMemCmp(Str1P, Str2P, Size, B, DL, TLI); + return copyFlags(*CI, emitMemCmp(Str1P, Str2P, Size, B, DL, TLI)); StringRef Str1, Str2; bool HasStr1 = getConstantStringInfo(Str1P, 
Str1); @@ -462,17 +480,19 @@ Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilderBase &B) { if (!HasStr1 && HasStr2) { Len2 = std::min(Len2, Length); if (canTransformToMemCmp(CI, Str1P, Len2, DL)) - return emitMemCmp( - Str1P, Str2P, - ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len2), B, DL, - TLI); + return copyFlags( + *CI, + emitMemCmp(Str1P, Str2P, + ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len2), + B, DL, TLI)); } else if (HasStr1 && !HasStr2) { Len1 = std::min(Len1, Length); if (canTransformToMemCmp(CI, Str2P, Len1, DL)) - return emitMemCmp( - Str1P, Str2P, - ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len1), B, DL, - TLI); + return copyFlags( + *CI, + emitMemCmp(Str1P, Str2P, + ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len1), + B, DL, TLI)); } return nullptr; @@ -485,7 +505,7 @@ Value *LibCallSimplifier::optimizeStrNDup(CallInst *CI, IRBuilderBase &B) { if (SrcLen && Size) { annotateDereferenceableBytes(CI, 0, SrcLen); if (SrcLen <= Size->getZExtValue() + 1) - return emitStrDup(Src, B, TLI); + return copyFlags(*CI, emitStrDup(Src, B, TLI)); } return nullptr; @@ -495,7 +515,7 @@ Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilderBase &B) { Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); if (Dst == Src) // strcpy(x,x) -> x return Src; - + annotateNonNullNoUndefBasedOnAccess(CI, {0, 1}); // See if we can get the length of the input string. uint64_t Len = GetStringLength(Src); @@ -511,6 +531,7 @@ Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilderBase &B) { ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len)); NewCI->setAttributes(CI->getAttributes()); NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType())); + copyFlags(*CI, NewCI); return Dst; } @@ -520,7 +541,7 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilderBase &B) { // stpcpy(d,s) -> strcpy(d,s) if the result is not used. 
if (CI->use_empty()) - return emitStrCpy(Dst, Src, B, TLI); + return copyFlags(*CI, emitStrCpy(Dst, Src, B, TLI)); if (Dst == Src) { // stpcpy(x,x) -> x+strlen(x) Value *StrLen = emitStrLen(Src, B, DL, TLI); @@ -544,6 +565,7 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilderBase &B) { CallInst *NewCI = B.CreateMemCpy(Dst, Align(1), Src, Align(1), LenV); NewCI->setAttributes(CI->getAttributes()); NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType())); + copyFlags(*CI, NewCI); return DstEnd; } @@ -583,6 +605,7 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilderBase &B) { AttrBuilder ArgAttrs(CI->getAttributes().getParamAttrs(0)); NewCI->setAttributes(NewCI->getAttributes().addParamAttributes( CI->getContext(), 0, ArgAttrs)); + copyFlags(*CI, NewCI); return Dst; } @@ -606,6 +629,7 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilderBase &B) { ConstantInt::get(DL.getIntPtrType(PT), Len)); NewCI->setAttributes(CI->getAttributes()); NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType())); + copyFlags(*CI, NewCI); return Dst; } @@ -737,7 +761,7 @@ Value *LibCallSimplifier::optimizeStrPBrk(CallInst *CI, IRBuilderBase &B) { // strpbrk(s, "a") -> strchr(s, 'a') if (HasS2 && S2.size() == 1) - return emitStrChr(CI->getArgOperand(0), S2[0], B, TLI); + return copyFlags(*CI, emitStrChr(CI->getArgOperand(0), S2[0], B, TLI)); return nullptr; } @@ -793,7 +817,7 @@ Value *LibCallSimplifier::optimizeStrCSpn(CallInst *CI, IRBuilderBase &B) { // strcspn(s, "") -> strlen(s) if (HasS2 && S2.empty()) - return emitStrLen(CI->getArgOperand(0), B, DL, TLI); + return copyFlags(*CI, emitStrLen(CI->getArgOperand(0), B, DL, TLI)); return nullptr; } @@ -1062,7 +1086,7 @@ Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilderBase &B) { Value *LHS = CI->getArgOperand(0); Value *RHS = CI->getArgOperand(1); Value *Size = CI->getArgOperand(2); - return emitBCmp(LHS, RHS, Size, B, DL, TLI); + return 
copyFlags(*CI, emitBCmp(LHS, RHS, Size, B, DL, TLI)); } return nullptr; @@ -1083,6 +1107,7 @@ Value *LibCallSimplifier::optimizeMemCpy(CallInst *CI, IRBuilderBase &B) { CI->getArgOperand(1), Align(1), Size); NewCI->setAttributes(CI->getAttributes()); NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType())); + copyFlags(*CI, NewCI); return CI->getArgOperand(0); } @@ -1110,7 +1135,8 @@ Value *LibCallSimplifier::optimizeMemCCpy(CallInst *CI, IRBuilderBase &B) { size_t Pos = SrcStr.find(StopChar->getSExtValue() & 0xFF); if (Pos == StringRef::npos) { if (N->getZExtValue() <= SrcStr.size()) { - B.CreateMemCpy(Dst, Align(1), Src, Align(1), CI->getArgOperand(3)); + copyFlags(*CI, B.CreateMemCpy(Dst, Align(1), Src, Align(1), + CI->getArgOperand(3))); return Constant::getNullValue(CI->getType()); } return nullptr; @@ -1119,7 +1145,7 @@ Value *LibCallSimplifier::optimizeMemCCpy(CallInst *CI, IRBuilderBase &B) { Value *NewN = ConstantInt::get(N->getType(), std::min(uint64_t(Pos + 1), N->getZExtValue())); // memccpy -> llvm.memcpy - B.CreateMemCpy(Dst, Align(1), Src, Align(1), NewN); + copyFlags(*CI, B.CreateMemCpy(Dst, Align(1), Src, Align(1), NewN)); return Pos + 1 <= N->getZExtValue() ? B.CreateInBoundsGEP(B.getInt8Ty(), Dst, NewN) : Constant::getNullValue(CI->getType()); @@ -1136,6 +1162,7 @@ Value *LibCallSimplifier::optimizeMemPCpy(CallInst *CI, IRBuilderBase &B) { // TODO: Attach return value attributes to the 1st operand to preserve them? 
NewCI->setAttributes(CI->getAttributes()); NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType())); + copyFlags(*CI, NewCI); return B.CreateInBoundsGEP(B.getInt8Ty(), Dst, N); } @@ -1150,6 +1177,7 @@ Value *LibCallSimplifier::optimizeMemMove(CallInst *CI, IRBuilderBase &B) { CI->getArgOperand(1), Align(1), Size); NewCI->setAttributes(CI->getAttributes()); NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType())); + copyFlags(*CI, NewCI); return CI->getArgOperand(0); } @@ -1164,12 +1192,13 @@ Value *LibCallSimplifier::optimizeMemSet(CallInst *CI, IRBuilderBase &B) { CallInst *NewCI = B.CreateMemSet(CI->getArgOperand(0), Val, Size, Align(1)); NewCI->setAttributes(CI->getAttributes()); NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType())); + copyFlags(*CI, NewCI); return CI->getArgOperand(0); } Value *LibCallSimplifier::optimizeRealloc(CallInst *CI, IRBuilderBase &B) { if (isa<ConstantPointerNull>(CI->getArgOperand(0))) - return emitMalloc(CI->getArgOperand(1), B, DL, TLI); + return copyFlags(*CI, emitMalloc(CI->getArgOperand(1), B, DL, TLI)); return nullptr; } @@ -1190,7 +1219,7 @@ static Value *replaceUnaryCall(CallInst *CI, IRBuilderBase &B, Function *F = Intrinsic::getDeclaration(M, IID, CI->getType()); CallInst *NewCall = B.CreateCall(F, V); NewCall->takeName(CI); - return NewCall; + return copyFlags(*CI, NewCall); } /// Return a variant of Val with float type. 
@@ -1311,7 +1340,8 @@ Value *LibCallSimplifier::optimizeCAbs(CallInst *CI, IRBuilderBase &B) { Function *FSqrt = Intrinsic::getDeclaration(CI->getModule(), Intrinsic::sqrt, CI->getType()); - return B.CreateCall(FSqrt, B.CreateFAdd(RealReal, ImagImag), "cabs"); + return copyFlags( + *CI, B.CreateCall(FSqrt, B.CreateFAdd(RealReal, ImagImag), "cabs")); } static Value *optimizeTrigReflections(CallInst *Call, LibFunc Func, @@ -1334,14 +1364,16 @@ static Value *optimizeTrigReflections(CallInst *Call, LibFunc Func, // sin(-X) --> -sin(X) // tan(-X) --> -tan(X) if (match(Call->getArgOperand(0), m_OneUse(m_FNeg(m_Value(X))))) - return B.CreateFNeg(B.CreateCall(Call->getCalledFunction(), X)); + return B.CreateFNeg( + copyFlags(*Call, B.CreateCall(Call->getCalledFunction(), X))); break; case LibFunc_cos: case LibFunc_cosf: case LibFunc_cosl: // cos(-X) --> cos(X) if (match(Call->getArgOperand(0), m_FNeg(m_Value(X)))) - return B.CreateCall(Call->getCalledFunction(), X, "cos"); + return copyFlags(*Call, + B.CreateCall(Call->getCalledFunction(), X, "cos")); break; default: break; @@ -1476,9 +1508,10 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) { (isa<SIToFPInst>(Expo) || isa<UIToFPInst>(Expo)) && hasFloatFn(TLI, Ty, LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl)) { if (Value *ExpoI = getIntToFPVal(Expo, B, TLI->getIntSize())) - return emitBinaryFloatFnCall(ConstantFP::get(Ty, 1.0), ExpoI, TLI, - LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl, - B, Attrs); + return copyFlags(*Pow, + emitBinaryFloatFnCall(ConstantFP::get(Ty, 1.0), ExpoI, + TLI, LibFunc_ldexp, LibFunc_ldexpf, + LibFunc_ldexpl, B, Attrs)); } // pow(2.0 ** n, x) -> exp2(n * x) @@ -1496,11 +1529,13 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) { double N = NI.logBase2() * (IsReciprocal ? 
-1.0 : 1.0); Value *FMul = B.CreateFMul(Expo, ConstantFP::get(Ty, N), "mul"); if (Pow->doesNotAccessMemory()) - return B.CreateCall(Intrinsic::getDeclaration(Mod, Intrinsic::exp2, Ty), - FMul, "exp2"); + return copyFlags(*Pow, B.CreateCall(Intrinsic::getDeclaration( + Mod, Intrinsic::exp2, Ty), + FMul, "exp2")); else - return emitUnaryFloatFnCall(FMul, TLI, LibFunc_exp2, LibFunc_exp2f, - LibFunc_exp2l, B, Attrs); + return copyFlags(*Pow, emitUnaryFloatFnCall(FMul, TLI, LibFunc_exp2, + LibFunc_exp2f, + LibFunc_exp2l, B, Attrs)); } } @@ -1508,8 +1543,9 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) { // TODO: There is no exp10() intrinsic yet, but some day there shall be one. if (match(Base, m_SpecificFP(10.0)) && hasFloatFn(TLI, Ty, LibFunc_exp10, LibFunc_exp10f, LibFunc_exp10l)) - return emitUnaryFloatFnCall(Expo, TLI, LibFunc_exp10, LibFunc_exp10f, - LibFunc_exp10l, B, Attrs); + return copyFlags(*Pow, emitUnaryFloatFnCall(Expo, TLI, LibFunc_exp10, + LibFunc_exp10f, LibFunc_exp10l, + B, Attrs)); // pow(x, y) -> exp2(log2(x) * y) if (Pow->hasApproxFunc() && Pow->hasNoNaNs() && BaseF->isFiniteNonZero() && @@ -1528,11 +1564,13 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) { if (Log) { Value *FMul = B.CreateFMul(Log, Expo, "mul"); if (Pow->doesNotAccessMemory()) - return B.CreateCall(Intrinsic::getDeclaration(Mod, Intrinsic::exp2, Ty), - FMul, "exp2"); + return copyFlags(*Pow, B.CreateCall(Intrinsic::getDeclaration( + Mod, Intrinsic::exp2, Ty), + FMul, "exp2")); else if (hasFloatFn(TLI, Ty, LibFunc_exp2, LibFunc_exp2f, LibFunc_exp2l)) - return emitUnaryFloatFnCall(FMul, TLI, LibFunc_exp2, LibFunc_exp2f, - LibFunc_exp2l, B, Attrs); + return copyFlags(*Pow, emitUnaryFloatFnCall(FMul, TLI, LibFunc_exp2, + LibFunc_exp2f, + LibFunc_exp2l, B, Attrs)); } } @@ -1595,6 +1633,8 @@ Value *LibCallSimplifier::replacePowWithSqrt(CallInst *Pow, IRBuilderBase &B) { Sqrt = B.CreateCall(FAbsFn, Sqrt, "abs"); } + Sqrt = 
copyFlags(*Pow, Sqrt); + // Handle non finite base by expanding to // (x == -infinity ? +infinity : sqrt(x)). if (!Pow->hasNoInfs()) { @@ -1721,15 +1761,18 @@ Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilderBase &B) { if (ExpoF->isInteger() && ExpoF->convertToInteger(IntExpo, APFloat::rmTowardZero, &Ignored) == APFloat::opOK) { - return createPowWithIntegerExponent( - Base, ConstantInt::get(B.getIntNTy(TLI->getIntSize()), IntExpo), M, B); + return copyFlags( + *Pow, + createPowWithIntegerExponent( + Base, ConstantInt::get(B.getIntNTy(TLI->getIntSize()), IntExpo), + M, B)); } } // powf(x, itofp(y)) -> powi(x, y) if (AllowApprox && (isa<SIToFPInst>(Expo) || isa<UIToFPInst>(Expo))) { if (Value *ExpoI = getIntToFPVal(Expo, B, TLI->getIntSize())) - return createPowWithIntegerExponent(Base, ExpoI, M, B); + return copyFlags(*Pow, createPowWithIntegerExponent(Base, ExpoI, M, B)); } // Shrink pow() to powf() if the arguments are single precision, @@ -1792,7 +1835,8 @@ Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilderBase &B) { Intrinsic::ID IID = Callee->getName().startswith("fmin") ? Intrinsic::minnum : Intrinsic::maxnum; Function *F = Intrinsic::getDeclaration(CI->getModule(), IID, CI->getType()); - return B.CreateCall(F, { CI->getArgOperand(0), CI->getArgOperand(1) }); + return copyFlags( + *CI, B.CreateCall(F, {CI->getArgOperand(0), CI->getArgOperand(1)})); } Value *LibCallSimplifier::optimizeLog(CallInst *Log, IRBuilderBase &B) { @@ -2010,9 +2054,9 @@ Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilderBase &B) { // of the square root calculation. Function *Sqrt = Intrinsic::getDeclaration(M, Intrinsic::sqrt, ArgType); Value *SqrtCall = B.CreateCall(Sqrt, OtherOp, "sqrt"); - return B.CreateFMul(FabsCall, SqrtCall); + return copyFlags(*CI, B.CreateFMul(FabsCall, SqrtCall)); } - return FabsCall; + return copyFlags(*CI, FabsCall); } // TODO: Generalize to handle any trig function and its inverse. 
@@ -2327,7 +2371,7 @@ Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilderBase &B) { // printf("x") -> putchar('x'), even for "%" and "%%". if (FormatStr.size() == 1 || FormatStr == "%%") - return emitPutChar(B.getInt32(FormatStr[0]), B, TLI); + return copyFlags(*CI, emitPutChar(B.getInt32(FormatStr[0]), B, TLI)); // Try to remove call or emit putchar/puts. if (FormatStr == "%s" && CI->arg_size() > 1) { @@ -2339,12 +2383,12 @@ Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilderBase &B) { return (Value *)CI; // printf("%s", "a") --> putchar('a') if (OperandStr.size() == 1) - return emitPutChar(B.getInt32(OperandStr[0]), B, TLI); + return copyFlags(*CI, emitPutChar(B.getInt32(OperandStr[0]), B, TLI)); // printf("%s", str"\n") --> puts(str) if (OperandStr.back() == '\n') { OperandStr = OperandStr.drop_back(); Value *GV = B.CreateGlobalString(OperandStr, "str"); - return emitPutS(GV, B, TLI); + return copyFlags(*CI, emitPutS(GV, B, TLI)); } return nullptr; } @@ -2356,19 +2400,19 @@ Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilderBase &B) { // pass to be run after this pass, to merge duplicate strings. FormatStr = FormatStr.drop_back(); Value *GV = B.CreateGlobalString(FormatStr, "str"); - return emitPutS(GV, B, TLI); + return copyFlags(*CI, emitPutS(GV, B, TLI)); } // Optimize specific format strings. 
// printf("%c", chr) --> putchar(chr) if (FormatStr == "%c" && CI->arg_size() > 1 && CI->getArgOperand(1)->getType()->isIntegerTy()) - return emitPutChar(CI->getArgOperand(1), B, TLI); + return copyFlags(*CI, emitPutChar(CI->getArgOperand(1), B, TLI)); // printf("%s\n", str) --> puts(str) if (FormatStr == "%s\n" && CI->arg_size() > 1 && CI->getArgOperand(1)->getType()->isPointerTy()) - return emitPutS(CI->getArgOperand(1), B, TLI); + return copyFlags(*CI, emitPutS(CI->getArgOperand(1), B, TLI)); return nullptr; } @@ -2459,7 +2503,7 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, if (CI->use_empty()) // sprintf(dest, "%s", str) -> strcpy(dest, str) - return emitStrCpy(Dest, CI->getArgOperand(2), B, TLI); + return copyFlags(*CI, emitStrCpy(Dest, CI->getArgOperand(2), B, TLI)); uint64_t SrcLen = GetStringLength(CI->getArgOperand(2)); if (SrcLen) { @@ -2558,10 +2602,12 @@ Value *LibCallSimplifier::optimizeSnPrintFString(CallInst *CI, // snprintf(dst, size, fmt) -> llvm.memcpy(align 1 dst, align 1 fmt, // strlen(fmt)+1) - B.CreateMemCpy( - CI->getArgOperand(0), Align(1), CI->getArgOperand(2), Align(1), - ConstantInt::get(DL.getIntPtrType(CI->getContext()), - FormatStr.size() + 1)); // Copy the null byte. + copyFlags( + *CI, + B.CreateMemCpy( + CI->getArgOperand(0), Align(1), CI->getArgOperand(2), Align(1), + ConstantInt::get(DL.getIntPtrType(CI->getContext()), + FormatStr.size() + 1))); // Copy the null byte. return ConstantInt::get(CI->getType(), FormatStr.size()); } @@ -2599,8 +2645,10 @@ Value *LibCallSimplifier::optimizeSnPrintFString(CallInst *CI, else if (N < Str.size() + 1) return nullptr; - B.CreateMemCpy(CI->getArgOperand(0), Align(1), CI->getArgOperand(3), - Align(1), ConstantInt::get(CI->getType(), Str.size() + 1)); + copyFlags( + *CI, B.CreateMemCpy(CI->getArgOperand(0), Align(1), + CI->getArgOperand(3), Align(1), + ConstantInt::get(CI->getType(), Str.size() + 1))); // The snprintf result is the unincremented number of bytes in the string. 
return ConstantInt::get(CI->getType(), Str.size()); @@ -2640,10 +2688,11 @@ Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI, if (FormatStr.contains('%')) return nullptr; // We found a format specifier. - return emitFWrite( - CI->getArgOperand(1), - ConstantInt::get(DL.getIntPtrType(CI->getContext()), FormatStr.size()), - CI->getArgOperand(0), B, DL, TLI); + return copyFlags( + *CI, emitFWrite(CI->getArgOperand(1), + ConstantInt::get(DL.getIntPtrType(CI->getContext()), + FormatStr.size()), + CI->getArgOperand(0), B, DL, TLI)); } // The remaining optimizations require the format string to be "%s" or "%c" @@ -2656,14 +2705,16 @@ Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI, // fprintf(F, "%c", chr) --> fputc(chr, F) if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return nullptr; - return emitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B, TLI); + return copyFlags( + *CI, emitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B, TLI)); } if (FormatStr[1] == 's') { // fprintf(F, "%s", str) --> fputs(str, F) if (!CI->getArgOperand(2)->getType()->isPointerTy()) return nullptr; - return emitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TLI); + return copyFlags( + *CI, emitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TLI)); } return nullptr; } @@ -2750,10 +2801,11 @@ Value *LibCallSimplifier::optimizeFPuts(CallInst *CI, IRBuilderBase &B) { return nullptr; // Known to have no uses (see above). 
- return emitFWrite( - CI->getArgOperand(0), - ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len - 1), - CI->getArgOperand(1), B, DL, TLI); + return copyFlags( + *CI, + emitFWrite(CI->getArgOperand(0), + ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len - 1), + CI->getArgOperand(1), B, DL, TLI)); } Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilderBase &B) { @@ -2765,15 +2817,16 @@ Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilderBase &B) { // puts("") -> putchar('\n') StringRef Str; if (getConstantStringInfo(CI->getArgOperand(0), Str) && Str.empty()) - return emitPutChar(B.getInt32('\n'), B, TLI); + return copyFlags(*CI, emitPutChar(B.getInt32('\n'), B, TLI)); return nullptr; } Value *LibCallSimplifier::optimizeBCopy(CallInst *CI, IRBuilderBase &B) { // bcopy(src, dst, n) -> llvm.memmove(dst, src, n) - return B.CreateMemMove(CI->getArgOperand(1), Align(1), CI->getArgOperand(0), - Align(1), CI->getArgOperand(2)); + return copyFlags(*CI, B.CreateMemMove(CI->getArgOperand(1), Align(1), + CI->getArgOperand(0), Align(1), + CI->getArgOperand(2))); } bool LibCallSimplifier::hasFloatVersion(StringRef FuncName) { @@ -2971,6 +3024,8 @@ Value *LibCallSimplifier::optimizeFloatingPointLibCall(CallInst *CI, } Value *LibCallSimplifier::optimizeCall(CallInst *CI, IRBuilderBase &Builder) { + assert(!CI->isMustTailCall() && "These transforms aren't musttail safe."); + // TODO: Split out the code below that operates on FP calls so that // we can all non-FP calls with the StrictFP attribute to be // optimized. 
@@ -3212,6 +3267,7 @@ Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI, Align(1), CI->getArgOperand(2)); NewCI->setAttributes(CI->getAttributes()); NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType())); + copyFlags(*CI, NewCI); return CI->getArgOperand(0); } return nullptr; @@ -3225,6 +3281,7 @@ Value *FortifiedLibCallSimplifier::optimizeMemMoveChk(CallInst *CI, Align(1), CI->getArgOperand(2)); NewCI->setAttributes(CI->getAttributes()); NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType())); + copyFlags(*CI, NewCI); return CI->getArgOperand(0); } return nullptr; @@ -3238,6 +3295,7 @@ Value *FortifiedLibCallSimplifier::optimizeMemSetChk(CallInst *CI, CI->getArgOperand(2), Align(1)); NewCI->setAttributes(CI->getAttributes()); NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType())); + copyFlags(*CI, NewCI); return CI->getArgOperand(0); } return nullptr; @@ -3252,7 +3310,7 @@ Value *FortifiedLibCallSimplifier::optimizeMemPCpyChk(CallInst *CI, CallInst *NewCI = cast<CallInst>(Call); NewCI->setAttributes(CI->getAttributes()); NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType())); - return NewCI; + return copyFlags(*CI, NewCI); } return nullptr; } @@ -3277,9 +3335,9 @@ Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI, // string lengths for varying. if (isFortifiedCallFoldable(CI, 2, None, 1)) { if (Func == LibFunc_strcpy_chk) - return emitStrCpy(Dst, Src, B, TLI); + return copyFlags(*CI, emitStrCpy(Dst, Src, B, TLI)); else - return emitStpCpy(Dst, Src, B, TLI); + return copyFlags(*CI, emitStpCpy(Dst, Src, B, TLI)); } if (OnlyLowerUnknownSize) @@ -3303,14 +3361,14 @@ Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI, // a __memcpy_chk, we still need to return the correct end pointer. 
if (Ret && Func == LibFunc_stpcpy_chk) return B.CreateGEP(B.getInt8Ty(), Dst, ConstantInt::get(SizeTTy, Len - 1)); - return Ret; + return copyFlags(*CI, cast<CallInst>(Ret)); } Value *FortifiedLibCallSimplifier::optimizeStrLenChk(CallInst *CI, IRBuilderBase &B) { if (isFortifiedCallFoldable(CI, 1, None, 0)) - return emitStrLen(CI->getArgOperand(0), B, CI->getModule()->getDataLayout(), - TLI); + return copyFlags(*CI, emitStrLen(CI->getArgOperand(0), B, + CI->getModule()->getDataLayout(), TLI)); return nullptr; } @@ -3319,11 +3377,13 @@ Value *FortifiedLibCallSimplifier::optimizeStrpNCpyChk(CallInst *CI, LibFunc Func) { if (isFortifiedCallFoldable(CI, 3, 2)) { if (Func == LibFunc_strncpy_chk) - return emitStrNCpy(CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), B, TLI); + return copyFlags(*CI, + emitStrNCpy(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), B, TLI)); else - return emitStpNCpy(CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), B, TLI); + return copyFlags(*CI, + emitStpNCpy(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), B, TLI)); } return nullptr; @@ -3332,8 +3392,9 @@ Value *FortifiedLibCallSimplifier::optimizeStrpNCpyChk(CallInst *CI, Value *FortifiedLibCallSimplifier::optimizeMemCCpyChk(CallInst *CI, IRBuilderBase &B) { if (isFortifiedCallFoldable(CI, 4, 3)) - return emitMemCCpy(CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), CI->getArgOperand(3), B, TLI); + return copyFlags( + *CI, emitMemCCpy(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), CI->getArgOperand(3), B, TLI)); return nullptr; } @@ -3342,8 +3403,9 @@ Value *FortifiedLibCallSimplifier::optimizeSNPrintfChk(CallInst *CI, IRBuilderBase &B) { if (isFortifiedCallFoldable(CI, 3, 1, None, 2)) { SmallVector<Value *, 8> VariadicArgs(drop_begin(CI->args(), 5)); - return emitSNPrintf(CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(4), VariadicArgs, B, TLI); + return 
copyFlags(*CI, + emitSNPrintf(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(4), VariadicArgs, B, TLI)); } return nullptr; @@ -3353,8 +3415,9 @@ Value *FortifiedLibCallSimplifier::optimizeSPrintfChk(CallInst *CI, IRBuilderBase &B) { if (isFortifiedCallFoldable(CI, 2, None, None, 1)) { SmallVector<Value *, 8> VariadicArgs(drop_begin(CI->args(), 4)); - return emitSPrintf(CI->getArgOperand(0), CI->getArgOperand(3), VariadicArgs, - B, TLI); + return copyFlags(*CI, + emitSPrintf(CI->getArgOperand(0), CI->getArgOperand(3), + VariadicArgs, B, TLI)); } return nullptr; @@ -3363,7 +3426,8 @@ Value *FortifiedLibCallSimplifier::optimizeSPrintfChk(CallInst *CI, Value *FortifiedLibCallSimplifier::optimizeStrCatChk(CallInst *CI, IRBuilderBase &B) { if (isFortifiedCallFoldable(CI, 2)) - return emitStrCat(CI->getArgOperand(0), CI->getArgOperand(1), B, TLI); + return copyFlags( + *CI, emitStrCat(CI->getArgOperand(0), CI->getArgOperand(1), B, TLI)); return nullptr; } @@ -3371,8 +3435,9 @@ Value *FortifiedLibCallSimplifier::optimizeStrCatChk(CallInst *CI, Value *FortifiedLibCallSimplifier::optimizeStrLCat(CallInst *CI, IRBuilderBase &B) { if (isFortifiedCallFoldable(CI, 3)) - return emitStrLCat(CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), B, TLI); + return copyFlags(*CI, + emitStrLCat(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), B, TLI)); return nullptr; } @@ -3380,8 +3445,9 @@ Value *FortifiedLibCallSimplifier::optimizeStrLCat(CallInst *CI, Value *FortifiedLibCallSimplifier::optimizeStrNCatChk(CallInst *CI, IRBuilderBase &B) { if (isFortifiedCallFoldable(CI, 3)) - return emitStrNCat(CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), B, TLI); + return copyFlags(*CI, + emitStrNCat(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), B, TLI)); return nullptr; } @@ -3389,8 +3455,9 @@ Value *FortifiedLibCallSimplifier::optimizeStrNCatChk(CallInst *CI, Value 
*FortifiedLibCallSimplifier::optimizeStrLCpyChk(CallInst *CI, IRBuilderBase &B) { if (isFortifiedCallFoldable(CI, 3)) - return emitStrLCpy(CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), B, TLI); + return copyFlags(*CI, + emitStrLCpy(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), B, TLI)); return nullptr; } @@ -3398,8 +3465,9 @@ Value *FortifiedLibCallSimplifier::optimizeStrLCpyChk(CallInst *CI, Value *FortifiedLibCallSimplifier::optimizeVSNPrintfChk(CallInst *CI, IRBuilderBase &B) { if (isFortifiedCallFoldable(CI, 3, 1, None, 2)) - return emitVSNPrintf(CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(4), CI->getArgOperand(5), B, TLI); + return copyFlags( + *CI, emitVSNPrintf(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(4), CI->getArgOperand(5), B, TLI)); return nullptr; } @@ -3407,8 +3475,9 @@ Value *FortifiedLibCallSimplifier::optimizeVSNPrintfChk(CallInst *CI, Value *FortifiedLibCallSimplifier::optimizeVSPrintfChk(CallInst *CI, IRBuilderBase &B) { if (isFortifiedCallFoldable(CI, 2, None, None, 1)) - return emitVSPrintf(CI->getArgOperand(0), CI->getArgOperand(3), - CI->getArgOperand(4), B, TLI); + return copyFlags(*CI, + emitVSPrintf(CI->getArgOperand(0), CI->getArgOperand(3), + CI->getArgOperand(4), B, TLI)); return nullptr; } diff --git a/llvm/lib/Transforms/Utils/ValueMapper.cpp b/llvm/lib/Transforms/Utils/ValueMapper.cpp index c3eafd6b2492..b822db938af8 100644 --- a/llvm/lib/Transforms/Utils/ValueMapper.cpp +++ b/llvm/lib/Transforms/Utils/ValueMapper.cpp @@ -450,6 +450,12 @@ Value *Mapper::mapValue(const Value *V) { DSOLocalEquivalent::get(Func), NewTy); } + if (const auto *NC = dyn_cast<NoCFIValue>(C)) { + auto *Val = mapValue(NC->getGlobalValue()); + GlobalValue *GV = cast<GlobalValue>(Val); + return getVM()[NC] = NoCFIValue::get(GV); + } + auto mapValueOrNull = [this](Value *V) { auto Mapped = mapValue(V); assert((Mapped || (Flags & RF_NullMapMissingGlobalValues)) && diff --git 
a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 805011191da0..81e5aa223c07 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -55,22 +55,23 @@ static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold( cl::desc("The maximum number of SCEV checks allowed with a " "vectorize(enable) pragma")); -// FIXME: When scalable vectorization is stable enough, change the default -// to SK_PreferFixedWidth. -static cl::opt<LoopVectorizeHints::ScalableForceKind> ScalableVectorization( - "scalable-vectorization", cl::init(LoopVectorizeHints::SK_FixedWidthOnly), - cl::Hidden, - cl::desc("Control whether the compiler can use scalable vectors to " - "vectorize a loop"), - cl::values( - clEnumValN(LoopVectorizeHints::SK_FixedWidthOnly, "off", - "Scalable vectorization is disabled."), - clEnumValN(LoopVectorizeHints::SK_PreferFixedWidth, "on", - "Scalable vectorization is available, but favor fixed-width " - "vectorization when the cost is inconclusive."), - clEnumValN(LoopVectorizeHints::SK_PreferScalable, "preferred", - "Scalable vectorization is available and favored when the " - "cost is inconclusive."))); +static cl::opt<LoopVectorizeHints::ScalableForceKind> + ForceScalableVectorization( + "scalable-vectorization", cl::init(LoopVectorizeHints::SK_Unspecified), + cl::Hidden, + cl::desc("Control whether the compiler can use scalable vectors to " + "vectorize a loop"), + cl::values( + clEnumValN(LoopVectorizeHints::SK_FixedWidthOnly, "off", + "Scalable vectorization is disabled."), + clEnumValN( + LoopVectorizeHints::SK_PreferScalable, "preferred", + "Scalable vectorization is available and favored when the " + "cost is inconclusive."), + clEnumValN( + LoopVectorizeHints::SK_PreferScalable, "on", + "Scalable vectorization is available and favored when the " + "cost is inconclusive."))); /// Maximum vectorization 
interleave count. static const unsigned MaxInterleaveFactor = 16; @@ -95,7 +96,8 @@ bool LoopVectorizeHints::Hint::validate(unsigned Val) { LoopVectorizeHints::LoopVectorizeHints(const Loop *L, bool InterleaveOnlyWhenForced, - OptimizationRemarkEmitter &ORE) + OptimizationRemarkEmitter &ORE, + const TargetTransformInfo *TTI) : Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH), Interleave("interleave.count", InterleaveOnlyWhenForced, HK_INTERLEAVE), Force("vectorize.enable", FK_Undefined, HK_FORCE), @@ -110,14 +112,32 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L, if (VectorizerParams::isInterleaveForced()) Interleave.Value = VectorizerParams::VectorizationInterleave; + // If the metadata doesn't explicitly specify whether to enable scalable + // vectorization, then decide based on the following criteria (increasing + // level of priority): + // - Target default + // - Metadata width + // - Force option (always overrides) + if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified) { + if (TTI) + Scalable.Value = TTI->enableScalableVectorization() ? SK_PreferScalable + : SK_FixedWidthOnly; + + if (Width.Value) + // If the width is set, but the metadata says nothing about the scalable + // property, then assume it concerns only a fixed-width UserVF. + // If width is not set, the flag takes precedence. + Scalable.Value = SK_FixedWidthOnly; + } + + // If the flag is set to force any use of scalable vectors, override the loop + // hints. + if (ForceScalableVectorization.getValue() != + LoopVectorizeHints::SK_Unspecified) + Scalable.Value = ForceScalableVectorization.getValue(); + + // Scalable vectorization is disabled if no preference is specified. if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified) - // If the width is set, but the metadata says nothing about the scalable - // property, then assume it concerns only a fixed-width UserVF. - // If width is not set, the flag takes precedence. 
- Scalable.Value = Width.Value ? SK_FixedWidthOnly : ScalableVectorization; - else if (ScalableVectorization == SK_FixedWidthOnly) - // If the flag is set to disable any use of scalable vectors, override the - // loop hint. Scalable.Value = SK_FixedWidthOnly; if (IsVectorized.Value != 1) @@ -929,7 +949,7 @@ bool LoopVectorizationLegality::canVectorizeFPMath( })); } -bool LoopVectorizationLegality::isInductionPhi(const Value *V) { +bool LoopVectorizationLegality::isInductionPhi(const Value *V) const { Value *In0 = const_cast<Value *>(V); PHINode *PN = dyn_cast_or_null<PHINode>(In0); if (!PN) @@ -938,16 +958,29 @@ bool LoopVectorizationLegality::isInductionPhi(const Value *V) { return Inductions.count(PN); } -bool LoopVectorizationLegality::isCastedInductionVariable(const Value *V) { +const InductionDescriptor * +LoopVectorizationLegality::getIntOrFpInductionDescriptor(PHINode *Phi) const { + if (!isInductionPhi(Phi)) + return nullptr; + auto &ID = getInductionVars().find(Phi)->second; + if (ID.getKind() == InductionDescriptor::IK_IntInduction || + ID.getKind() == InductionDescriptor::IK_FpInduction) + return &ID; + return nullptr; +} + +bool LoopVectorizationLegality::isCastedInductionVariable( + const Value *V) const { auto *Inst = dyn_cast<Instruction>(V); return (Inst && InductionCastsToIgnore.count(Inst)); } -bool LoopVectorizationLegality::isInductionVariable(const Value *V) { +bool LoopVectorizationLegality::isInductionVariable(const Value *V) const { return isInductionPhi(V) || isCastedInductionVariable(V); } -bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) { +bool LoopVectorizationLegality::isFirstOrderRecurrence( + const PHINode *Phi) const { return FirstOrderRecurrences.count(Phi); } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index a7d6609f8c56..71eb39a18d2f 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ 
b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -45,16 +45,17 @@ class VPBuilder { VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator(); VPInstruction *createInstruction(unsigned Opcode, - ArrayRef<VPValue *> Operands) { - VPInstruction *Instr = new VPInstruction(Opcode, Operands); + ArrayRef<VPValue *> Operands, DebugLoc DL) { + VPInstruction *Instr = new VPInstruction(Opcode, Operands, DL); if (BB) BB->insert(Instr, InsertPt); return Instr; } VPInstruction *createInstruction(unsigned Opcode, - std::initializer_list<VPValue *> Operands) { - return createInstruction(Opcode, ArrayRef<VPValue *>(Operands)); + std::initializer_list<VPValue *> Operands, + DebugLoc DL) { + return createInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL); } public: @@ -123,30 +124,33 @@ public: /// its underlying Instruction. VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands, Instruction *Inst = nullptr) { - VPInstruction *NewVPInst = createInstruction(Opcode, Operands); + DebugLoc DL; + if (Inst) + DL = Inst->getDebugLoc(); + VPInstruction *NewVPInst = createInstruction(Opcode, Operands, DL); NewVPInst->setUnderlyingValue(Inst); return NewVPInst; } - VPValue *createNaryOp(unsigned Opcode, - std::initializer_list<VPValue *> Operands, - Instruction *Inst = nullptr) { - return createNaryOp(Opcode, ArrayRef<VPValue *>(Operands), Inst); + VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands, + DebugLoc DL) { + return createInstruction(Opcode, Operands, DL); } - VPValue *createNot(VPValue *Operand) { - return createInstruction(VPInstruction::Not, {Operand}); + VPValue *createNot(VPValue *Operand, DebugLoc DL) { + return createInstruction(VPInstruction::Not, {Operand}, DL); } - VPValue *createAnd(VPValue *LHS, VPValue *RHS) { - return createInstruction(Instruction::BinaryOps::And, {LHS, RHS}); + VPValue *createAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL) { + return createInstruction(Instruction::BinaryOps::And, {LHS, RHS}, DL); } - 
VPValue *createOr(VPValue *LHS, VPValue *RHS) { - return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS}); + VPValue *createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL) { + return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS}, DL); } - VPValue *createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal) { - return createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal}); + VPValue *createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, + DebugLoc DL) { + return createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal}, DL); } //===--------------------------------------------------------------------===// diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 5ca0adb4242c..4747f34fcc62 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -428,6 +428,8 @@ class GeneratedRTChecks; namespace llvm { +AnalysisKey ShouldRunExtraVectorPasses::Key; + /// InnerLoopVectorizer vectorizes loops which contain only one basic /// block to a specified vectorization factor (VF). /// This class performs the widening of scalars into vectors, or multiple @@ -506,8 +508,8 @@ public: /// Widen an integer or floating-point induction variable \p IV. If \p Trunc /// is provided, the integer induction variable will first be truncated to /// the corresponding type. - void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc, - VPValue *Def, VPValue *CastDef, + void widenIntOrFpInduction(PHINode *IV, const InductionDescriptor &ID, + Value *Start, TruncInst *Trunc, VPValue *Def, VPTransformState &State); /// Construct the vector value of a scalarized value \p V one lane at a time. @@ -534,7 +536,7 @@ public: /// Returns true if the reordering of FP operations is not allowed, but we are /// able to vectorize with strict in-order reductions for the given RdxDesc. 
- bool useOrderedReductions(RecurrenceDescriptor &RdxDesc); + bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc); /// Create a broadcast instruction. This method generates a broadcast /// instruction (shuffle) for loop invariant values and for the induction @@ -619,7 +621,7 @@ protected: /// can also be a truncate instruction. void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, const InductionDescriptor &ID, VPValue *Def, - VPValue *CastDef, VPTransformState &State); + VPTransformState &State); /// Create a vector induction phi node based on an existing scalar one. \p /// EntryVal is the value from the original loop that maps to the vector phi @@ -629,7 +631,6 @@ protected: void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, Value *Step, Value *Start, Instruction *EntryVal, VPValue *Def, - VPValue *CastDef, VPTransformState &State); /// Returns true if an instruction \p I should be scalarized instead of @@ -639,29 +640,6 @@ protected: /// Returns true if we should generate a scalar version of \p IV. bool needsScalarInduction(Instruction *IV) const; - /// If there is a cast involved in the induction variable \p ID, which should - /// be ignored in the vectorized loop body, this function records the - /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the - /// cast. We had already proved that the casted Phi is equal to the uncasted - /// Phi in the vectorized loop (under a runtime guard), and therefore - /// there is no need to vectorize the cast - the same value can be used in the - /// vector loop for both the Phi and the cast. - /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified, - /// Otherwise, \p VectorLoopValue is a widened/vectorized value. 
- /// - /// \p EntryVal is the value from the original loop that maps to the vector - /// phi node and is used to distinguish what is the IV currently being - /// processed - original one (if \p EntryVal is a phi corresponding to the - /// original IV) or the "newly-created" one based on the proof mentioned above - /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the - /// latter case \p EntryVal is a TruncInst and we must not record anything for - /// that IV, but it's error-prone to expect callers of this routine to care - /// about that, hence this explicit parameter. - void recordVectorLoopValueForInductionCast( - const InductionDescriptor &ID, const Instruction *EntryVal, - Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State, - unsigned Part, unsigned Lane = UINT_MAX); - /// Generate a shuffle sequence that will reverse the vector Vec. virtual Value *reverseVector(Value *Vec); @@ -698,7 +676,8 @@ protected: /// flags, which can be found from the original scalar operations. Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, - const InductionDescriptor &ID) const; + const InductionDescriptor &ID, + BasicBlock *VectorHeader) const; /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, /// vector loop preheader, middle block and scalar preheader. Also @@ -1728,7 +1707,8 @@ private: /// disabled or unsupported, then the scalable part will be equal to /// ElementCount::getScalable(0). FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount, - ElementCount UserVF); + ElementCount UserVF, + bool FoldTailByMasking); /// \return the maximized element count based on the targets vector /// registers and the loop trip-count, but limited to a maximum safe VF. 
@@ -1741,7 +1721,8 @@ private: ElementCount getMaximizedVFForTarget(unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, - const ElementCount &MaxSafeVF); + const ElementCount &MaxSafeVF, + bool FoldTailByMasking); /// \return the maximum legal scalable VF, based on the safe max number /// of elements. @@ -2356,8 +2337,8 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( const InductionDescriptor &II, Value *Step, Value *Start, - Instruction *EntryVal, VPValue *Def, VPValue *CastDef, - VPTransformState &State) { + Instruction *EntryVal, VPValue *Def, VPTransformState &State) { + IRBuilder<> &Builder = State.Builder; assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && "Expected either an induction phi-node or a truncate of it!"); @@ -2373,7 +2354,7 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( } Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); - Value *SplatStart = Builder.CreateVectorSplat(VF, Start); + Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); Value *SteppedStart = getStepVector(SplatStart, Zero, Step, II.getInductionOpcode()); @@ -2394,9 +2375,9 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( Type *StepType = Step->getType(); Value *RuntimeVF; if (Step->getType()->isFloatingPointTy()) - RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, VF); + RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); else - RuntimeVF = getRuntimeVF(Builder, StepType, VF); + RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); // Create a vector splat to use in the induction update. @@ -2405,8 +2386,8 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't // handle a constant vector splat. Value *SplatVF = isa<Constant>(Mul) - ? 
ConstantVector::getSplat(VF, cast<Constant>(Mul)) - : Builder.CreateVectorSplat(VF, Mul); + ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) + : Builder.CreateVectorSplat(State.VF, Mul); Builder.restoreIP(CurrIP); // We may need to add the step a number of times, depending on the unroll @@ -2420,8 +2401,6 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( if (isa<TruncInst>(EntryVal)) addMetadata(LastInduction, EntryVal); - recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef, - State, Part); LastInduction = cast<Instruction>( Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); @@ -2455,56 +2434,21 @@ bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { return llvm::any_of(IV->users(), isScalarInst); } -void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( - const InductionDescriptor &ID, const Instruction *EntryVal, - Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State, - unsigned Part, unsigned Lane) { - assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && - "Expected either an induction phi-node or a truncate of it!"); - - // This induction variable is not the phi from the original loop but the - // newly-created IV based on the proof that casted Phi is equal to the - // uncasted Phi in the vectorized loop (under a runtime guard possibly). It - // re-uses the same InductionDescriptor that original IV uses but we don't - // have to do any recording in this case - that is done when original IV is - // processed. - if (isa<TruncInst>(EntryVal)) - return; - - if (!CastDef) { - assert(ID.getCastInsts().empty() && - "there are casts for ID, but no CastDef"); - return; - } - assert(!ID.getCastInsts().empty() && - "there is a CastDef, but no casts for ID"); - // Only the first Cast instruction in the Casts vector is of interest. - // The rest of the Casts (if exist) have no uses outside the - // induction update chain itself. 
- if (Lane < UINT_MAX) - State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane)); - else - State.set(CastDef, VectorLoopVal, Part); -} - -void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, - TruncInst *Trunc, VPValue *Def, - VPValue *CastDef, +void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, + const InductionDescriptor &ID, + Value *Start, TruncInst *Trunc, + VPValue *Def, VPTransformState &State) { + IRBuilder<> &Builder = State.Builder; assert((IV->getType()->isIntegerTy() || IV != OldInduction) && "Primary induction variable must have an integer type"); - - auto II = Legal->getInductionVars().find(IV); - assert(II != Legal->getInductionVars().end() && "IV is not an induction"); - - auto ID = II->second; assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); // The value from the original loop to which we are mapping the new induction // variable. Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; - auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); + auto &DL = EntryVal->getModule()->getDataLayout(); // Generate code for the induction step. Note that induction steps are // required to be loop-invariant @@ -2514,7 +2458,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, if (PSE.getSE()->isSCEVable(IV->getType())) { SCEVExpander Exp(*PSE.getSE(), DL, "induction"); return Exp.expandCodeFor(Step, Step->getType(), - LoopVectorPreHeader->getTerminator()); + State.CFG.VectorPreHeader->getTerminator()); } return cast<SCEVUnknown>(Step)->getValue(); }; @@ -2530,7 +2474,8 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, ? 
Builder.CreateSExtOrTrunc(Induction, IV->getType()) : Builder.CreateCast(Instruction::SIToFP, Induction, IV->getType()); - ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); + ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID, + State.CFG.PrevBB); ScalarIV->setName("offset.idx"); } if (Trunc) { @@ -2548,20 +2493,19 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { Value *Broadcasted = getBroadcastInstrs(ScalarIV); for (unsigned Part = 0; Part < UF; ++Part) { - assert(!VF.isScalable() && "scalable vectors not yet supported."); + assert(!State.VF.isScalable() && "scalable vectors not yet supported."); Value *StartIdx; if (Step->getType()->isFloatingPointTy()) - StartIdx = getRuntimeVFAsFloat(Builder, Step->getType(), VF * Part); + StartIdx = + getRuntimeVFAsFloat(Builder, Step->getType(), State.VF * Part); else - StartIdx = getRuntimeVF(Builder, Step->getType(), VF * Part); + StartIdx = getRuntimeVF(Builder, Step->getType(), State.VF * Part); Value *EntryPart = getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode()); State.set(Def, EntryPart, Part); if (Trunc) addMetadata(EntryPart, Trunc); - recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef, - State, Part); } }; @@ -2572,7 +2516,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, // Now do the actual transformations, and start with creating the step value. Value *Step = CreateStepValue(ID.getStep()); - if (VF.isZero() || VF.isScalar()) { + if (State.VF.isZero() || State.VF.isScalar()) { Value *ScalarIV = CreateScalarIV(Step); CreateSplatIV(ScalarIV, Step); return; @@ -2583,8 +2527,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, // least one user in the loop that is not widened. 
auto NeedsScalarIV = needsScalarInduction(EntryVal); if (!NeedsScalarIV) { - createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, - State); + createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State); return; } @@ -2592,14 +2535,13 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, // create the phi node, we will splat the scalar induction variable in each // loop iteration. if (!shouldScalarizeInstruction(EntryVal)) { - createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, - State); + createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State); Value *ScalarIV = CreateScalarIV(Step); // Create scalar steps that can be used by instructions we will later // scalarize. Note that the addition of the scalar steps will not increase // the number of instructions in the loop in the common case prior to // InstCombine. We will be trading one vector extract for each scalar step. - buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); + buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State); return; } @@ -2609,7 +2551,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, Value *ScalarIV = CreateScalarIV(Step); if (!Cost->isScalarEpilogueAllowed()) CreateSplatIV(ScalarIV, Step); - buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); + buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State); } Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx, @@ -2663,10 +2605,11 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx, void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, const InductionDescriptor &ID, - VPValue *Def, VPValue *CastDef, + VPValue *Def, VPTransformState &State) { + IRBuilder<> &Builder = State.Builder; // We shouldn't have to build scalar steps if we aren't vectorizing. 
- assert(VF.isVector() && "VF should be greater than one"); + assert(State.VF.isVector() && "VF should be greater than one"); // Get the value type and ensure it and the step have the same integer type. Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); assert(ScalarIVTy == Step->getType() && @@ -2688,33 +2631,32 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, // iteration. If EntryVal is uniform, we only need to generate the first // lane. Otherwise, we generate all VF values. bool IsUniform = - Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF); - unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue(); + Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), State.VF); + unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue(); // Compute the scalar steps and save the results in State. Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), ScalarIVTy->getScalarSizeInBits()); Type *VecIVTy = nullptr; Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; - if (!IsUniform && VF.isScalable()) { - VecIVTy = VectorType::get(ScalarIVTy, VF); - UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF)); - SplatStep = Builder.CreateVectorSplat(VF, Step); - SplatIV = Builder.CreateVectorSplat(VF, ScalarIV); + if (!IsUniform && State.VF.isScalable()) { + VecIVTy = VectorType::get(ScalarIVTy, State.VF); + UnitStepVec = + Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); + SplatStep = Builder.CreateVectorSplat(State.VF, Step); + SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV); } - for (unsigned Part = 0; Part < UF; ++Part) { - Value *StartIdx0 = createStepForVF(Builder, IntStepTy, VF, Part); + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); - if (!IsUniform && VF.isScalable()) { - auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0); + if (!IsUniform && 
State.VF.isScalable()) { + auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); if (ScalarIVTy->isFloatingPointTy()) InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); State.set(Def, Add, Part); - recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, - Part); // It's useful to record the lane values too for the known minimum number // of elements so we do those below. This improves the code quality when // trying to extract the first element, for example. @@ -2728,14 +2670,12 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); // The step returned by `createStepForVF` is a runtime-evaluated value // when VF is scalable. Otherwise, it should be folded into a Constant. - assert((VF.isScalable() || isa<Constant>(StartIdx)) && + assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && "Expected StartIdx to be folded to a constant when VF is not " "scalable"); auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); State.set(Def, Add, VPIteration(Part, Lane)); - recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, - Part, Lane); } } } @@ -3023,21 +2963,19 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized // instruction could feed a poison value to the base address of the widen // load/store. 
- if (State.MayGeneratePoisonRecipes.count(RepRecipe) > 0) + if (State.MayGeneratePoisonRecipes.contains(RepRecipe)) Cloned->dropPoisonGeneratingFlags(); State.Builder.SetInsertPoint(Builder.GetInsertBlock(), Builder.GetInsertPoint()); // Replace the operands of the cloned instructions with their scalar // equivalents in the new loop. - for (unsigned op = 0, e = RepRecipe->getNumOperands(); op != e; ++op) { - auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); + for (auto &I : enumerate(RepRecipe->operands())) { auto InputInstance = Instance; - if (!Operand || !OrigLoop->contains(Operand) || - (Cost->isUniformAfterVectorization(Operand, State.VF))) + VPValue *Operand = I.value(); + if (State.Plan->isUniformAfterVectorization(Operand)) InputInstance.Lane = VPLane::getFirstLane(); - auto *NewOp = State.get(RepRecipe->getOperand(op), InputInstance); - Cloned->setOperand(op, NewOp); + Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); } addNewMetadata(Cloned, Instr); @@ -3339,7 +3277,7 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, Value *InnerLoopVectorizer::emitTransformedIndex( IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, - const InductionDescriptor &ID) const { + const InductionDescriptor &ID, BasicBlock *VectorHeader) const { SCEVExpander Exp(*SE, DL, "induction"); auto Step = ID.getStep(); @@ -3382,15 +3320,15 @@ Value *InnerLoopVectorizer::emitTransformedIndex( }; // Get a suitable insert point for SCEV expansion. For blocks in the vector - // loop, choose the end of the vector loop header (=LoopVectorBody), because + // loop, choose the end of the vector loop header (=VectorHeader), because // the DomTree is not kept up-to-date for additional blocks generated in the // vector loop. By using the header as insertion point, we guarantee that the // expanded instructions dominate all their uses. 
- auto GetInsertPoint = [this, &B]() { + auto GetInsertPoint = [this, &B, VectorHeader]() { BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); if (InsertBB != LoopVectorBody && - LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) - return LoopVectorBody->getTerminator(); + LI->getLoopFor(VectorHeader) == LI->getLoopFor(InsertBB)) + return VectorHeader->getTerminator(); return &*B.GetInsertPoint(); }; @@ -3538,7 +3476,8 @@ void InnerLoopVectorizer::createInductionResumeValues( CastInst::getCastOpcode(VectorTripCount, true, StepType, true); Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); - EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); + EndValue = + emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody); EndValue->setName("ind.end"); // Compute the end value for the additional bypass (if applicable). @@ -3549,7 +3488,7 @@ void InnerLoopVectorizer::createInductionResumeValues( CRD = B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); EndValueFromAdditionalBypass = - emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); + emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody); EndValueFromAdditionalBypass->setName("ind.end"); } } @@ -3623,7 +3562,7 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, if (MDNode *LID = OrigLoop->getLoopID()) L->setLoopID(LID); - LoopVectorizeHints Hints(L, true, *ORE); + LoopVectorizeHints Hints(L, true, *ORE, TTI); Hints.setAlreadyVectorized(); #ifdef EXPENSIVE_CHECKS @@ -3780,7 +3719,8 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, II.getStep()->getType()) : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); CMO->setName("cast.cmo"); - Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); + Value *Escape = + emitTransformedIndex(B, CMO, PSE.getSE(), DL, II, LoopVectorBody); Escape->setName("ind.escape"); MissingVals[UI] = 
Escape; } @@ -4573,7 +4513,8 @@ void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { } } -bool InnerLoopVectorizer::useOrderedReductions(RecurrenceDescriptor &RdxDesc) { +bool InnerLoopVectorizer::useOrderedReductions( + const RecurrenceDescriptor &RdxDesc) { return Cost->useOrderedReductions(RdxDesc); } @@ -4648,8 +4589,8 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, Value *Idx = Builder.CreateAdd( PartStart, ConstantInt::get(PtrInd->getType(), Lane)); Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); - Value *SclrGep = - emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); + Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), + DL, II, State.CFG.PrevBB); SclrGep->setName("next.gep"); State.set(PhiR, SclrGep, VPIteration(Part, Lane)); } @@ -5368,13 +5309,9 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { // Limit MaxScalableVF by the maximum safe dependence distance. Optional<unsigned> MaxVScale = TTI.getMaxVScale(); - if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) { - unsigned VScaleMax = TheFunction->getFnAttribute(Attribute::VScaleRange) - .getVScaleRangeArgs() - .second; - if (VScaleMax > 0) - MaxVScale = VScaleMax; - } + if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) + MaxVScale = + TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); MaxScalableVF = ElementCount::getScalable( MaxVScale ? 
(MaxSafeElements / MaxVScale.getValue()) : 0); if (!MaxScalableVF) @@ -5386,9 +5323,8 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { return MaxScalableVF; } -FixedScalableVFPair -LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, - ElementCount UserVF) { +FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( + unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) { MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); unsigned SmallestType, WidestType; std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); @@ -5475,12 +5411,14 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, FixedScalableVFPair Result(ElementCount::getFixed(1), ElementCount::getScalable(0)); - if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, - WidestType, MaxSafeFixedVF)) + if (auto MaxVF = + getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, + MaxSafeFixedVF, FoldTailByMasking)) Result.FixedVF = MaxVF; - if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, - WidestType, MaxSafeScalableVF)) + if (auto MaxVF = + getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, + MaxSafeScalableVF, FoldTailByMasking)) if (MaxVF.isScalable()) { Result.ScalableVF = MaxVF; LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF @@ -5513,7 +5451,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { switch (ScalarEpilogueStatus) { case CM_ScalarEpilogueAllowed: - return computeFeasibleMaxVF(TC, UserVF); + return computeFeasibleMaxVF(TC, UserVF, false); case CM_ScalarEpilogueNotAllowedUsePredicate: LLVM_FALLTHROUGH; case CM_ScalarEpilogueNotNeededUsePredicate: @@ -5551,7 +5489,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " "scalar epilogue instead.\n"); 
ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; - return computeFeasibleMaxVF(TC, UserVF); + return computeFeasibleMaxVF(TC, UserVF, false); } return FixedScalableVFPair::getNone(); } @@ -5568,7 +5506,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); } - FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF); + FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true); // Avoid tail folding if the trip count is known to be a multiple of any VF // we chose. // FIXME: The condition below pessimises the case for fixed-width vectors, @@ -5641,7 +5579,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, - const ElementCount &MaxSafeVF) { + const ElementCount &MaxSafeVF, bool FoldTailByMasking) { bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); TypeSize WidestRegister = TTI.getRegisterBitWidth( ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector @@ -5673,14 +5611,17 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( const auto TripCountEC = ElementCount::getFixed(ConstTripCount); if (ConstTripCount && ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && - isPowerOf2_32(ConstTripCount)) { - // We need to clamp the VF to be the ConstTripCount. There is no point in - // choosing a higher viable VF as done in the loop below. If - // MaxVectorElementCount is scalable, we only fall back on a fixed VF when - // the TC is less than or equal to the known number of lanes. 
- LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " - << ConstTripCount << "\n"); - return TripCountEC; + (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) { + // If loop trip count (TC) is known at compile time there is no point in + // choosing VF greater than TC (as done in the loop below). Select maximum + // power of two which doesn't exceed TC. + // If MaxVectorElementCount is scalable, we only fall back on a fixed VF + // when the TC is less than or equal to the known number of lanes. + auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount); + LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " + "exceeding the constant trip count: " + << ClampedConstTripCount << "\n"); + return ElementCount::getFixed(ClampedConstTripCount); } ElementCount MaxVF = MaxVectorElementCount; @@ -5758,12 +5699,11 @@ bool LoopVectorizationCostModel::isMoreProfitable( EstimatedWidthB *= VScale.getValue(); } - // When set to preferred, for now assume vscale may be larger than 1 (or the - // one being tuned for), so that scalable vectorization is slightly favorable - // over fixed-width vectorization. - if (Hints->isScalableVectorizationPreferred()) - if (A.Width.isScalable() && !B.Width.isScalable()) - return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); + // Assume vscale may be larger than 1 (or the value being tuned for), + // so that scalable vectorization is slightly favorable over fixed-width + // vectorization. 
+ if (A.Width.isScalable() && !B.Width.isScalable()) + return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); // To avoid the need for FP division: // (CostA / A.Width) < (CostB / B.Width) @@ -6068,7 +6008,8 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() { if (auto *PN = dyn_cast<PHINode>(&I)) { if (!Legal->isReductionVariable(PN)) continue; - const RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[PN]; + const RecurrenceDescriptor &RdxDesc = + Legal->getReductionVars().find(PN)->second; if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || TTI.preferInLoopReduction(RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), @@ -7002,7 +6943,7 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; const RecurrenceDescriptor &RdxDesc = - Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; + Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second; InstructionCost BaseCost = TTI.getArithmeticReductionCost( RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); @@ -7079,22 +7020,41 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { if (match(Op0, m_ZExtOrSExt(m_Value())) && Op0->getOpcode() == Op1->getOpcode() && - Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { bool IsUnsigned = isa<ZExtInst>(Op0); - auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); - // Matched reduce(mul(ext, ext)) - InstructionCost ExtCost = - TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, - TTI::CastContextHint::None, CostKind, Op0); + Type *Op0Ty = Op0->getOperand(0)->getType(); + Type *Op1Ty = Op1->getOperand(0)->getType(); + Type *LargestOpTy = + Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? 
Op1Ty + : Op0Ty; + auto *ExtType = VectorType::get(LargestOpTy, VectorTy); + + // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of + // different sizes. We take the largest type as the ext to reduce, and add + // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). + InstructionCost ExtCost0 = TTI.getCastInstrCost( + Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), + TTI::CastContextHint::None, CostKind, Op0); + InstructionCost ExtCost1 = TTI.getCastInstrCost( + Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), + TTI::CastContextHint::None, CostKind, Op1); InstructionCost MulCost = TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); InstructionCost RedCost = TTI.getExtendedAddReductionCost( /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); + InstructionCost ExtraExtCost = 0; + if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { + Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; + ExtraExtCost = TTI.getCastInstrCost( + ExtraExtOp->getOpcode(), ExtType, + VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), + TTI::CastContextHint::None, CostKind, ExtraExtOp); + } - if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) + if (RedCost.isValid() && + (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) return I == RetI ? 
RedCost : 0; } else if (!match(I, m_ZExtOrSExt(m_Value()))) { // Matched reduce(mul()) @@ -7570,8 +7530,12 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, Type *CondTy = SI->getCondition()->getType(); if (!ScalarCond) CondTy = VectorType::get(CondTy, VF); - return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, - CmpInst::BAD_ICMP_PREDICATE, CostKind, I); + + CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; + if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) + Pred = Cmp->getPredicate(); + return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, + CostKind, I); } case Instruction::ICmp: case Instruction::FCmp: { @@ -7581,7 +7545,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); VectorTy = ToVectorTy(ValTy, VF); return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, - CmpInst::BAD_ICMP_PREDICATE, CostKind, I); + cast<CmpInst>(I)->getPredicate(), CostKind, + I); } case Instruction::Store: case Instruction::Load: { @@ -7762,14 +7727,14 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { // Ignore type-promoting instructions we identified during reduction // detection. for (auto &Reduction : Legal->getReductionVars()) { - RecurrenceDescriptor &RedDes = Reduction.second; + const RecurrenceDescriptor &RedDes = Reduction.second; const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); VecValuesToIgnore.insert(Casts.begin(), Casts.end()); } // Ignore type-casting instructions we identified during induction // detection. 
for (auto &Induction : Legal->getInductionVars()) { - InductionDescriptor &IndDes = Induction.second; + const InductionDescriptor &IndDes = Induction.second; const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); VecValuesToIgnore.insert(Casts.begin(), Casts.end()); } @@ -7778,7 +7743,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { void LoopVectorizationCostModel::collectInLoopReductions() { for (auto &Reduction : Legal->getReductionVars()) { PHINode *Phi = Reduction.first; - RecurrenceDescriptor &RdxDesc = Reduction.second; + const RecurrenceDescriptor &RdxDesc = Reduction.second; // We don't collect reductions that are type promoted (yet). if (RdxDesc.getRecurrenceType() != Phi->getType()) @@ -8064,18 +8029,6 @@ void LoopVectorizationPlanner::collectTriviallyDeadInstructions( return U == Ind || DeadInstructions.count(cast<Instruction>(U)); })) DeadInstructions.insert(IndUpdate); - - // We record as "Dead" also the type-casting instructions we had identified - // during induction analysis. We don't need any handling for them in the - // vectorized loop because we have proven that, under a proper runtime - // test guarding the vectorized loop, the value of the phi, and the casted - // value of the phi, are the same. The last instruction in this casting chain - // will get its scalar/vector/widened def from the scalar/vector/widened def - // of the respective phi node. Any other casts in the induction def-use chain - // have no other uses outside the phi update chain, and will be ignored. 
- InductionDescriptor &IndDes = Induction.second; - const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); - DeadInstructions.insert(Casts.begin(), Casts.end()); } } @@ -8461,7 +8414,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, assert(EdgeMask && "No Edge Mask found for condition"); if (BI->getSuccessor(0) != Dst) - EdgeMask = Builder.createNot(EdgeMask); + EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. // The condition is 'SrcMask && EdgeMask', which is equivalent to @@ -8470,7 +8423,8 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, // EdgeMask is poison. Using 'and' here introduces undefined behavior. VPValue *False = Plan->getOrAddVPValue( ConstantInt::getFalse(BI->getCondition()->getType())); - EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False); + EdgeMask = + Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc()); } return EdgeMaskCache[Edge] = EdgeMask; @@ -8492,22 +8446,24 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { if (!CM.blockNeedsPredicationForAnyReason(BB)) return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. - // Create the block in mask as the first non-phi instruction in the block. - VPBuilder::InsertPointGuard Guard(Builder); - auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); - Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); - // Introduce the early-exit compare IV <= BTC to form header block mask. // This is used instead of IV < TC because TC may wrap, unlike BTC. - // Start by constructing the desired canonical IV. + // Start by constructing the desired canonical IV in the header block. 
VPValue *IV = nullptr; if (Legal->getPrimaryInduction()) IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); else { + VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock(); auto *IVRecipe = new VPWidenCanonicalIVRecipe(); - Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); + HeaderVPBB->insert(IVRecipe, HeaderVPBB->getFirstNonPhi()); IV = IVRecipe; } + + // Create the block in mask as the first non-phi instruction in the block. + VPBuilder::InsertPointGuard Guard(Builder); + auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); + Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); + VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); bool TailFolded = !CM.isScalarEpilogueAllowed(); @@ -8534,7 +8490,7 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { continue; } - BlockMask = Builder.createOr(BlockMask, EdgeMask); + BlockMask = Builder.createOr(BlockMask, EdgeMask, {}); } return BlockMaskCache[BB] = BlockMask; @@ -8591,14 +8547,10 @@ VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, ArrayRef<VPValue *> Operands) const { // Check if this is an integer or fp induction. If so, build the recipe that // produces its scalar and vector values. - InductionDescriptor II = Legal->getInductionVars().lookup(Phi); - if (II.getKind() == InductionDescriptor::IK_IntInduction || - II.getKind() == InductionDescriptor::IK_FpInduction) { - assert(II.getStartValue() == + if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) { + assert(II->getStartValue() == Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); - const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts(); - return new VPWidenIntOrFpInductionRecipe( - Phi, Operands[0], Casts.empty() ? 
nullptr : Casts.front()); + return new VPWidenIntOrFpInductionRecipe(Phi, Operands[0], *II); } return nullptr; @@ -8624,11 +8576,10 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( if (LoopVectorizationPlanner::getDecisionAndClampRange( isOptimizableIVTruncate(I), Range)) { - InductionDescriptor II = - Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); + auto *Phi = cast<PHINode>(I->getOperand(0)); + const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); - return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), - Start, nullptr, I); + return new VPWidenIntOrFpInductionRecipe(Phi, Start, II, I); } return nullptr; } @@ -8844,13 +8795,17 @@ VPBasicBlock *VPRecipeBuilder::handleReplication( return VPBB; } LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); - assert(VPBB->getSuccessors().empty() && - "VPBB has successors when handling predicated replication."); + + VPBlockBase *SingleSucc = VPBB->getSingleSuccessor(); + assert(SingleSucc && "VPBB must have a single successor when handling " + "predicated replication."); + VPBlockUtils::disconnectBlocks(VPBB, SingleSucc); // Record predicated instructions for above packing optimizations. 
VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); VPBlockUtils::insertBlockAfter(Region, VPBB); auto *RegSucc = new VPBasicBlock(); VPBlockUtils::insertBlockAfter(RegSucc, Region); + VPBlockUtils::connectBlocks(RegSucc, SingleSucc); return RegSucc; } @@ -8910,7 +8865,8 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) { VPValue *StartV = Operands[0]; if (Legal->isReductionVariable(Phi)) { - RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; + const RecurrenceDescriptor &RdxDesc = + Legal->getReductionVars().find(Phi)->second; assert(RdxDesc.getRecurrenceStartValue() == Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, @@ -9031,7 +8987,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( } for (auto &Reduction : CM.getInLoopReductionChains()) { PHINode *Phi = Reduction.first; - RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind(); + RecurKind Kind = + Legal->getReductionVars().find(Phi)->second.getRecurrenceKind(); const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; RecipeBuilder.recordRecipeOf(Phi); @@ -9069,30 +9026,25 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( // visit each basic block after having visited its predecessor basic blocks. // --------------------------------------------------------------------------- - auto Plan = std::make_unique<VPlan>(); + // Create initial VPlan skeleton, with separate header and latch blocks. 
+ VPBasicBlock *HeaderVPBB = new VPBasicBlock(); + VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); + VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); + auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); + auto Plan = std::make_unique<VPlan>(TopRegion); // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. LoopBlocksDFS DFS(OrigLoop); DFS.perform(LI); - VPBasicBlock *VPBB = nullptr; - VPBasicBlock *HeaderVPBB = nullptr; + VPBasicBlock *VPBB = HeaderVPBB; SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove; for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { // Relevant instructions from basic block BB will be grouped into VPRecipe // ingredients and fill a new VPBasicBlock. unsigned VPBBsForBB = 0; - auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); - if (VPBB) - VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); - else { - auto *TopRegion = new VPRegionBlock("vector loop"); - TopRegion->setEntry(FirstVPBBForBB); - Plan->setEntry(TopRegion); - HeaderVPBB = FirstVPBBForBB; - } - VPBB = FirstVPBBForBB; + VPBB->setName(BB->getName()); Builder.setInsertPoint(VPBB); // Introduce each ingredient into VPlan. @@ -9159,13 +9111,21 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( : ""); } } + + VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); + VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); } + // Fold the last, empty block into its predecessor. + VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB); + assert(VPBB && "expected to fold last (empty) block"); + // After here, VPBB should not be used. 
+ VPBB = nullptr; + assert(isa<VPRegionBlock>(Plan->getEntry()) && !Plan->getEntry()->getEntryBasicBlock()->empty() && "entry block must be set to a VPRegionBlock having a non-empty entry " "VPBasicBlock"); - cast<VPRegionBlock>(Plan->getEntry())->setExit(VPBB); RecipeBuilder.fixHeaderPhis(); // --------------------------------------------------------------------------- @@ -9231,18 +9191,19 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); VPBlockUtils::connectBlocks(SplitPred, SinkRegion); VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); - if (VPBB == SplitPred) - VPBB = SplitBlock; } } + VPlanTransforms::removeRedundantInductionCasts(*Plan); + // Now that sink-after is done, move induction recipes for optimized truncates // to the phi section of the header block. for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); // Adjust the recipes for any inloop reductions. - adjustRecipesForReductions(VPBB, Plan, RecipeBuilder, Range.Start); + adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExit()), Plan, + RecipeBuilder, Range.Start); // Introduce a recipe to combine the incoming and previous values of a // first-order recurrence. @@ -9322,6 +9283,11 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( RSO.flush(); Plan->setName(PlanName); + // Fold Exit block into its predecessor if possible. + // TODO: Fold block earlier once all VPlan transforms properly maintain a + // VPBasicBlock as exit. 
+ VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExit()); + assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); return Plan; } @@ -9355,9 +9321,10 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { } SmallPtrSet<Instruction *, 1> DeadInstructions; - VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan, - Legal->getInductionVars(), - DeadInstructions, *PSE.getSE()); + VPlanTransforms::VPInstructionsToVPRecipes( + OrigLoop, Plan, + [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, + DeadInstructions, *PSE.getSE()); return Plan; } @@ -9371,7 +9338,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( ElementCount MinVF) { for (auto &Reduction : CM.getInLoopReductionChains()) { PHINode *Phi = Reduction.first; - RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; + const RecurrenceDescriptor &RdxDesc = + Legal->getReductionVars().find(Phi)->second; const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc)) @@ -9565,7 +9533,7 @@ void VPWidenRecipe::execute(VPTransformState &State) { // exact, etc.). The control flow has been linearized and the // instruction is no longer guarded by the predicate, which could make // the flag properties to no longer hold. 
- if (State.MayGeneratePoisonRecipes.count(this) > 0) + if (State.MayGeneratePoisonRecipes.contains(this)) VecOp->dropPoisonGeneratingFlags(); } @@ -9714,9 +9682,9 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) { void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Int or FP induction being replicated."); - State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), - getTruncInst(), getVPValue(0), - getCastValue(), State); + State.ILV->widenIntOrFpInduction(IV, getInductionDescriptor(), + getStartValue()->getLiveInIRValue(), + getTruncInst(), getVPValue(0), State); } void VPWidenPHIRecipe::execute(VPTransformState &State) { @@ -10293,7 +10261,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { << L->getHeader()->getParent()->getName() << "\" from " << DebugLocStr << "\n"); - LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); + LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); LLVM_DEBUG( dbgs() << "LV: Loop hints:" @@ -10747,8 +10715,17 @@ PreservedAnalyses LoopVectorizePass::run(Function &F, PA.preserve<LoopAnalysis>(); PA.preserve<DominatorTreeAnalysis>(); } - if (!Result.MadeCFGChange) + + if (Result.MadeCFGChange) { + // Making CFG changes likely means a loop got vectorized. Indicate that + // extra simplification passes should be run. + // TODO: MadeCFGChanges is not a prefect proxy. Extra passes should only + // be run if runtime checks have been added. 
+ AM.getResult<ShouldRunExtraVectorPasses>(F); + PA.preserve<ShouldRunExtraVectorPasses>(); + } else { PA.preserveSet<CFGAnalyses>(); + } return PA; } diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 95061e9053fa..37ae13666f7a 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -631,27 +631,26 @@ static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) { /// after: 6 3 5 4 7 2 1 0 static void fixupOrderingIndices(SmallVectorImpl<unsigned> &Order) { const unsigned Sz = Order.size(); - SmallBitVector UsedIndices(Sz); - SmallVector<int> MaskedIndices; + SmallBitVector UnusedIndices(Sz, /*t=*/true); + SmallBitVector MaskedIndices(Sz); for (unsigned I = 0; I < Sz; ++I) { if (Order[I] < Sz) - UsedIndices.set(Order[I]); + UnusedIndices.reset(Order[I]); else - MaskedIndices.push_back(I); + MaskedIndices.set(I); } - if (MaskedIndices.empty()) + if (MaskedIndices.none()) return; - SmallVector<int> AvailableIndices(MaskedIndices.size()); - unsigned Cnt = 0; - int Idx = UsedIndices.find_first(); - do { - AvailableIndices[Cnt] = Idx; - Idx = UsedIndices.find_next(Idx); - ++Cnt; - } while (Idx > 0); - assert(Cnt == MaskedIndices.size() && "Non-synced masked/available indices."); - for (int I = 0, E = MaskedIndices.size(); I < E; ++I) - Order[MaskedIndices[I]] = AvailableIndices[I]; + assert(UnusedIndices.count() == MaskedIndices.count() && + "Non-synced masked/available indices."); + int Idx = UnusedIndices.find_first(); + int MIdx = MaskedIndices.find_first(); + while (MIdx >= 0) { + assert(Idx >= 0 && "Indices must be synced."); + Order[MIdx] = Idx; + Idx = UnusedIndices.find_next(Idx); + MIdx = MaskedIndices.find_next(MIdx); + } } namespace llvm { @@ -812,6 +811,13 @@ public: /// ExtractElement, ExtractValue), which can be part of the graph. 
Optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE); + /// Gets reordering data for the given tree entry. If the entry is vectorized + /// - just return ReorderIndices, otherwise check if the scalars can be + /// reordered and return the most optimal order. + /// \param TopToBottom If true, include the order of vectorized stores and + /// insertelement nodes, otherwise skip them. + Optional<OrdersType> getReorderingData(const TreeEntry &TE, bool TopToBottom); + /// Reorders the current graph to the most profitable order starting from the /// root node to the leaf nodes. The best order is chosen only from the nodes /// of the same size (vectorization factor). Smaller nodes are considered @@ -1010,18 +1016,25 @@ public: std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]); } - // The hard-coded scores listed here are not very important. When computing - // the scores of matching one sub-tree with another, we are basically - // counting the number of values that are matching. So even if all scores - // are set to 1, we would still get a decent matching result. + // The hard-coded scores listed here are not very important, though it shall + // be higher for better matches to improve the resulting cost. When + // computing the scores of matching one sub-tree with another, we are + // basically counting the number of values that are matching. So even if all + // scores are set to 1, we would still get a decent matching result. // However, sometimes we have to break ties. For example we may have to // choose between matching loads vs matching opcodes. This is what these - // scores are helping us with: they provide the order of preference. + // scores are helping us with: they provide the order of preference. Also, + // this is important if the scalar is externally used or used in another + // tree entry node in the different lane. /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]). 
- static const int ScoreConsecutiveLoads = 3; + static const int ScoreConsecutiveLoads = 4; + /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]). + static const int ScoreReversedLoads = 3; /// ExtractElementInst from same vector and consecutive indexes. - static const int ScoreConsecutiveExtracts = 3; + static const int ScoreConsecutiveExtracts = 4; + /// ExtractElementInst from same vector and reversed indices. + static const int ScoreReversedExtracts = 3; /// Constants. static const int ScoreConstants = 2; /// Instructions with the same opcode. @@ -1041,7 +1054,10 @@ public: /// \returns the score of placing \p V1 and \p V2 in consecutive lanes. static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL, - ScalarEvolution &SE) { + ScalarEvolution &SE, int NumLanes) { + if (V1 == V2) + return VLOperands::ScoreSplat; + auto *LI1 = dyn_cast<LoadInst>(V1); auto *LI2 = dyn_cast<LoadInst>(V2); if (LI1 && LI2) { @@ -1051,8 +1067,17 @@ public: Optional<int> Dist = getPointersDiff( LI1->getType(), LI1->getPointerOperand(), LI2->getType(), LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true); - return (Dist && *Dist == 1) ? VLOperands::ScoreConsecutiveLoads - : VLOperands::ScoreFail; + if (!Dist) + return VLOperands::ScoreFail; + // The distance is too large - still may be profitable to use masked + // loads/gathers. + if (std::abs(*Dist) > NumLanes / 2) + return VLOperands::ScoreAltOpcodes; + // This still will detect consecutive loads, but we might have "holes" + // in some cases. It is ok for non-power-2 vectorization and may produce + // better results. It should not affect current vectorization. + return (*Dist > 0) ? VLOperands::ScoreConsecutiveLoads + : VLOperands::ScoreReversedLoads; } auto *C1 = dyn_cast<Constant>(V1); @@ -1062,18 +1087,41 @@ public: // Extracts from consecutive indexes of the same vector better score as // the extracts could be optimized away. 
- Value *EV; - ConstantInt *Ex1Idx, *Ex2Idx; - if (match(V1, m_ExtractElt(m_Value(EV), m_ConstantInt(Ex1Idx))) && - match(V2, m_ExtractElt(m_Deferred(EV), m_ConstantInt(Ex2Idx))) && - Ex1Idx->getZExtValue() + 1 == Ex2Idx->getZExtValue()) - return VLOperands::ScoreConsecutiveExtracts; + Value *EV1; + ConstantInt *Ex1Idx; + if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) { + // Undefs are always profitable for extractelements. + if (isa<UndefValue>(V2)) + return VLOperands::ScoreConsecutiveExtracts; + Value *EV2 = nullptr; + ConstantInt *Ex2Idx = nullptr; + if (match(V2, + m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx), + m_Undef())))) { + // Undefs are always profitable for extractelements. + if (!Ex2Idx) + return VLOperands::ScoreConsecutiveExtracts; + if (isUndefVector(EV2) && EV2->getType() == EV1->getType()) + return VLOperands::ScoreConsecutiveExtracts; + if (EV2 == EV1) { + int Idx1 = Ex1Idx->getZExtValue(); + int Idx2 = Ex2Idx->getZExtValue(); + int Dist = Idx2 - Idx1; + // The distance is too large - still may be profitable to use + // shuffles. + if (std::abs(Dist) > NumLanes / 2) + return VLOperands::ScoreAltOpcodes; + return (Dist > 0) ? VLOperands::ScoreConsecutiveExtracts + : VLOperands::ScoreReversedExtracts; + } + } + } auto *I1 = dyn_cast<Instruction>(V1); auto *I2 = dyn_cast<Instruction>(V2); if (I1 && I2) { - if (I1 == I2) - return VLOperands::ScoreSplat; + if (I1->getParent() != I2->getParent()) + return VLOperands::ScoreFail; InstructionsState S = getSameOpcode({I1, I2}); // Note: Only consider instructions with <= 2 operands to avoid // complexity explosion. @@ -1088,11 +1136,13 @@ public: return VLOperands::ScoreFail; } - /// Holds the values and their lane that are taking part in the look-ahead + /// Holds the values and their lanes that are taking part in the look-ahead /// score calculation. This is used in the external uses cost calculation. 
- SmallDenseMap<Value *, int> InLookAheadValues; + /// Need to hold all the lanes in case of splat/broadcast at least to + /// correctly check for the use in the different lane. + SmallDenseMap<Value *, SmallSet<int, 4>> InLookAheadValues; - /// \Returns the additinal cost due to uses of \p LHS and \p RHS that are + /// \returns the additional cost due to uses of \p LHS and \p RHS that are /// either external to the vectorized code, or require shuffling. int getExternalUsesCost(const std::pair<Value *, int> &LHS, const std::pair<Value *, int> &RHS) { @@ -1116,22 +1166,30 @@ public: for (User *U : V->users()) { if (const TreeEntry *UserTE = R.getTreeEntry(U)) { // The user is in the VectorizableTree. Check if we need to insert. - auto It = llvm::find(UserTE->Scalars, U); - assert(It != UserTE->Scalars.end() && "U is in UserTE"); - int UserLn = std::distance(UserTE->Scalars.begin(), It); + int UserLn = UserTE->findLaneForValue(U); assert(UserLn >= 0 && "Bad lane"); - if (UserLn != Ln) + // If the values are different, check just the line of the current + // value. If the values are the same, need to add UserInDiffLaneCost + // only if UserLn does not match both line numbers. + if ((LHS.first != RHS.first && UserLn != Ln) || + (LHS.first == RHS.first && UserLn != LHS.second && + UserLn != RHS.second)) { Cost += UserInDiffLaneCost; + break; + } } else { // Check if the user is in the look-ahead code. auto It2 = InLookAheadValues.find(U); if (It2 != InLookAheadValues.end()) { // The user is in the look-ahead code. Check the lane. - if (It2->second != Ln) + if (!It2->getSecond().contains(Ln)) { Cost += UserInDiffLaneCost; + break; + } } else { // The user is neither in SLP tree nor in the look-ahead code. Cost += ExternalUseCost; + break; } } // Limit the number of visited uses to cap compilation time. @@ -1170,32 +1228,36 @@ public: Value *V1 = LHS.first; Value *V2 = RHS.first; // Get the shallow score of V1 and V2. 
- int ShallowScoreAtThisLevel = - std::max((int)ScoreFail, getShallowScore(V1, V2, DL, SE) - - getExternalUsesCost(LHS, RHS)); + int ShallowScoreAtThisLevel = std::max( + (int)ScoreFail, getShallowScore(V1, V2, DL, SE, getNumLanes()) - + getExternalUsesCost(LHS, RHS)); int Lane1 = LHS.second; int Lane2 = RHS.second; // If reached MaxLevel, // or if V1 and V2 are not instructions, // or if they are SPLAT, - // or if they are not consecutive, early return the current cost. + // or if they are not consecutive, + // or if profitable to vectorize loads or extractelements, early return + // the current cost. auto *I1 = dyn_cast<Instruction>(V1); auto *I2 = dyn_cast<Instruction>(V2); if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 || ShallowScoreAtThisLevel == VLOperands::ScoreFail || - (isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel)) + (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) || + (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) && + ShallowScoreAtThisLevel)) return ShallowScoreAtThisLevel; assert(I1 && I2 && "Should have early exited."); // Keep track of in-tree values for determining the external-use cost. - InLookAheadValues[V1] = Lane1; - InLookAheadValues[V2] = Lane2; + InLookAheadValues[V1].insert(Lane1); + InLookAheadValues[V2].insert(Lane2); // Contains the I2 operand indexes that got matched with I1 operands. SmallSet<unsigned, 4> Op2Used; - // Recursion towards the operands of I1 and I2. We are trying all possbile + // Recursion towards the operands of I1 and I2. We are trying all possible // operand pairs, and keeping track of the best score. for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands(); OpIdx1 != NumOperands1; ++OpIdx1) { @@ -1319,27 +1381,79 @@ public: return None; } - /// Helper for reorderOperandVecs. \Returns the lane that we should start - /// reordering from. This is the one which has the least number of operands - /// that can freely move about. + /// Helper for reorderOperandVecs. 
+ /// \returns the lane that we should start reordering from. This is the one + /// which has the least number of operands that can freely move about or + /// less profitable because it already has the most optimal set of operands. unsigned getBestLaneToStartReordering() const { - unsigned BestLane = 0; unsigned Min = UINT_MAX; - for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes; - ++Lane) { - unsigned NumFreeOps = getMaxNumOperandsThatCanBeReordered(Lane); - if (NumFreeOps < Min) { - Min = NumFreeOps; - BestLane = Lane; + unsigned SameOpNumber = 0; + // std::pair<unsigned, unsigned> is used to implement a simple voting + // algorithm and choose the lane with the least number of operands that + // can freely move about or less profitable because it already has the + // most optimal set of operands. The first unsigned is a counter for + // voting, the second unsigned is the counter of lanes with instructions + // with same/alternate opcodes and same parent basic block. + MapVector<unsigned, std::pair<unsigned, unsigned>> HashMap; + // Try to be closer to the original results, if we have multiple lanes + // with same cost. If 2 lanes have the same cost, use the one with the + // lowest index. + for (int I = getNumLanes(); I > 0; --I) { + unsigned Lane = I - 1; + OperandsOrderData NumFreeOpsHash = + getMaxNumOperandsThatCanBeReordered(Lane); + // Compare the number of operands that can move and choose the one with + // the least number. + if (NumFreeOpsHash.NumOfAPOs < Min) { + Min = NumFreeOpsHash.NumOfAPOs; + SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent; + HashMap.clear(); + HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane); + } else if (NumFreeOpsHash.NumOfAPOs == Min && + NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) { + // Select the most optimal lane in terms of number of operands that + // should be moved around. 
+ SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent; + HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane); + } else if (NumFreeOpsHash.NumOfAPOs == Min && + NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) { + ++HashMap[NumFreeOpsHash.Hash].first; + } + } + // Select the lane with the minimum counter. + unsigned BestLane = 0; + unsigned CntMin = UINT_MAX; + for (const auto &Data : reverse(HashMap)) { + if (Data.second.first < CntMin) { + CntMin = Data.second.first; + BestLane = Data.second.second; } } return BestLane; } - /// \Returns the maximum number of operands that are allowed to be reordered - /// for \p Lane. This is used as a heuristic for selecting the first lane to - /// start operand reordering. - unsigned getMaxNumOperandsThatCanBeReordered(unsigned Lane) const { + /// Data structure that helps to reorder operands. + struct OperandsOrderData { + /// The best number of operands with the same APOs, which can be + /// reordered. + unsigned NumOfAPOs = UINT_MAX; + /// Number of operands with the same/alternate instruction opcode and + /// parent. + unsigned NumOpsWithSameOpcodeParent = 0; + /// Hash for the actual operands ordering. + /// Used to count operands, actually their position id and opcode + /// value. It is used in the voting mechanism to find the lane with the + /// least number of operands that can freely move about or less profitable + /// because it already has the most optimal set of operands. Can be + /// replaced with SmallVector<unsigned> instead but hash code is faster + /// and requires less memory. + unsigned Hash = 0; + }; + /// \returns the maximum number of operands that are allowed to be reordered + /// for \p Lane and the number of compatible instructions(with the same + /// parent/opcode). This is used as a heuristic for selecting the first lane + /// to start operand reordering. 
+ OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const { unsigned CntTrue = 0; unsigned NumOperands = getNumOperands(); // Operands with the same APO can be reordered. We therefore need to count @@ -1348,11 +1462,45 @@ public: // a map. Instead we can simply count the number of operands that // correspond to one of them (in this case the 'true' APO), and calculate // the other by subtracting it from the total number of operands. - for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) - if (getData(OpIdx, Lane).APO) + // Operands with the same instruction opcode and parent are more + // profitable since we don't need to move them in many cases, with a high + // probability such lane already can be vectorized effectively. + bool AllUndefs = true; + unsigned NumOpsWithSameOpcodeParent = 0; + Instruction *OpcodeI = nullptr; + BasicBlock *Parent = nullptr; + unsigned Hash = 0; + for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { + const OperandData &OpData = getData(OpIdx, Lane); + if (OpData.APO) ++CntTrue; - unsigned CntFalse = NumOperands - CntTrue; - return std::max(CntTrue, CntFalse); + // Use Boyer-Moore majority voting for finding the majority opcode and + // the number of times it occurs. 
+ if (auto *I = dyn_cast<Instruction>(OpData.V)) { + if (!OpcodeI || !getSameOpcode({OpcodeI, I}).getOpcode() || + I->getParent() != Parent) { + if (NumOpsWithSameOpcodeParent == 0) { + NumOpsWithSameOpcodeParent = 1; + OpcodeI = I; + Parent = I->getParent(); + } else { + --NumOpsWithSameOpcodeParent; + } + } else { + ++NumOpsWithSameOpcodeParent; + } + } + Hash = hash_combine( + Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1))); + AllUndefs = AllUndefs && isa<UndefValue>(OpData.V); + } + if (AllUndefs) + return {}; + OperandsOrderData Data; + Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue); + Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent; + Data.Hash = Hash; + return Data; } /// Go through the instructions in VL and append their operands. @@ -1500,11 +1648,37 @@ public: ReorderingModes[OpIdx] = ReorderingMode::Failed; } + // Check that we don't have same operands. No need to reorder if operands + // are just perfect diamond or shuffled diamond match. Do not do it only + // for possible broadcasts or non-power of 2 number of scalars (just for + // now). + auto &&SkipReordering = [this]() { + SmallPtrSet<Value *, 4> UniqueValues; + ArrayRef<OperandData> Op0 = OpsVec.front(); + for (const OperandData &Data : Op0) + UniqueValues.insert(Data.V); + for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) { + if (any_of(Op, [&UniqueValues](const OperandData &Data) { + return !UniqueValues.contains(Data.V); + })) + return false; + } + // TODO: Check if we can remove a check for non-power-2 number of + // scalars after full support of non-power-2 vectorization. + return UniqueValues.size() != 2 && isPowerOf2_32(UniqueValues.size()); + }; + // If the initial strategy fails for any of the operand indexes, then we // perform reordering again in a second pass. This helps avoid assigning // high priority to the failed strategy, and should improve reordering for // the non-failed operand indexes. 
for (int Pass = 0; Pass != 2; ++Pass) { + // Check if no need to reorder operands since they're are perfect or + // shuffled diamond match. + // Need to to do it to avoid extra external use cost counting for + // shuffled matches, which may cause regressions. + if (SkipReordering()) + break; // Skip the second pass if the first pass did not fail. bool StrategyFailed = false; // Mark all operand data as free to use. @@ -1792,9 +1966,10 @@ private: if (Operands.size() < OpIdx + 1) Operands.resize(OpIdx + 1); assert(Operands[OpIdx].empty() && "Already resized?"); - Operands[OpIdx].resize(Scalars.size()); - for (unsigned Lane = 0, E = Scalars.size(); Lane != E; ++Lane) - Operands[OpIdx][Lane] = OpVL[Lane]; + assert(OpVL.size() <= Scalars.size() && + "Number of operands is greater than the number of scalars."); + Operands[OpIdx].resize(OpVL.size()); + copy(OpVL, Operands[OpIdx].begin()); } /// Set the operands of this bundle in their original order. @@ -1944,7 +2119,7 @@ private: if (ReuseShuffleIndices.empty()) dbgs() << "Empty"; else - for (unsigned ReuseIdx : ReuseShuffleIndices) + for (int ReuseIdx : ReuseShuffleIndices) dbgs() << ReuseIdx << ", "; dbgs() << "\n"; dbgs() << "ReorderIndices: "; @@ -2819,6 +2994,50 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) { return None; } +Optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE, + bool TopToBottom) { + // No need to reorder if need to shuffle reuses, still need to shuffle the + // node. + if (!TE.ReuseShuffleIndices.empty()) + return None; + if (TE.State == TreeEntry::Vectorize && + (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) || + (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) && + !TE.isAltShuffle()) + return TE.ReorderIndices; + if (TE.State == TreeEntry::NeedToGather) { + // TODO: add analysis of other gather nodes with extractelement + // instructions and other values/instructions, not only undefs. 
+ if (((TE.getOpcode() == Instruction::ExtractElement && + !TE.isAltShuffle()) || + (all_of(TE.Scalars, + [](Value *V) { + return isa<UndefValue, ExtractElementInst>(V); + }) && + any_of(TE.Scalars, + [](Value *V) { return isa<ExtractElementInst>(V); }))) && + all_of(TE.Scalars, + [](Value *V) { + auto *EE = dyn_cast<ExtractElementInst>(V); + return !EE || isa<FixedVectorType>(EE->getVectorOperandType()); + }) && + allSameType(TE.Scalars)) { + // Check that gather of extractelements can be represented as + // just a shuffle of a single vector. + OrdersType CurrentOrder; + bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder); + if (Reuse || !CurrentOrder.empty()) { + if (!CurrentOrder.empty()) + fixupOrderingIndices(CurrentOrder); + return CurrentOrder; + } + } + if (Optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE)) + return CurrentOrder; + } + return None; +} + void BoUpSLP::reorderTopToBottom() { // Maps VF to the graph nodes. DenseMap<unsigned, SmallPtrSet<TreeEntry *, 4>> VFToOrderedEntries; @@ -2826,42 +3045,15 @@ void BoUpSLP::reorderTopToBottom() { // their ordering. DenseMap<const TreeEntry *, OrdersType> GathersToOrders; // Find all reorderable nodes with the given VF. - // Currently the are vectorized loads,extracts + some gathering of extracts. + // Currently the are vectorized stores,loads,extracts + some gathering of + // extracts. for_each(VectorizableTree, [this, &VFToOrderedEntries, &GathersToOrders]( const std::unique_ptr<TreeEntry> &TE) { - // No need to reorder if need to shuffle reuses, still need to shuffle the - // node. 
- if (!TE->ReuseShuffleIndices.empty()) - return; - if (TE->State == TreeEntry::Vectorize && - isa<LoadInst, ExtractElementInst, ExtractValueInst, StoreInst, - InsertElementInst>(TE->getMainOp()) && - !TE->isAltShuffle()) { + if (Optional<OrdersType> CurrentOrder = + getReorderingData(*TE.get(), /*TopToBottom=*/true)) { VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); - return; - } - if (TE->State == TreeEntry::NeedToGather) { - if (TE->getOpcode() == Instruction::ExtractElement && - !TE->isAltShuffle() && - isa<FixedVectorType>(cast<ExtractElementInst>(TE->getMainOp()) - ->getVectorOperandType()) && - allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) { - // Check that gather of extractelements can be represented as - // just a shuffle of a single vector. - OrdersType CurrentOrder; - bool Reuse = - canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder); - if (Reuse || !CurrentOrder.empty()) { - VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); - GathersToOrders.try_emplace(TE.get(), CurrentOrder); - return; - } - } - if (Optional<OrdersType> CurrentOrder = - findReusedOrderedScalars(*TE.get())) { - VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); + if (TE->State != TreeEntry::Vectorize) GathersToOrders.try_emplace(TE.get(), *CurrentOrder); - } } }); @@ -2993,44 +3185,11 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { const std::unique_ptr<TreeEntry> &TE) { if (TE->State != TreeEntry::Vectorize) NonVectorized.push_back(TE.get()); - // No need to reorder if need to shuffle reuses, still need to shuffle the - // node. 
- if (!TE->ReuseShuffleIndices.empty()) - return; - if (TE->State == TreeEntry::Vectorize && - isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE->getMainOp()) && - !TE->isAltShuffle()) { + if (Optional<OrdersType> CurrentOrder = + getReorderingData(*TE.get(), /*TopToBottom=*/false)) { OrderedEntries.insert(TE.get()); - return; - } - if (TE->State == TreeEntry::NeedToGather) { - if (TE->getOpcode() == Instruction::ExtractElement && - !TE->isAltShuffle() && - isa<FixedVectorType>(cast<ExtractElementInst>(TE->getMainOp()) - ->getVectorOperandType()) && - allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) { - // Check that gather of extractelements can be represented as - // just a shuffle of a single vector with a single user only. - OrdersType CurrentOrder; - bool Reuse = - canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder); - if ((Reuse || !CurrentOrder.empty()) && - !any_of(VectorizableTree, - [&TE](const std::unique_ptr<TreeEntry> &Entry) { - return Entry->State == TreeEntry::NeedToGather && - Entry.get() != TE.get() && - Entry->isSame(TE->Scalars); - })) { - OrderedEntries.insert(TE.get()); - GathersToOrders.try_emplace(TE.get(), CurrentOrder); - return; - } - } - if (Optional<OrdersType> CurrentOrder = - findReusedOrderedScalars(*TE.get())) { - OrderedEntries.insert(TE.get()); + if (TE->State != TreeEntry::Vectorize) GathersToOrders.try_emplace(TE.get(), *CurrentOrder); - } } }); @@ -3392,9 +3551,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Check that every instruction appears once in this bundle. DenseMap<Value *, unsigned> UniquePositions; for (Value *V : VL) { + if (isConstant(V)) { + ReuseShuffleIndicies.emplace_back( + isa<UndefValue>(V) ? UndefMaskElem : UniqueValues.size()); + UniqueValues.emplace_back(V); + continue; + } auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); - ReuseShuffleIndicies.emplace_back(isa<UndefValue>(V) ? 
-1 - : Res.first->second); + ReuseShuffleIndicies.emplace_back(Res.first->second); if (Res.second) UniqueValues.emplace_back(V); } @@ -3404,6 +3568,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } else { LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); if (NumUniqueScalarValues <= 1 || + (UniquePositions.size() == 1 && all_of(UniqueValues, + [](Value *V) { + return isa<UndefValue>(V) || + !isConstant(V); + })) || !llvm::isPowerOf2_32(NumUniqueScalarValues)) { LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); @@ -3508,11 +3677,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } } - // If any of the scalars is marked as a value that needs to stay scalar, then - // we need to gather the scalars. // The reduction nodes (stored in UserIgnoreList) also should stay scalar. for (Value *V : VL) { - if (MustGather.count(V) || is_contained(UserIgnoreList, V)) { + if (is_contained(UserIgnoreList, V)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); if (TryToFindDuplicates(S)) newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, @@ -4219,10 +4386,17 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, SmallVectorImpl<unsigned> &CurrentOrder) const { - Instruction *E0 = cast<Instruction>(OpValue); - assert(E0->getOpcode() == Instruction::ExtractElement || - E0->getOpcode() == Instruction::ExtractValue); - assert(E0->getOpcode() == getSameOpcode(VL).getOpcode() && "Invalid opcode"); + const auto *It = find_if(VL, [](Value *V) { + return isa<ExtractElementInst, ExtractValueInst>(V); + }); + assert(It != VL.end() && "Expected at least one extract instruction."); + auto *E0 = cast<Instruction>(*It); + assert(all_of(VL, + [](Value *V) { + return isa<UndefValue, ExtractElementInst, ExtractValueInst>( + V); + }) && + "Invalid opcode"); // Check if 
all of the extracts come from the same vector and from the // correct offset. Value *Vec = E0->getOperand(0); @@ -4255,23 +4429,28 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, // Also, later we can check that all the indices are used and we have a // consecutive access in the extract instructions, by checking that no // element of CurrentOrder still has value E + 1. - CurrentOrder.assign(E, E + 1); + CurrentOrder.assign(E, E); unsigned I = 0; for (; I < E; ++I) { - auto *Inst = cast<Instruction>(VL[I]); + auto *Inst = dyn_cast<Instruction>(VL[I]); + if (!Inst) + continue; if (Inst->getOperand(0) != Vec) break; + if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) + if (isa<UndefValue>(EE->getIndexOperand())) + continue; Optional<unsigned> Idx = getExtractIndex(Inst); if (!Idx) break; const unsigned ExtIdx = *Idx; if (ExtIdx != I) { - if (ExtIdx >= E || CurrentOrder[ExtIdx] != E + 1) + if (ExtIdx >= E || CurrentOrder[ExtIdx] != E) break; ShouldKeepOrder = false; CurrentOrder[ExtIdx] = I; } else { - if (CurrentOrder[I] != E + 1) + if (CurrentOrder[I] != E) break; CurrentOrder[I] = I; } @@ -4287,8 +4466,8 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, bool BoUpSLP::areAllUsersVectorized(Instruction *I, ArrayRef<Value *> VectorizedVals) const { return (I->hasOneUse() && is_contained(VectorizedVals, I)) || - llvm::all_of(I->users(), [this](User *U) { - return ScalarToTreeEntry.count(U) > 0; + all_of(I->users(), [this](User *U) { + return ScalarToTreeEntry.count(U) > 0 || MustGather.contains(U); }); } @@ -4348,6 +4527,10 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy, for (auto *V : VL) { ++Idx; + // Need to exclude undefs from analysis. + if (isa<UndefValue>(V) || Mask[Idx] == UndefMaskElem) + continue; + // Reached the start of a new vector registers. 
if (Idx % EltsPerVector == 0) { AllConsecutive = true; @@ -4357,9 +4540,11 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy, // Check all extracts for a vector register on the target directly // extract values in order. unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V)); - unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1])); - AllConsecutive &= PrevIdx + 1 == CurrentIdx && - CurrentIdx % EltsPerVector == Idx % EltsPerVector; + if (!isa<UndefValue>(VL[Idx - 1]) && Mask[Idx - 1] != UndefMaskElem) { + unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1])); + AllConsecutive &= PrevIdx + 1 == CurrentIdx && + CurrentIdx % EltsPerVector == Idx % EltsPerVector; + } if (AllConsecutive) continue; @@ -4442,9 +4627,9 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, // FIXME: it tries to fix a problem with MSVC buildbots. TargetTransformInfo &TTIRef = *TTI; auto &&AdjustExtractsCost = [this, &TTIRef, CostKind, VL, VecTy, - VectorizedVals](InstructionCost &Cost, - bool IsGather) { + VectorizedVals, E](InstructionCost &Cost) { DenseMap<Value *, int> ExtractVectorsTys; + SmallPtrSet<Value *, 4> CheckedExtracts; for (auto *V : VL) { if (isa<UndefValue>(V)) continue; @@ -4452,7 +4637,12 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, // instruction itself is not going to be vectorized, consider this // instruction as dead and remove its cost from the final cost of the // vectorized tree. - if (!areAllUsersVectorized(cast<Instruction>(V), VectorizedVals)) + // Also, avoid adjusting the cost for extractelements with multiple uses + // in different graph entries. 
+ const TreeEntry *VE = getTreeEntry(V); + if (!CheckedExtracts.insert(V).second || + !areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) || + (VE && VE != E)) continue; auto *EE = cast<ExtractElementInst>(V); Optional<unsigned> EEIdx = getExtractIndex(EE); @@ -4549,11 +4739,6 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, } return GatherCost; } - if (isSplat(VL)) { - // Found the broadcasting of the single scalar, calculate the cost as the - // broadcast. - return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy); - } if ((E->getOpcode() == Instruction::ExtractElement || all_of(E->Scalars, [](Value *V) { @@ -4571,13 +4756,20 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, // single input vector or of 2 input vectors. InstructionCost Cost = computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI); - AdjustExtractsCost(Cost, /*IsGather=*/true); + AdjustExtractsCost(Cost); if (NeedToShuffleReuses) Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices); return Cost; } } + if (isSplat(VL)) { + // Found the broadcasting of the single scalar, calculate the cost as the + // broadcast. + assert(VecTy == FinalVecTy && + "No reused scalars expected for broadcast."); + return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy); + } InstructionCost ReuseShuffleCost = 0; if (NeedToShuffleReuses) ReuseShuffleCost = TTI->getShuffleCost( @@ -4755,7 +4947,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I); } } else { - AdjustExtractsCost(CommonCost, /*IsGather=*/false); + AdjustExtractsCost(CommonCost); } return CommonCost; } @@ -5211,15 +5403,15 @@ static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, FoundOr = true; } // Check if the input is an extended load of the required or/shift expression. 
- Value *LoadPtr; + Value *Load; if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root || - !match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr))))) + !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load)) return false; // Require that the total load bit width is a legal integer type. // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target. // But <16 x i8> --> i128 is not, so the backend probably can't reduce it. - Type *SrcTy = LoadPtr->getType()->getPointerElementType(); + Type *SrcTy = Load->getType(); unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts; if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth))) return false; @@ -9061,8 +9253,7 @@ private: "A call to the llvm.fmuladd intrinsic is not handled yet"); ++NumVectorInstructions; - return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind, - ReductionOps.back()); + return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind); } }; @@ -9473,6 +9664,59 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming, return Changed; } +/// Compare two cmp instructions. If IsCompatibility is true, function returns +/// true if 2 cmps have same/swapped predicates and mos compatible corresponding +/// operands. If IsCompatibility is false, function implements strict weak +/// ordering relation between two cmp instructions, returning true if the first +/// instruction is "less" than the second, i.e. its predicate is less than the +/// predicate of the second or the operands IDs are less than the operands IDs +/// of the second cmp instruction. 
+template <bool IsCompatibility> +static bool compareCmp(Value *V, Value *V2, + function_ref<bool(Instruction *)> IsDeleted) { + auto *CI1 = cast<CmpInst>(V); + auto *CI2 = cast<CmpInst>(V2); + if (IsDeleted(CI2) || !isValidElementType(CI2->getType())) + return false; + if (CI1->getOperand(0)->getType()->getTypeID() < + CI2->getOperand(0)->getType()->getTypeID()) + return !IsCompatibility; + if (CI1->getOperand(0)->getType()->getTypeID() > + CI2->getOperand(0)->getType()->getTypeID()) + return false; + CmpInst::Predicate Pred1 = CI1->getPredicate(); + CmpInst::Predicate Pred2 = CI2->getPredicate(); + CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1); + CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2); + CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1); + CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2); + if (BasePred1 < BasePred2) + return !IsCompatibility; + if (BasePred1 > BasePred2) + return false; + // Compare operands. + bool LEPreds = Pred1 <= Pred2; + bool GEPreds = Pred1 >= Pred2; + for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) { + auto *Op1 = CI1->getOperand(LEPreds ? I : E - I - 1); + auto *Op2 = CI2->getOperand(GEPreds ? I : E - I - 1); + if (Op1->getValueID() < Op2->getValueID()) + return !IsCompatibility; + if (Op1->getValueID() > Op2->getValueID()) + return false; + if (auto *I1 = dyn_cast<Instruction>(Op1)) + if (auto *I2 = dyn_cast<Instruction>(Op2)) { + if (I1->getParent() != I2->getParent()) + return false; + InstructionsState S = getSameOpcode({I1, I2}); + if (S.getOpcode()) + continue; + return false; + } + } + return IsCompatibility; +} + bool SLPVectorizerPass::vectorizeSimpleInstructions( SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R, bool AtTerminator) { @@ -9504,37 +9748,16 @@ bool SLPVectorizerPass::vectorizeSimpleInstructions( } // Try to vectorize list of compares. // Sort by type, compare predicate, etc. 
- // TODO: Add analysis on the operand opcodes (profitable to vectorize - // instructions with same/alternate opcodes/const values). auto &&CompareSorter = [&R](Value *V, Value *V2) { - auto *CI1 = cast<CmpInst>(V); - auto *CI2 = cast<CmpInst>(V2); - if (R.isDeleted(CI2) || !isValidElementType(CI2->getType())) - return false; - if (CI1->getOperand(0)->getType()->getTypeID() < - CI2->getOperand(0)->getType()->getTypeID()) - return true; - if (CI1->getOperand(0)->getType()->getTypeID() > - CI2->getOperand(0)->getType()->getTypeID()) - return false; - return CI1->getPredicate() < CI2->getPredicate() || - (CI1->getPredicate() > CI2->getPredicate() && - CI1->getPredicate() < - CmpInst::getSwappedPredicate(CI2->getPredicate())); + return compareCmp<false>(V, V2, + [&R](Instruction *I) { return R.isDeleted(I); }); }; auto &&AreCompatibleCompares = [&R](Value *V1, Value *V2) { if (V1 == V2) return true; - auto *CI1 = cast<CmpInst>(V1); - auto *CI2 = cast<CmpInst>(V2); - if (R.isDeleted(CI2) || !isValidElementType(CI2->getType())) - return false; - if (CI1->getOperand(0)->getType() != CI2->getOperand(0)->getType()) - return false; - return CI1->getPredicate() == CI2->getPredicate() || - CI1->getPredicate() == - CmpInst::getSwappedPredicate(CI2->getPredicate()); + return compareCmp<true>(V1, V2, + [&R](Instruction *I) { return R.isDeleted(I); }); }; auto Limit = [&R](Value *V) { unsigned EltSize = R.getVectorElementSize(V); @@ -9592,10 +9815,15 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { return true; if (Opcodes1.size() > Opcodes2.size()) return false; + Optional<bool> ConstOrder; for (int I = 0, E = Opcodes1.size(); I < E; ++I) { // Undefs are compatible with any other value. 
- if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I])) + if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I])) { + if (!ConstOrder) + ConstOrder = + !isa<UndefValue>(Opcodes1[I]) && isa<UndefValue>(Opcodes2[I]); continue; + } if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I])) if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) { DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent()); @@ -9614,14 +9842,17 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { continue; return I1->getOpcode() < I2->getOpcode(); } - if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I])) + if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I])) { + if (!ConstOrder) + ConstOrder = Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID(); continue; + } if (Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID()) return true; if (Opcodes1[I]->getValueID() > Opcodes2[I]->getValueID()) return false; } - return false; + return ConstOrder && *ConstOrder; }; auto AreCompatiblePHIs = [&PHIToOpcodes](Value *V1, Value *V2) { if (V1 == V2) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 44b5e1df0839..1d9e71663cd2 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -374,8 +374,7 @@ VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) { assert((SplitAt == end() || SplitAt->getParent() == this) && "can only split at a position in the same block"); - SmallVector<VPBlockBase *, 2> Succs(getSuccessors().begin(), - getSuccessors().end()); + SmallVector<VPBlockBase *, 2> Succs(successors()); // First, disconnect the current block from its successors. 
for (VPBlockBase *Succ : Succs) VPBlockUtils::disconnectBlocks(this, Succ); @@ -642,6 +641,7 @@ void VPRecipeBase::moveBefore(VPBasicBlock &BB, void VPInstruction::generateInstruction(VPTransformState &State, unsigned Part) { IRBuilder<> &Builder = State.Builder; + Builder.SetCurrentDebugLocation(DL); if (Instruction::isBinaryOp(getOpcode())) { Value *A = State.get(getOperand(0), Part); @@ -768,6 +768,11 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, O << " "; Operand->printAsOperand(O, SlotTracker); } + + if (DL) { + O << ", !dbg "; + DL.print(O); + } } #endif diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 810dd5030f95..f4a1883e35d5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -39,6 +39,7 @@ #include "llvm/ADT/ilist.h" #include "llvm/ADT/ilist_node.h" #include "llvm/Analysis/VectorUtils.h" +#include "llvm/IR/DebugLoc.h" #include "llvm/IR/IRBuilder.h" #include "llvm/Support/InstructionCost.h" #include <algorithm> @@ -51,6 +52,7 @@ namespace llvm { class BasicBlock; class DominatorTree; +class InductionDescriptor; class InnerLoopVectorizer; class LoopInfo; class raw_ostream; @@ -500,6 +502,8 @@ public: const VPBlocksTy &getSuccessors() const { return Successors; } VPBlocksTy &getSuccessors() { return Successors; } + iterator_range<VPBlockBase **> successors() { return Successors; } + const VPBlocksTy &getPredecessors() const { return Predecessors; } VPBlocksTy &getPredecessors() { return Predecessors; } @@ -795,6 +799,7 @@ private: typedef unsigned char OpcodeTy; OpcodeTy Opcode; FastMathFlags FMF; + DebugLoc DL; /// Utility method serving execute(): generates a single instance of the /// modeled instruction. 
@@ -804,12 +809,14 @@ protected: void setUnderlyingInstr(Instruction *I) { setUnderlyingValue(I); } public: - VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands) + VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL) : VPRecipeBase(VPRecipeBase::VPInstructionSC, Operands), - VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode) {} + VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode), + DL(DL) {} - VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands) - : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands)) {} + VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands, + DebugLoc DL = {}) + : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL) {} /// Method to support type inquiry through isa, cast, and dyn_cast. static inline bool classof(const VPValue *V) { @@ -818,7 +825,7 @@ public: VPInstruction *clone() const { SmallVector<VPValue *, 2> Operands(operands()); - return new VPInstruction(Opcode, Operands); + return new VPInstruction(Opcode, Operands, DL); } /// Method to support type inquiry through isa, cast, and dyn_cast. @@ -1003,21 +1010,22 @@ public: /// A recipe for handling phi nodes of integer and floating-point inductions, /// producing their vector and scalar values. 
-class VPWidenIntOrFpInductionRecipe : public VPRecipeBase { +class VPWidenIntOrFpInductionRecipe : public VPRecipeBase, public VPValue { PHINode *IV; + const InductionDescriptor &IndDesc; public: - VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, Instruction *Cast, - TruncInst *Trunc = nullptr) - : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), IV(IV) { - if (Trunc) - new VPValue(Trunc, this); - else - new VPValue(IV, this); + VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, + const InductionDescriptor &IndDesc) + : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), VPValue(IV, this), + IV(IV), IndDesc(IndDesc) {} + + VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, + const InductionDescriptor &IndDesc, + TruncInst *Trunc) + : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), VPValue(Trunc, this), + IV(IV), IndDesc(IndDesc) {} - if (Cast) - new VPValue(Cast, this); - } ~VPWidenIntOrFpInductionRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. @@ -1038,13 +1046,6 @@ public: /// Returns the start value of the induction. VPValue *getStartValue() { return getOperand(0); } - /// Returns the cast VPValue, if one is attached, or nullptr otherwise. - VPValue *getCastValue() { - if (getNumDefinedValues() != 2) - return nullptr; - return getVPValue(1); - } - /// Returns the first defined value as TruncInst, if it is one or nullptr /// otherwise. TruncInst *getTruncInst() { @@ -1053,6 +1054,9 @@ public: const TruncInst *getTruncInst() const { return dyn_cast_or_null<TruncInst>(getVPValue(0)->getUnderlyingValue()); } + + /// Returns the induction descriptor for the recipe. + const InductionDescriptor &getInductionDescriptor() const { return IndDesc; } }; /// A recipe for handling first order recurrences and pointer inductions. For @@ -1169,7 +1173,7 @@ struct VPFirstOrderRecurrencePHIRecipe : public VPWidenPHIRecipe { /// operand. 
class VPReductionPHIRecipe : public VPWidenPHIRecipe { /// Descriptor for the reduction. - RecurrenceDescriptor &RdxDesc; + const RecurrenceDescriptor &RdxDesc; /// The phi is part of an in-loop reduction. bool IsInLoop; @@ -1180,7 +1184,7 @@ class VPReductionPHIRecipe : public VPWidenPHIRecipe { public: /// Create a new VPReductionPHIRecipe for the reduction \p Phi described by \p /// RdxDesc. - VPReductionPHIRecipe(PHINode *Phi, RecurrenceDescriptor &RdxDesc, + VPReductionPHIRecipe(PHINode *Phi, const RecurrenceDescriptor &RdxDesc, VPValue &Start, bool IsInLoop = false, bool IsOrdered = false) : VPWidenPHIRecipe(VPVReductionPHISC, VPReductionPHISC, Phi, &Start), @@ -1210,7 +1214,9 @@ public: VPSlotTracker &SlotTracker) const override; #endif - RecurrenceDescriptor &getRecurrenceDescriptor() { return RdxDesc; } + const RecurrenceDescriptor &getRecurrenceDescriptor() const { + return RdxDesc; + } /// Returns true, if the phi is part of an ordered reduction. bool isOrdered() const { return IsOrdered; } @@ -1340,13 +1346,13 @@ public: /// The Operands are {ChainOp, VecOp, [Condition]}. class VPReductionRecipe : public VPRecipeBase, public VPValue { /// The recurrence decriptor for the reduction in question. - RecurrenceDescriptor *RdxDesc; + const RecurrenceDescriptor *RdxDesc; /// Pointer to the TTI, needed to create the target reduction const TargetTransformInfo *TTI; public: - VPReductionRecipe(RecurrenceDescriptor *R, Instruction *I, VPValue *ChainOp, - VPValue *VecOp, VPValue *CondOp, + VPReductionRecipe(const RecurrenceDescriptor *R, Instruction *I, + VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp, const TargetTransformInfo *TTI) : VPRecipeBase(VPRecipeBase::VPReductionSC, {ChainOp, VecOp}), VPValue(VPValue::VPVReductionSC, I, this), RdxDesc(R), TTI(TTI) { @@ -2252,6 +2258,12 @@ public: return map_range(Operands, Fn); } + /// Returns true if \p VPV is uniform after vectorization. 
+ bool isUniformAfterVectorization(VPValue *VPV) const { + auto RepR = dyn_cast_or_null<VPReplicateRecipe>(VPV->getDef()); + return !VPV->getDef() || (RepR && RepR->isUniform()); + } + private: /// Add to the given dominator tree the header block and every new basic block /// that was created between it and the latch block, inclusive. @@ -2340,18 +2352,23 @@ public: /// Insert disconnected VPBlockBase \p NewBlock after \p BlockPtr. Add \p /// NewBlock as successor of \p BlockPtr and \p BlockPtr as predecessor of \p - /// NewBlock, and propagate \p BlockPtr parent to \p NewBlock. If \p BlockPtr - /// has more than one successor, its conditional bit is propagated to \p - /// NewBlock. \p NewBlock must have neither successors nor predecessors. + /// NewBlock, and propagate \p BlockPtr parent to \p NewBlock. \p BlockPtr's + /// successors are moved from \p BlockPtr to \p NewBlock and \p BlockPtr's + /// conditional bit is propagated to \p NewBlock. \p NewBlock must have + /// neither successors nor predecessors. static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr) { assert(NewBlock->getSuccessors().empty() && - "Can't insert new block with successors."); - // TODO: move successors from BlockPtr to NewBlock when this functionality - // is necessary. For now, setBlockSingleSuccessor will assert if BlockPtr - // already has successors. 
- BlockPtr->setOneSuccessor(NewBlock); - NewBlock->setPredecessors({BlockPtr}); + NewBlock->getPredecessors().empty() && + "Can't insert new block with predecessors or successors."); NewBlock->setParent(BlockPtr->getParent()); + SmallVector<VPBlockBase *> Succs(BlockPtr->successors()); + for (VPBlockBase *Succ : Succs) { + disconnectBlocks(BlockPtr, Succ); + connectBlocks(NewBlock, Succ); + } + NewBlock->setCondBit(BlockPtr->getCondBit()); + BlockPtr->setCondBit(nullptr); + connectBlocks(BlockPtr, NewBlock); } /// Insert disconnected VPBlockBases \p IfTrue and \p IfFalse after \p @@ -2394,6 +2411,31 @@ public: To->removePredecessor(From); } + /// Try to merge \p Block into its single predecessor, if \p Block is a + /// VPBasicBlock and its predecessor has a single successor. Returns a pointer + /// to the predecessor \p Block was merged into or nullptr otherwise. + static VPBasicBlock *tryToMergeBlockIntoPredecessor(VPBlockBase *Block) { + auto *VPBB = dyn_cast<VPBasicBlock>(Block); + auto *PredVPBB = + dyn_cast_or_null<VPBasicBlock>(Block->getSinglePredecessor()); + if (!VPBB || !PredVPBB || PredVPBB->getNumSuccessors() != 1) + return nullptr; + + for (VPRecipeBase &R : make_early_inc_range(*VPBB)) + R.moveBefore(*PredVPBB, PredVPBB->end()); + VPBlockUtils::disconnectBlocks(PredVPBB, VPBB); + auto *ParentRegion = cast<VPRegionBlock>(Block->getParent()); + if (ParentRegion->getExit() == Block) + ParentRegion->setExit(PredVPBB); + SmallVector<VPBlockBase *> Successors(Block->successors()); + for (auto *Succ : Successors) { + VPBlockUtils::disconnectBlocks(Block, Succ); + VPBlockUtils::connectBlocks(PredVPBB, Succ); + } + delete Block; + return PredVPBB; + } + /// Returns true if the edge \p FromBlock -> \p ToBlock is a back-edge. 
static bool isBackEdge(const VPBlockBase *FromBlock, const VPBlockBase *ToBlock, const VPLoopInfo *VPLI) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp index ac3b3505dc34..86ecd6817873 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp @@ -50,14 +50,14 @@ VPValue *VPlanPredicator::getOrCreateNotPredicate(VPBasicBlock *PredBB, case EdgeType::FALSE_EDGE: // CurrBB is the False successor of PredBB - compute not of CBV. - IntermediateVal = Builder.createNot(CBV); + IntermediateVal = Builder.createNot(CBV, {}); break; } // Now AND intermediate value with PredBB's block predicate if it has one. VPValue *BP = PredBB->getPredicate(); if (BP) - return Builder.createAnd(BP, IntermediateVal); + return Builder.createAnd(BP, IntermediateVal, {}); else return IntermediateVal; } @@ -96,7 +96,7 @@ VPValue *VPlanPredicator::genPredicateTree(std::list<VPValue *> &Worklist) { Worklist.pop_front(); // Create an OR of these values. - VPValue *Or = Builder.createOr(LHS, RHS); + VPValue *Or = Builder.createOr(LHS, RHS, {}); // Push OR to the back of the worklist. 
Worklist.push_back(Or); diff --git a/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp b/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp index c52c8a2229e8..9e19e172dea5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp @@ -467,8 +467,9 @@ VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) { return markFailed(); assert(CombinedOperands.size() > 0 && "Need more some operands"); - auto *VPI = new VPInstruction(Opcode, CombinedOperands); - VPI->setUnderlyingInstr(cast<VPInstruction>(Values[0])->getUnderlyingInstr()); + auto *Inst = cast<VPInstruction>(Values[0])->getUnderlyingInstr(); + auto *VPI = new VPInstruction(Opcode, CombinedOperands, Inst->getDebugLoc()); + VPI->setUnderlyingInstr(Inst); LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " " << *cast<VPInstruction>(Values[0]) << "\n"); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index ded5bc04beb5..d2daf558c2c5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -18,7 +18,8 @@ using namespace llvm; void VPlanTransforms::VPInstructionsToVPRecipes( Loop *OrigLoop, VPlanPtr &Plan, - LoopVectorizationLegality::InductionList &Inductions, + function_ref<const InductionDescriptor *(PHINode *)> + GetIntOrFpInductionDescriptor, SmallPtrSetImpl<Instruction *> &DeadInstructions, ScalarEvolution &SE) { auto *TopRegion = cast<VPRegionBlock>(Plan->getEntry()); @@ -44,11 +45,9 @@ void VPlanTransforms::VPInstructionsToVPRecipes( VPRecipeBase *NewRecipe = nullptr; if (auto *VPPhi = dyn_cast<VPWidenPHIRecipe>(&Ingredient)) { auto *Phi = cast<PHINode>(VPPhi->getUnderlyingValue()); - InductionDescriptor II = Inductions.lookup(Phi); - if (II.getKind() == InductionDescriptor::IK_IntInduction || - II.getKind() == InductionDescriptor::IK_FpInduction) { - VPValue *Start = Plan->getOrAddVPValue(II.getStartValue()); - NewRecipe = new 
VPWidenIntOrFpInductionRecipe(Phi, Start, nullptr); + if (const auto *II = GetIntOrFpInductionDescriptor(Phi)) { + VPValue *Start = Plan->getOrAddVPValue(II->getStartValue()); + NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, *II); } else { Plan->addVPValue(Phi, VPPhi); continue; @@ -158,8 +157,7 @@ bool VPlanTransforms::sinkScalarOperands(VPlan &Plan) { // TODO: add ".cloned" suffix to name of Clone's VPValue. Clone->insertBefore(SinkCandidate); - SmallVector<VPUser *, 4> Users(SinkCandidate->user_begin(), - SinkCandidate->user_end()); + SmallVector<VPUser *, 4> Users(SinkCandidate->users()); for (auto *U : Users) { auto *UI = cast<VPRecipeBase>(U); if (UI->getParent() == SinkTo) @@ -266,8 +264,7 @@ bool VPlanTransforms::mergeReplicateRegions(VPlan &Plan) { VPValue *PredInst1 = cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0); VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue(); - SmallVector<VPUser *> Users(Phi1ToMoveV->user_begin(), - Phi1ToMoveV->user_end()); + SmallVector<VPUser *> Users(Phi1ToMoveV->users()); for (VPUser *U : Users) { auto *UI = dyn_cast<VPRecipeBase>(U); if (!UI || UI->getParent() != Then2) @@ -295,3 +292,35 @@ bool VPlanTransforms::mergeReplicateRegions(VPlan &Plan) { delete ToDelete; return Changed; } + +void VPlanTransforms::removeRedundantInductionCasts(VPlan &Plan) { + SmallVector<std::pair<VPRecipeBase *, VPValue *>> CastsToRemove; + for (auto &Phi : Plan.getEntry()->getEntryBasicBlock()->phis()) { + auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi); + if (!IV || IV->getTruncInst()) + continue; + + // Visit all casts connected to IV and in Casts. Collect them. + // remember them for removal. 
+ auto &Casts = IV->getInductionDescriptor().getCastInsts(); + VPValue *FindMyCast = IV; + for (Instruction *IRCast : reverse(Casts)) { + VPRecipeBase *FoundUserCast = nullptr; + for (auto *U : FindMyCast->users()) { + auto *UserCast = cast<VPRecipeBase>(U); + if (UserCast->getNumDefinedValues() == 1 && + UserCast->getVPSingleValue()->getUnderlyingValue() == IRCast) { + FoundUserCast = UserCast; + break; + } + } + assert(FoundUserCast && "Missing a cast to remove"); + CastsToRemove.emplace_back(FoundUserCast, IV); + FindMyCast = FoundUserCast->getVPSingleValue(); + } + } + for (auto &E : CastsToRemove) { + E.first->getVPSingleValue()->replaceAllUsesWith(E.second); + E.first->eraseFromParent(); + } +} diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index c740f2c022da..a82a562d5e35 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -14,24 +14,37 @@ #define LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H #include "VPlan.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" namespace llvm { +class InductionDescriptor; class Instruction; +class PHINode; class ScalarEvolution; struct VPlanTransforms { /// Replaces the VPInstructions in \p Plan with corresponding /// widen recipes. - static void VPInstructionsToVPRecipes( - Loop *OrigLoop, VPlanPtr &Plan, - LoopVectorizationLegality::InductionList &Inductions, - SmallPtrSetImpl<Instruction *> &DeadInstructions, ScalarEvolution &SE); + static void + VPInstructionsToVPRecipes(Loop *OrigLoop, VPlanPtr &Plan, + function_ref<const InductionDescriptor *(PHINode *)> + GetIntOrFpInductionDescriptor, + SmallPtrSetImpl<Instruction *> &DeadInstructions, + ScalarEvolution &SE); static bool sinkScalarOperands(VPlan &Plan); static bool mergeReplicateRegions(VPlan &Plan); + + /// Remove redundant casts of inductions. 
+ /// + /// Such redundant casts are casts of induction variables that can be ignored, + /// because we already proved that the casted phi is equal to the uncasted phi + /// in the vectorized loop. There is no need to vectorize the cast - the same + /// value can be used for both the phi and casts in the vector loop. + static void removeRedundantInductionCasts(VPlan &Plan); }; } // namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 6d6ea4eb30f1..7732d9367985 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -156,5 +156,31 @@ bool VPlanVerifier::verifyPlanIsValid(const VPlan &Plan) { RecipeI++; } } + + const VPRegionBlock *TopRegion = cast<VPRegionBlock>(Plan.getEntry()); + const VPBasicBlock *Entry = dyn_cast<VPBasicBlock>(TopRegion->getEntry()); + if (!Entry) { + errs() << "VPlan entry block is not a VPBasicBlock\n"; + return false; + } + const VPBasicBlock *Exit = dyn_cast<VPBasicBlock>(TopRegion->getExit()); + if (!Exit) { + errs() << "VPlan exit block is not a VPBasicBlock\n"; + return false; + } + + for (const VPRegionBlock *Region : + VPBlockUtils::blocksOnly<const VPRegionBlock>( + depth_first(VPBlockRecursiveTraversalWrapper<const VPBlockBase *>( + Plan.getEntry())))) { + if (Region->getEntry()->getNumPredecessors() != 0) { + errs() << "region entry block has predecessors\n"; + return false; + } + if (Region->getExit()->getNumSuccessors() != 0) { + errs() << "region exit block has successors\n"; + return false; + } + } return true; } diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 57b11e9414ba..c0aedab2fed0 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -989,9 +989,9 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { if (!FixedVT) return false; - InstructionCost 
OriginalCost = TTI.getMemoryOpCost( - Instruction::Load, LI->getType(), Align(LI->getAlignment()), - LI->getPointerAddressSpace()); + InstructionCost OriginalCost = + TTI.getMemoryOpCost(Instruction::Load, LI->getType(), LI->getAlign(), + LI->getPointerAddressSpace()); InstructionCost ScalarizedCost = 0; Instruction *LastCheckedInst = LI; |
